Octopus
debug.F90
Go to the documentation of this file.
1!! Copyright (C) 2016 X. Andrade
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17!!
18
19#include "global.h"
20
21module debug_oct_m
22 use global_oct_m
24 use mpi_oct_m
26 use loct_oct_m
27 use parser_oct_m
28
29 implicit none
30
31 private
32 public :: &
33 debug_t, &
34 debug_init, &
39 debug, &
44
45 type debug_t
46 private
47 logical, public :: info
48 logical, public :: trace
49 logical, public :: trace_term
50 logical, public :: trace_file
51 logical :: extra_checks
52 logical, public :: interaction_graph
53 logical, public :: interaction_graph_full
54 logical, public :: propagation_graph
55 logical, public :: instrument
56 integer :: bits
57 character(len=MAX_PATH_LEN), public :: instr_sub_name
58 integer, public :: instr_tool
59 end type debug_t
60
61 type(debug_t), save :: debug
62
64 integer, parameter :: unit_offset = 1000
65
66 interface
67 subroutine debug_verrou_start_instrumentation() bind(C)
69
70 subroutine debug_verrou_stop_instrumentation() bind(C)
72 end interface
73
74contains
75
76 subroutine debug_init(this, namespace)
77 type(debug_t), intent(out) :: this
78 type(namespace_t), intent(in) :: namespace
79
80 character(len=256) :: node_hook
81 logical :: file_exists, mpi_debug_hook
82 integer :: sec, usec
83 type(block_t) :: blk
84 integer :: line
85
86 !%Variable Debug
87 !%Type flag
88 !%Default no
89 !%Section Execution::Debug
90 !%Description
91 !% This variable controls the amount of debugging information
92 !% generated by Octopus. You can use include more than one option
93 !% with the + operator.
94 !%Option no 0
95 !% (default) <tt>Octopus</tt> does not enter debug mode.
96 !%Option info 1
97 !% Octopus prints additional information to the terminal.
98 !%Option trace 2
99 !% Octopus generates a stack trace as it enters end exits
100 !% subroutines. This information is reported if Octopus stops with
101 !% an error.
102 !%Option trace_term 4
103 !% The trace is printed to the terminal as Octopus enters or exits subroutines. This slows down execution considerably.
104 !%Option trace_file 8
105 !% The trace is written to files in the <tt>debug</tt>
106 !% directory. For each node (when running in parallel) there is a file called
107 !% <tt>debug_trace.&lt;rank&gt;</tt>. Writing these files slows down the code by a huge factor and
108 !% it is usually only necessary for parallel runs.
109 !%Option extra_checks 16
110 !% This enables Octopus to perform some extra checks, to ensure
111 !% code correctness, that might be too costly for regular runs.
112 !%Option interaction_graph 32
113 !% Octopus generates a dot file containing the graph for a multisystem run.
114 !%Option interaction_graph_full 64
115 !% Octopus generates a dot file containing the graph for a multisystem run including ghost interactions.
116 !%Option propagation_graph 128
117 !% Octopus generates a file with information for the propagation diagram.
118 !%Option instrument 256
119 !% Octopus adds instrumentation to functions specified in an <tt>InstrumentFunctions</tt> block.
120 !%End
121 call parse_variable(namespace, 'Debug', option__debug__no, this%bits)
122
123 call from_bits(this)
124
125 !%Variable InstrumentFunctions
126 !%Type block
127 !%Section Execution::Debug
128 !%Description
129 !% This input options controls which routines are going to be instrumented
130 !% for the tools selected using the <tt>Debug</tt> option.
131 !%
132 !% <br>%<tt>InstrumentFunctions
133 !% <br>&nbsp;&nbsp;function_name | instrumentation_tool
134 !% <br>%</tt>
135 !%
136 !% Here is an example to better understand how this works:
137 !%
138 !% <br>%<tt>InstrumentFunctions
139 !% <br>&nbsp;&nbsp;grid/grid.F90/grid_init_from_grid_stage_1 | verrou
140 !% <br>%</tt>
141 !%
142 !% NOTE: Currently only a single function can be instrumented!
143 !%
144 !% Available instrumentation tools:
145 !%Option verrou 1
146 !% Verrou helps you look for floating-point round-off errors.
147 !%End
148 if (parse_block(namespace, "InstrumentFunctions", blk) == 0) then
149 ! TODO: Allow instrumentation of more than a single function
150 if (parse_block_n(blk) .gt. 1) then
151 write(stderr,'(a)') "Only single function can be instrumented!"
152 call mpi_world%abort()
153 call loct_exit_failure()
154 end if
155
156 do line = 0, parse_block_n(blk) - 1
157 call parse_block_string(blk, line, 0, this%instr_sub_name)
158 call parse_block_integer(blk, line, 1, this%instr_tool)
159 select case (this%instr_tool)
160 case (option__instrumentfunctions__verrou)
161 write(stderr,'(a)') "Instrumenting " // trim(this%instr_sub_name) // " for Verrou"
162#if !defined(HAVE_VERROU)
163 write(stderr,'(a)') "requires VERROU but that library was not linked."
164 call mpi_world%abort()
165 call loct_exit_failure()
166#endif
167 case default
168 assert(.false.) ! Should not happen
169 end select
170 end do
171 call parse_block_end(blk)
172 end if
173
174 call mpi_debug_init(mpi_world%rank, this%info)
175
176 if (this%info) then
177 !%Variable MPIDebugHook
178 !%Type logical
179 !%Default no
180 !%Section Execution::Debug
181 !%Description
182 !% When debugging the code in parallel it is usually difficult to find the origin
183 !% of race conditions that appear in MPI communications. This variable introduces
184 !% a facility to control separate MPI processes. If set to yes, all nodes will
185 !% start up, but will get trapped in an endless loop. In every cycle of the loop
186 !% each node is sleeping for one second and is then checking if a file with the
187 !% name <tt>node_hook.xxx</tt> (where <tt>xxx</tt> denotes the node number) exists. A given node can
188 !% only be released from the loop if the corresponding file is created. This allows
189 !% to selectively run, <i>e.g.</i>, a compute node first followed by the master node. Or, by
190 !% reversing the file creation of the node hooks, to run the master first followed
191 !% by a compute node.
192 !%End
193 call parse_variable(global_namespace, 'MPIDebugHook', .false., mpi_debug_hook)
194 if (mpi_debug_hook) then
195 call loct_gettimeofday(sec, usec)
196 call epoch_time_diff(sec,usec)
197 write(stdout,'(a,i6,a,i6.6,20x,a)') '* I ',sec,'.',usec,' | MPI debug hook'
198
199 write(stdout,'(a,i3,a)') 'node:', mpi_world%rank, ' In debug hook'
200 write(node_hook,'(i3.3)') mpi_world%rank
201 file_exists = .false.
202
203 do while (.not. file_exists)
204 inquire(file='node_hook.'//node_hook, exist=file_exists)
205 call loct_nanosleep(1,0)
206 write(stdout,'(a,i3,a)') 'node:', mpi_world%rank, &
207 ' - still sleeping. To release me touch: node_hook.'//trim(node_hook)
208 end do
209
210 write(stdout,'(a,i3,a)') 'node:', mpi_world%rank, ' Leaving debug hook'
211 ! remove possible debug hooks
212 call loct_rm('node_hook.'//trim(node_hook))
213
214 call loct_gettimeofday(sec, usec)
215 call epoch_time_diff(sec,usec)
216 write(stdout,'(a,i6,a,i6.6,20x,a)') '* O ', sec, '.', usec,' | MPI debug hook'
217 end if
218 end if
219
220 end subroutine debug_init
221
222 !--------------------------------------------------
223
224 subroutine debug_enable(this)
225 type(debug_t), intent(inout) :: this
226
227 this%info = .true.
228 this%trace = .true.
229 this%trace_term = .true.
230 this%trace_file = .true.
231 this%interaction_graph = .true.
232 this%interaction_graph_full = .true.
233 this%propagation_graph = .true.
234
235 end subroutine debug_enable
236
237 !--------------------------------------------------
238
239 subroutine debug_disable(this)
240 type(debug_t), intent(inout) :: this
241
242 call from_bits(this)
243
244 end subroutine debug_disable
245
246 !--------------------------------------------------
247
248 subroutine debug_delete_trace()
249
250 integer :: iunit
251 character(len=6) :: filenum
252
253 iunit = mpi_world%rank + unit_offset
254 write(filenum, '(i6.6)') iunit - unit_offset
255 call loct_mkdir('debug')
256 call loct_rm('debug/debug_trace.node.'//filenum)
257
258 end subroutine debug_delete_trace
259
260 ! ---------------------------------------------------------
261
262 subroutine debug_open_trace(iunit)
263 integer, intent(out) :: iunit
264
265 character(len=6) :: filenum
266
267 iunit = mpi_world%rank + unit_offset
268 write(filenum, '(i6.6)') iunit - unit_offset
269 call loct_mkdir('debug')
270 open(iunit, file = 'debug/debug_trace.node.'//filenum, &
271 action='write', status='unknown', position='append')
272
273 end subroutine debug_open_trace
274
275 ! ---------------------------------------------------------
276
277 subroutine from_bits(this)
278 type(debug_t), intent(inout) :: this
279
280 this%info = (bitand(this%bits, option__debug__info) /= 0)
281 this%trace_term = (bitand(this%bits, option__debug__trace_term) /= 0)
282 this%trace_file = (bitand(this%bits, option__debug__trace_file) /= 0)
283 this%trace = (bitand(this%bits, option__debug__trace) /= 0) .or. this%trace_term .or. this%trace_file
284 this%extra_checks = (bitand(this%bits, option__debug__extra_checks) /= 0) .or. this%trace_term .or. this%trace_file
285 this%interaction_graph = (bitand(this%bits, option__debug__interaction_graph) /= 0)
286 this%interaction_graph_full = (bitand(this%bits, option__debug__interaction_graph_full) /= 0)
287 this%propagation_graph = (bitand(this%bits, option__debug__propagation_graph) /= 0)
288 this%instrument = (bitand(this%bits, option__debug__instrument) /= 0)
289
290 end subroutine from_bits
291
292
293 ! ---------------------------------------------------------
294 subroutine epoch_time_diff(sec, usec)
295 integer, intent(inout) :: sec
296 integer, intent(inout) :: usec
297
298 ! this is called by push/pop so there cannot be a push/pop in this routine
299
300 call time_diff(s_epoch_sec, s_epoch_usec, sec, usec)
301 end subroutine epoch_time_diff
302
303
304 ! ---------------------------------------------------------
307 subroutine time_diff(sec1, usec1, sec2, usec2)
308 integer, intent(in) :: sec1
309 integer, intent(in) :: usec1
310 integer, intent(inout) :: sec2
311 integer, intent(inout) :: usec2
312
313 ! this is called by push/pop so there cannot be a push/pop in this routine
314
315 ! Correct overflow.
316 if (usec2 - usec1 < 0) then
317 usec2 = 1000000 + usec2
318 if (sec2 >= sec1) then
319 sec2 = sec2 - 1
320 end if
321 end if
322
323 ! Replace values.
324 if (sec2 >= sec1) then
325 sec2 = sec2 - sec1
326 end if
327 usec2 = usec2 - usec1
328
329 end subroutine time_diff
330
331
332#ifndef NDEBUG
333 ! ---------------------------------------------------------
335 subroutine debug_push_sub(sub_name)
336 character(len=*), intent(in) :: sub_name
337
338 integer, parameter :: max_recursion_level = 50
339 integer iunit, sec, usec
340
341 if (debug%instrument) then
342 if (debug_clean_path(sub_name) == trim(debug%instr_sub_name)) then
343 select case (debug%instr_tool)
344 case (option__instrumentfunctions__verrou)
346 case default
347 assert(.false.) ! cannot happen
348 end select
349 end if
350 end if
351
352 if (.not. debug%trace) return
353
354 call loct_gettimeofday(sec, usec)
355 call epoch_time_diff(sec, usec)
356
358 if (no_sub_stack >= max_recursion_level) then
359 sub_stack(max_recursion_level) = 'debug_push_sub'
360 write(stderr, '(a,i3,a)') 'Too many recursion levels in debug trace (max=', max_recursion_level, ')'
361 call mpi_world%abort()
362 stop
363 end if
364
365 sub_stack(no_sub_stack) = trim(debug_clean_path(sub_name))
367
368 if (debug%trace_file) then
369 call debug_open_trace(iunit)
370 call push_sub_write(iunit)
371 ! close file to ensure flushing
372 close(iunit)
373 end if
374
375 if (debug%trace_term .and. mpi_grp_is_root(mpi_world)) then
376 ! write to stderr if we are node 0
377 call push_sub_write(stderr)
378 end if
379
380 contains
381
382 subroutine push_sub_write(iunit_out)
383 integer, intent(in) :: iunit_out
384
385 integer :: ii
386 character(len=1000) :: tmpstr
388 write(tmpstr,'(a,i6,a,i6.6,f20.6,i8,a)') "* I ", &
389 sec, '.', usec, &
390 loct_clock(), &
391 loct_get_memory_usage() / 1024, " | "
392 do ii = no_sub_stack - 1, 1, -1
393 write(tmpstr, '(2a)') trim(tmpstr), "..|"
394 end do
395 write(tmpstr, '(2a)') trim(tmpstr), trim(debug_clean_path(sub_name))
396 write(iunit_out, '(a)') trim(tmpstr)
397
398 end subroutine push_sub_write
399
400 end subroutine debug_push_sub
401
402 ! ---------------------------------------------------------
404 subroutine debug_pop_sub(sub_name)
405 character(len=*), intent(in) :: sub_name
406
407 character(len=80) :: sub_name_short
408 integer iunit, sec, usec
409
410 if (debug%instrument) then
411 if (debug_clean_path(sub_name) == trim(debug%instr_sub_name)) then
412 select case (debug%instr_tool)
413 case (option__instrumentfunctions__verrou)
415 case default
416 assert(.false.) ! cannot happen
417 end select
418 end if
419 end if
420
421 if (.not. debug%trace) return
422
423 call loct_gettimeofday(sec, usec)
424 call epoch_time_diff(sec, usec)
425
426 if (no_sub_stack <= 0) then
427 no_sub_stack = 1
428 sub_stack(1) = 'pop_sub'
429 write(stderr, '(a)') 'Too few recursion levels in debug trace'
430 call mpi_world%abort()
431 stop
432 end if
433
434 ! the name might be truncated in sub_stack, so we copy to a string
435 ! of the same size
436 sub_name_short = trim(debug_clean_path(sub_name))
437
438 if (sub_name_short /= sub_stack(no_sub_stack)) then
439 write(stderr, '(a)') 'Wrong sub name on pop_sub :'
440 write(stderr, '(2a)') ' got : ', sub_name_short
441 write(stderr, '(2a)') ' expected : ', sub_stack(no_sub_stack)
442 call mpi_world%abort()
443 stop
444 end if
445
446 if (debug%trace_file) then
447 call debug_open_trace(iunit)
448 call pop_sub_write(iunit)
449 ! close file to ensure flushing
450 close(iunit)
451 end if
452
453 if (debug%trace_term .and. mpi_grp_is_root(mpi_world)) then
454 ! write to stderr if we are node 0
455 call pop_sub_write(stderr)
456 end if
457
459
460 contains
461
462 subroutine pop_sub_write(iunit_out)
463 integer, intent(in) :: iunit_out
464
465 integer :: ii
466 character(len=1000) :: tmpstr
467
468 write(tmpstr,'(a,i6,a,i6.6,f20.6,i8, a)') "* O ", &
469 sec, '.', usec, &
471 loct_get_memory_usage() / 1024, " | "
472 do ii = no_sub_stack - 1, 1, -1
473 write(tmpstr,'(2a)') trim(tmpstr), "..|"
474 end do
475 write(tmpstr,'(2a)') trim(tmpstr), trim(sub_stack(no_sub_stack))
476
477 write(iunit_out, '(a)') trim(tmpstr)
478
479 end subroutine pop_sub_write
480
481 end subroutine debug_pop_sub
482#endif
483
484 ! -----------------------------------------------------------
486 character(len=MAX_PATH_LEN) function debug_clean_path(filename) result(clean_path)
487 character(len=*), intent(in) :: filename
488
489 integer :: pos
490
491 pos = index(filename, 'src/', back = .true.)
492 if (pos == 0) then
493 ! 'src/' does not occur
494 clean_path = filename
495 else
496 ! remove 'src/'
497 clean_path = filename(pos+4:)
498 end if
499
500 end function debug_clean_path
501
502end module debug_oct_m
503
504!! Local Variables:
505!! mode: f90
506!! coding: utf-8
507!! End:
subroutine pop_sub_write(iunit_out)
Definition: debug.F90:556
subroutine push_sub_write(iunit_out)
Definition: debug.F90:476
File-handling.
Definition: loct.F90:216
character(len=max_path_len) function, public debug_clean_path(filename)
Prune a filename path to only include subdirectories of the "src" directory.
Definition: debug.F90:580
subroutine, public debug_enable(this)
Definition: debug.F90:318
type(debug_t), save, public debug
Definition: debug.F90:154
subroutine, public debug_pop_sub(sub_name)
Pop a routine from the debug trace.
Definition: debug.F90:498
subroutine, public debug_open_trace(iunit)
Definition: debug.F90:356
subroutine, public epoch_time_diff(sec, usec)
Definition: debug.F90:388
subroutine from_bits(this)
Definition: debug.F90:371
subroutine, public debug_init(this, namespace)
Definition: debug.F90:170
subroutine, public debug_disable(this)
Definition: debug.F90:333
subroutine time_diff(sec1, usec1, sec2, usec2)
Computes t2 <- t2-t1. sec1,2 and usec1,2 are seconds,microseconds of t1,2.
Definition: debug.F90:401
subroutine, public debug_push_sub(sub_name)
Push a routine to the debug trace.
Definition: debug.F90:429
subroutine, public debug_delete_trace()
Definition: debug.F90:342
integer, public s_epoch_sec
global epoch time (time at startup)
Definition: global.F90:234
integer, public no_sub_stack
Definition: global.F90:239
real(real64), dimension(50), public time_stack
Definition: global.F90:238
character(len=80), dimension(50), public sub_stack
The stack.
Definition: global.F90:237
integer, public s_epoch_usec
Definition: global.F90:234
subroutine, public mpi_debug_init(rank, info)
Definition: mpi_debug.F90:182
logical function mpi_grp_is_root(grp)
Is the current MPI process of grpcomm, root.
Definition: mpi.F90:430
type(mpi_grp_t), public mpi_world
Definition: mpi.F90:266
type(namespace_t), public global_namespace
Definition: namespace.F90:132
integer function, public parse_block(namespace, name, blk, check_varinfo_)
Definition: parser.F90:618
int true(void)