Octopus
accel.F90
Go to the documentation of this file.
1!! Copyright (C) 2010-2016 X. Andrade
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17!!
18
19#include "global.h"
20
21#if defined(HAVE_CUDA)
22#define HAVE_ACCEL 1
23#endif
24
25module accel_oct_m
27 use cuda_oct_m
28 use debug_oct_m
29 use global_oct_m
30 use iso_c_binding, only: c_size_t
31 use, intrinsic :: iso_fortran_env
32 use loct_oct_m
33 use math_oct_m
35 use mpi_oct_m
37 use types_oct_m
38 use parser_oct_m
42 use string_oct_m
43
44 implicit none
45
46 private
47
48 public :: &
53 accel_t, &
56 accel_init, &
57 accel_end, &
87
88 integer, public, parameter :: &
89 ACCEL_MEM_READ_ONLY = 0, &
92
94 ! Components are public by default
95#if defined(HAVE_CUDA)
96 type(c_ptr) :: cuda_context
97#else
98 integer :: dummy
99#endif
100 end type accel_context_t
101
102 type accel_device_t
103 ! Components are public by default
104#if defined(HAVE_CUDA)
105 type(c_ptr) :: cuda_device
106#else
107 integer :: dummy
108#endif
109 end type accel_device_t
110
111 type accel_t
112 ! Components are public by default
113 type(accel_context_t) :: context
114 type(accel_device_t) :: device
115 type(c_ptr) :: cublas_handle
116 type(c_ptr) :: cuda_stream
117 type(c_ptr) :: module_map
118 integer :: max_workgroup_size
119 integer(int64) :: local_memory_size
120 integer(int64) :: global_memory_size
121 logical :: enabled
122 logical :: allow_CPU_only
123 logical :: shared_mem
124 logical :: cuda_mpi
125 integer :: warp_size
126 integer(int64) :: initialize_buffers
127 character(len=32) :: debug_flag
128 integer(int64) :: max_block_dim(3)
129 integer(int64) :: max_grid_dim(3)
130 end type accel_t
131
132 type accel_mem_t
133 ! Components are public by default
134 type(c_ptr) :: mem
135 integer(c_size_t) :: size = 0
136 type(type_t) :: type
137 integer :: flags = 0
138 logical :: allocated = .false.
139 end type accel_mem_t
140
141 type accel_kernel_t
142 ! Components are public by default
143#ifdef HAVE_CUDA
144 type(c_ptr) :: cuda_kernel
145 type(c_ptr) :: cuda_module
146 type(c_ptr) :: arguments
147#endif
148 integer(int64) :: cuda_shared_mem
149 logical :: initialized = .false.
150 type(accel_kernel_t), pointer :: next
151 integer :: arg_count
152 character(len=128) :: kernel_name
153 end type accel_kernel_t
154
155 type(accel_t), public :: accel
156
157 ! Global variables defined on device
158 type(accel_mem_t), public, save :: zM_0_buffer, zM_1_buffer
159 type(accel_mem_t), public, save :: dM_0_buffer, dM_1_buffer
160
161 ! the kernels
162 type(accel_kernel_t), public, target, save :: kernel_vpsi
163 type(accel_kernel_t), public, target, save :: kernel_vpsi_complex
164 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
165 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors_complex
166 type(accel_kernel_t), public, target, save :: kernel_daxpy
167 type(accel_kernel_t), public, target, save :: kernel_zaxpy
168 type(accel_kernel_t), public, target, save :: kernel_copy
169 type(accel_kernel_t), public, target, save :: kernel_copy_complex_to_real
170 type(accel_kernel_t), public, target, save :: kernel_copy_real_to_complex
171 type(accel_kernel_t), public, target, save :: dpack
172 type(accel_kernel_t), public, target, save :: zpack
173 type(accel_kernel_t), public, target, save :: dunpack
174 type(accel_kernel_t), public, target, save :: zunpack
175 type(accel_kernel_t), public, target, save :: kernel_ghost_reorder
176 type(accel_kernel_t), public, target, save :: kernel_density_real
177 type(accel_kernel_t), public, target, save :: kernel_density_complex
178 type(accel_kernel_t), public, target, save :: kernel_density_spinors
179 type(accel_kernel_t), public, target, save :: kernel_phase
180 type(accel_kernel_t), public, target, save :: kernel_phase_spiral
181 type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
182 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
183 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors
184 type(accel_kernel_t), public, target, save :: dkernel_batch_axpy
185 type(accel_kernel_t), public, target, save :: zkernel_batch_axpy
186 type(accel_kernel_t), public, target, save :: dkernel_ax_function_py
187 type(accel_kernel_t), public, target, save :: zkernel_ax_function_py
188 type(accel_kernel_t), public, target, save :: dkernel_batch_dotp
189 type(accel_kernel_t), public, target, save :: zkernel_batch_dotp
190 type(accel_kernel_t), public, target, save :: dzmul
191 type(accel_kernel_t), public, target, save :: zzmul
192
195 end interface accel_padded_size
196
199 end interface accel_create_buffer
200
201 interface accel_kernel_run
203 end interface accel_kernel_run
204
207 end interface accel_set_buffer_to_zero
232 end interface accel_write_buffer
234 interface accel_read_buffer
249 end interface accel_read_buffer
251 interface accel_set_kernel_arg
252 module procedure &
277 module procedure &
285 module procedure &
291
292 integer :: buffer_alloc_count
293 integer(int64) :: allocated_mem
294 type(accel_kernel_t), pointer :: head
295 type(alloc_cache_t) :: memcache
297contains
298
299 pure logical function accel_is_enabled() result(enabled)
300#ifdef HAVE_ACCEL
301 enabled = accel%enabled
302#else
303 enabled = .false.
304#endif
305 end function accel_is_enabled
306
307 ! ------------------------------------------
308
309 pure logical function accel_allow_cpu_only() result(allow)
310#ifdef HAVE_ACCEL
311 allow = accel%allow_CPU_only
312#else
313 allow = .true.
314#endif
315 end function accel_allow_cpu_only
316
317 ! ------------------------------------------
318
319 subroutine accel_init(base_grp, namespace)
320 type(mpi_grp_t), intent(inout) :: base_grp
321 type(namespace_t), intent(in) :: namespace
322
323 logical :: disable, default, run_benchmark
324 integer :: idevice
325#ifdef HAVE_CUDA
326 integer :: dim
327#ifdef HAVE_MPI
328 character(len=256) :: sys_name
329#endif
330#endif
331
332 push_sub(accel_init)
333
334 buffer_alloc_count = 0
335
336 !%Variable DisableAccel
337 !%Type logical
338 !%Default yes
339 !%Section Execution::Accel
340 !%Description
341 !% If Octopus was compiled with CUDA support, it will
342 !% try to initialize and use an accelerator device. By setting this
343 !% variable to <tt>yes</tt> you force Octopus not to use an accelerator even it is available.
344 !%End
345 call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel')
346#ifdef HAVE_ACCEL
347 default = .false.
348#else
349 default = .true.
350#endif
351 call parse_variable(namespace, 'DisableAccel', default, disable)
352 accel%enabled = .not. disable
353
354#ifndef HAVE_ACCEL
355 if (accel%enabled) then
356 message(1) = 'Octopus was compiled without Cuda support.'
357 call messages_fatal(1)
358 end if
359#endif
360
361 if (.not. accel_is_enabled()) then
362 pop_sub(accel_init)
363 return
364 end if
365
366 call messages_obsolete_variable(namespace, 'AccelPlatform')
367 call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform')
368
369 !%Variable AccelDevice
370 !%Type integer
371 !%Default 0
372 !%Section Execution::Accel
373 !%Description
374 !% This variable selects the GPU that Octopus will use. You can specify a
375 !% numerical id to select a specific device.
376 !%
377 !% In case of MPI enabled runs devices are distributed in a round robin fashion,
378 !% starting at this value.
379 !%End
380 call parse_variable(namespace, 'AccelDevice', 0, idevice)
381
382 call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice')
383
384 if (idevice < 0) then
385 call messages_write('Invalid AccelDevice')
386 call messages_fatal()
387 end if
389 call messages_print_with_emphasis(msg="GPU acceleration", namespace=namespace)
391#ifdef HAVE_CUDA
392 if (idevice<0) idevice = 0
393 call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
394 idevice, base_grp%rank)
395#ifdef HAVE_MPI
396 call loct_sysname(sys_name)
397 write(message(1), '(A,I5,A,I5,2A)') "Rank ", base_grp%rank, " uses device number ", idevice, &
398 " on ", trim(sys_name)
399 call messages_info(1, all_nodes = .true.)
400#endif
401
402 accel%shared_mem = .true.
403
404 call cublas_init(accel%cublas_handle, accel%cuda_stream)
405#endif
406
407
408 ! Get some device information that we will need later
409#ifdef HAVE_CUDA
410 call cuda_device_total_memory(accel%device%cuda_device, accel%global_memory_size)
411 call cuda_device_shared_memory(accel%device%cuda_device, accel%local_memory_size)
412 call cuda_device_max_threads_per_block(accel%device%cuda_device, accel%max_workgroup_size)
413 call cuda_device_get_warpsize(accel%device%cuda_device, accel%warp_size)
414 call cuda_device_max_block_dim_x(accel%device%cuda_device, dim)
415 accel%max_block_dim(1) = int(dim, int64)
416 call cuda_device_max_block_dim_y(accel%device%cuda_device, dim)
417 accel%max_block_dim(2) = int(dim, int64)
418 call cuda_device_max_block_dim_z(accel%device%cuda_device, dim)
419 accel%max_block_dim(3) = int(dim, int64)
420 call cuda_device_max_grid_dim_x(accel%device%cuda_device, dim)
421 accel%max_grid_dim(1) = int(dim, int64)
422 call cuda_device_max_grid_dim_y(accel%device%cuda_device, dim)
423 accel%max_grid_dim(2) = int(dim, int64)
424 call cuda_device_max_grid_dim_z(accel%device%cuda_device, dim)
425 accel%max_grid_dim(3) = int(dim, int64)
426#endif
427
428 if (base_grp%is_root()) call device_info()
429
430 ! initialize the cache used to speed up allocations
431 call alloc_cache_init(memcache, nint(0.25_real64*accel%global_memory_size, int64))
432
433 ! now initialize the kernels
435
436#if defined(HAVE_HIP)
437 accel%debug_flag = "-g"
438#elif defined(HAVE_CUDA)
439 accel%debug_flag = "-lineinfo"
440#endif
441
442 call accel_kernel_start_call(kernel_vpsi, 'vpsi.cu', "vpsi")
443 call accel_kernel_start_call(kernel_vpsi_complex, 'vpsi.cu', "vpsi_complex")
444 call accel_kernel_start_call(kernel_vpsi_spinors, 'vpsi.cu', "vpsi_spinors")
445 call accel_kernel_start_call(kernel_vpsi_spinors_complex, 'vpsi.cu', "vpsi_spinors_complex")
446 call accel_kernel_start_call(kernel_daxpy, 'axpy.cu', "daxpy", flags = '-DRTYPE_DOUBLE')
447 call accel_kernel_start_call(kernel_zaxpy, 'axpy.cu', "zaxpy", flags = '-DRTYPE_COMPLEX')
448 call accel_kernel_start_call(dkernel_batch_axpy, 'axpy.cu', "dbatch_axpy_function", &
449 flags = ' -DRTYPE_DOUBLE')
450 call accel_kernel_start_call(zkernel_batch_axpy, 'axpy.cu', "zbatch_axpy_function", &
451 flags = '-DRTYPE_COMPLEX')
452 call accel_kernel_start_call(dkernel_ax_function_py, 'axpy.cu', "dbatch_ax_function_py", &
453 flags = '-DRTYPE_DOUBLE')
454 call accel_kernel_start_call(zkernel_ax_function_py, 'axpy.cu', "zbatch_ax_function_py", &
455 flags = '-DRTYPE_COMPLEX')
456 call accel_kernel_start_call(dkernel_batch_dotp, 'mesh_batch_single.cu', "dbatch_mf_dotp")
457 call accel_kernel_start_call(zkernel_batch_dotp, 'mesh_batch_single.cu', "zbatch_mf_dotp")
458 call accel_kernel_start_call(dpack, 'pack.cu', "dpack")
459 call accel_kernel_start_call(zpack, 'pack.cu', "zpack")
460 call accel_kernel_start_call(dunpack, 'pack.cu', "dunpack")
461 call accel_kernel_start_call(zunpack, 'pack.cu', "zunpack")
462 call accel_kernel_start_call(kernel_copy, 'copy.cu', "copy")
463 call accel_kernel_start_call(kernel_copy_complex_to_real, 'copy.cu', "copy_complex_to_real")
464 call accel_kernel_start_call(kernel_copy_real_to_complex, 'copy.cu', "copy_real_to_complex")
465 call accel_kernel_start_call(kernel_ghost_reorder, 'ghost.cu', "ghost_reorder")
466 call accel_kernel_start_call(kernel_density_real, 'density.cu', "density_real")
467 call accel_kernel_start_call(kernel_density_complex, 'density.cu', "density_complex")
468 call accel_kernel_start_call(kernel_density_spinors, 'density.cu', "density_spinors")
469 call accel_kernel_start_call(kernel_phase, 'phase.cu', "phase")
470 call accel_kernel_start_call(dkernel_dot_matrix, 'mesh_batch.cu', "ddot_matrix")
471 call accel_kernel_start_call(zkernel_dot_matrix, 'mesh_batch.cu', "zdot_matrix")
472 call accel_kernel_start_call(zkernel_dot_matrix_spinors, 'mesh_batch.cu', "zdot_matrix_spinors")
473
474
475 call accel_kernel_start_call(dzmul, 'mul.cu', "dzmul", flags = '-DRTYPE_DOUBLE')
476 call accel_kernel_start_call(zzmul, 'mul.cu', "zzmul", flags = '-DRTYPE_COMPLEX')
477
478 ! Define global buffers
479 if(.not. accel_buffer_is_allocated(zm_0_buffer)) then
480 call accel_create_buffer(zm_0_buffer, accel_mem_read_only, type_cmplx, 1)
481 call accel_write_buffer(zm_0_buffer, m_z0)
482 end if
483 if(.not. accel_buffer_is_allocated(zm_1_buffer)) then
484 call accel_create_buffer(zm_1_buffer, accel_mem_read_only, type_cmplx, 1)
485 call accel_write_buffer(zm_1_buffer, m_z1)
486 end if
487 if(.not. accel_buffer_is_allocated(dm_0_buffer)) then
488 call accel_create_buffer(dm_0_buffer, accel_mem_read_only, type_float, 1)
489 call accel_write_buffer(dm_0_buffer, m_zero)
490 end if
491 if(.not. accel_buffer_is_allocated(dm_1_buffer)) then
492 call accel_create_buffer(dm_1_buffer, accel_mem_read_only, type_float, 1)
493 call accel_write_buffer(dm_1_buffer, m_one)
494 end if
495
496
497 !%Variable AccelBenchmark
498 !%Type logical
499 !%Default no
500 !%Section Execution::Accel
501 !%Description
502 !% If this variable is set to yes, Octopus will run some
503 !% routines to benchmark the performance of the accelerator device.
504 !%End
505 call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark)
506
507 call messages_obsolete_variable(namespace, 'OpenCLBenchmark', 'AccelBenchmark')
508
509 if (run_benchmark) then
511 end if
512
513 !%Variable GPUAwareMPI
514 !%Type logical
515 !%Section Execution::Accel
516 !%Description
517 !% If Octopus was compiled with GPU support and MPI support and if the MPI
518 !% implementation is GPU-aware (i.e., it supports communication using device pointers),
519 !% this switch can be set to true to use the GPU-aware MPI features. The advantage
520 !% of this approach is that it can do, e.g., peer-to-peer copies between devices without
521 !% going through the host memory.
522 !% The default is false, except when the configure switch --enable-cudampi is set, in which
523 !% case this variable is set to true.
524 !%End
525#ifdef HAVE_CUDA_MPI
526 default = .true.
527#else
528 default = .false.
529#endif
530 call parse_variable(namespace, 'GPUAwareMPI', default, accel%cuda_mpi)
531 if (accel%cuda_mpi) then
532#ifndef HAVE_CUDA_MPI
533 call messages_write("Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")
534 call messages_warning()
535#endif
536 call messages_write("Using GPU-aware MPI.")
537 call messages_info()
538 end if
539
540
541 !%Variable AllowCPUonly
542 !%Type logical
543 !%Section Execution::Accel
544 !%Description
545 !% In order to prevent waste of resources, the code will normally stop when the GPU is disabled due to
546 !% incomplete implementations or incompatibilities. AllowCPUonly = yes overrides this and allows the
547 !% code execution also in these cases.
548 !%End
549#if defined (HAVE_ACCEL)
550 default = .false.
551#else
552 default = .true.
553#endif
554 call parse_variable(namespace, 'AllowCPUonly', default, accel%allow_CPU_only)
555
556
557 !%Variable InitializeGPUBuffers
558 !%Type integer
559 !%Default no
560 !%Section Execution::Accel
561 !%Description
562 !% Initialize new GPU buffers to zero on creation (use only for debugging, as it has a performance impact!).
563 !%Option no 0
564 !% Do not initialize GPU buffers.
565 !%Option yes 1
566 !% Initialize GPU buffers to zero.
567 !%Option nan 2
568 !% Initialize GPU buffers to nan.
569 !%End
570 call parse_variable(namespace, 'InitializeGPUBuffers', option__initializegpubuffers__no, accel%initialize_buffers)
571 if (.not. varinfo_valid_option('InitializeGPUBuffers', accel%initialize_buffers)) then
572 call messages_input_error(namespace, 'InitializeGPUBuffers')
573 end if
574
575
576 call messages_print_with_emphasis(namespace=namespace)
577
578 pop_sub(accel_init)
579
580 contains
581
582 subroutine device_info()
583#ifdef HAVE_CUDA
584 integer :: version
585 character(kind=c_char) :: cval_str(257)
586#endif
587 integer :: major, minor
588 character(len=256) :: val_str
589
590 push_sub(accel_init.device_info)
591
592 call messages_new_line()
593 call messages_write('Selected device:')
594 call messages_new_line()
595
596#ifdef HAVE_CUDA
597#ifdef __HIP_PLATFORM_AMD__
598 call messages_write(' Framework : ROCm')
599#else
600 call messages_write(' Framework : CUDA')
601#endif
602#endif
603 call messages_info()
604
605#ifdef HAVE_CUDA
606 call messages_write(' Device type : GPU', new_line = .true.)
607#ifdef __HIP_PLATFORM_AMD__
608 call messages_write(' Device vendor : AMD Corporation', new_line = .true.)
609#else
610 call messages_write(' Device vendor : NVIDIA Corporation', new_line = .true.)
611#endif
612#endif
613
614#ifdef HAVE_CUDA
615 cval_str = c_null_char
616 call cuda_device_name(accel%device%cuda_device, cval_str)
617 call string_c_to_f(cval_str, val_str)
618#endif
619 call messages_write(' Device name : '//trim(val_str))
620 call messages_new_line()
621
622#ifdef HAVE_CUDA
623 call cuda_device_capability(accel%device%cuda_device, major, minor)
624#endif
625 call messages_write(' Cuda capabilities :')
626 call messages_write(major, fmt = '(i2)')
627 call messages_write('.')
628 call messages_write(minor, fmt = '(i1)')
629 call messages_new_line()
630
631 ! VERSION
632#ifdef HAVE_CUDA
633 call cuda_driver_version(version)
634 call messages_write(' Driver version : ')
635 call messages_write(version)
636#endif
637 call messages_new_line()
638
639
640 call messages_write(' Device memory :')
641 call messages_write(accel%global_memory_size, units=unit_megabytes)
643
644 call messages_write(' Local/shared memory :')
645 call messages_write(accel%local_memory_size, units=unit_kilobytes)
646 call messages_new_line()
647
648 call messages_write(' Max. group/block size :')
649 call messages_write(accel%max_workgroup_size)
650 call messages_new_line()
651
652 call messages_info()
653
654 pop_sub(accel_init.device_info)
655 end subroutine device_info
656
657 end subroutine accel_init
658
659 ! ------------------------------------------
660 subroutine accel_end(namespace)
661 type(namespace_t), intent(in) :: namespace
662
663 integer(int64) :: hits, misses
664 real(real64) :: volume_hits, volume_misses
665 logical :: found
666 type(accel_mem_t) :: tmp
667
668 push_sub(accel_end)
669
670 if (accel_is_enabled()) then
671
672 ! Release global buffers
673 call accel_release_buffer(zm_0_buffer)
674 call accel_release_buffer(zm_1_buffer)
675 call accel_release_buffer(dm_0_buffer)
676 call accel_release_buffer(dm_1_buffer)
677
678 do
679 call alloc_cache_get(memcache, alloc_cache_any_size, found, tmp%mem)
680 if (.not. found) exit
681
682#ifdef HAVE_CUDA
683 call cuda_mem_free(tmp%mem)
684#endif
685 end do
686
687 call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)
688
689 call messages_print_with_emphasis(msg="Acceleration-device allocation cache", namespace=namespace)
690
691 call messages_new_line()
692 call messages_write(' Number of allocations =')
693 call messages_write(hits + misses, new_line = .true.)
694 call messages_write(' Volume of allocations =')
695 call messages_write(volume_hits + volume_misses, fmt = 'f18.1', units = unit_gigabytes, align_left = .true., &
696 new_line = .true.)
697 call messages_write(' Hit ratio =')
698 if (hits + misses > 0) then
699 call messages_write(hits/real(hits + misses, real64)*100, fmt='(f6.1)', align_left = .true.)
700 else
701 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
702 end if
703 call messages_write('%', new_line = .true.)
704 call messages_write(' Volume hit ratio =')
705 if (volume_hits + volume_misses > 0) then
706 call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt='(f6.1)', align_left = .true.)
707 else
708 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
709 end if
710 call messages_write('%')
711 call messages_new_line()
712 call messages_info()
713
714 call messages_print_with_emphasis(namespace=namespace)
715 end if
716
718
719 if (accel_is_enabled()) then
720#ifdef HAVE_CUDA
721 call cublas_end(accel%cublas_handle)
722 if (.not. accel%cuda_mpi) then ! CUDA aware MPI finalize will do the cleanup
723 call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
724 end if
725#endif
726
727 if (buffer_alloc_count /= 0) then
728 call messages_write('Accel:')
729 call messages_write(real(allocated_mem, real64) , fmt = 'f12.1', units = unit_megabytes, align_left = .true.)
730 call messages_write(' in ')
731 call messages_write(buffer_alloc_count)
732 call messages_write(' buffers were not deallocated.')
733 call messages_fatal()
734 end if
735
736 end if
737
738 pop_sub(accel_end)
739 end subroutine accel_end
740
741 ! ------------------------------------------
742
743 integer(int64) function accel_padded_size_i8(nn) result(psize)
744 integer(int64), intent(in) :: nn
745
746 integer(int64) :: modnn, bsize
747
748 psize = nn
749
750 if (accel_is_enabled()) then
751
753
754 psize = nn
755 modnn = mod(nn, bsize)
756 if (modnn /= 0) psize = psize + bsize - modnn
757
758 end if
759
760 end function accel_padded_size_i8
761
762 ! ------------------------------------------
763
764 integer(int32) function accel_padded_size_i4(nn) result(psize)
765 integer(int32), intent(in) :: nn
766
767 psize = int(accel_padded_size_i8(int(nn, int64)), int32)
768
769 end function accel_padded_size_i4
770
771 ! ------------------------------------------
772
773 subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
774 type(accel_mem_t), intent(inout) :: this
775 integer, intent(in) :: flags
776 type(type_t), intent(in) :: type
777 integer, intent(in) :: size
778 logical, optional, intent(in) :: set_zero
779 logical, optional, intent(in) :: async
780
781 call accel_create_buffer_8(this, flags, type, int(size, int64), set_zero, async)
783
784 ! ------------------------------------------
785
786 subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
787 type(accel_mem_t), intent(inout) :: this
788 integer, intent(in) :: flags
789 type(type_t), intent(in) :: type
790 integer(int64), intent(in) :: size
791 logical, optional, intent(in) :: set_zero
792 logical, optional, intent(in) :: async
793
794 integer(int64) :: fsize
795 logical :: found
796 integer(int64) :: initialize_buffers
797
798 push_sub(accel_create_buffer_8)
799
800 this%type = type
801 this%size = size
802 this%flags = flags
803 fsize = int(size, int64)*types_get_size(type)
804 this%allocated = .true.
805
806 if (fsize > 0) then
807
808 call alloc_cache_get(memcache, fsize, found, this%mem)
809
810 if (.not. found) then
811#ifdef HAVE_CUDA
812 if(optional_default(async, .false.)) then
813 call cuda_mem_alloc_async(this%mem, fsize)
814 else
815 call cuda_mem_alloc(this%mem, fsize)
816 end if
817#endif
818 end if
819
820 buffer_alloc_count = buffer_alloc_count + 1
821 allocated_mem = allocated_mem + fsize
822
823 end if
824
825 if (present(set_zero)) then
826 initialize_buffers = merge(option__initializegpubuffers__yes, option__initializegpubuffers__no, set_zero)
827 else
828 initialize_buffers = accel%initialize_buffers
829 end if
830 select case (initialize_buffers)
831 case (option__initializegpubuffers__yes)
832 call accel_set_buffer_to(this, type, int(z'00', int8), size)
833 case (option__initializegpubuffers__nan)
834 call accel_set_buffer_to(this, type, int(z'FF', int8), size)
835 end select
836
837 pop_sub(accel_create_buffer_8)
838 end subroutine accel_create_buffer_8
839
840 ! ------------------------------------------
841
842 subroutine accel_release_buffer(this, async)
843 type(accel_mem_t), intent(inout) :: this
844 logical, optional, intent(in) :: async
845
846 logical :: put
847 integer(int64) :: fsize
848
849 push_sub(accel_release_buffer)
850
851 if (this%size > 0) then
852
853 fsize = int(this%size, int64)*types_get_size(this%type)
854
855 call alloc_cache_put(memcache, fsize, this%mem, put)
856
857 if (.not. put) then
858#ifdef HAVE_CUDA
859 if (optional_default(async, .false.)) then
860 call cuda_mem_free_async(this%mem)
861 else
862 call cuda_mem_free(this%mem)
863 end if
864#endif
865 end if
866
867 buffer_alloc_count = buffer_alloc_count - 1
868 allocated_mem = allocated_mem + fsize
869
870 end if
871
872 this%size = 0
873 this%flags = 0
874
875 this%allocated = .false.
876
877 pop_sub(accel_release_buffer)
878 end subroutine accel_release_buffer
879
880 ! ------------------------------------------------------
882 ! Check if the temporary buffers are the right size, if not reallocate them
883 subroutine accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
884 type(accel_mem_t), intent(inout) :: buffer
885 integer, intent(in) :: flags
886 type(type_t), intent(in) :: type
887 integer, intent(in) :: required_size
888 logical, intent(in) :: set_zero
889 logical, optional, intent(in) :: async
890
892
893
894 if (accel_buffer_is_allocated(buffer) .and. buffer%size < required_size) then
895 call accel_release_buffer(buffer, async=optional_default(async, .false.))
896 end if
897
898 if (.not. accel_buffer_is_allocated(buffer)) then
899 call accel_create_buffer(buffer, flags, type, required_size, set_zero=set_zero, async=optional_default(async, .false.))
900 end if
901
903 end subroutine accel_ensure_buffer_size
904
905 ! ------------------------------------------
906
907 logical pure function accel_buffer_is_allocated(this) result(allocated)
908 type(accel_mem_t), intent(in) :: this
909
910 allocated = this%allocated
911 end function accel_buffer_is_allocated
912
913 ! -----------------------------------------
914
915 subroutine accel_finish()
916 ! no push_sub, called too frequently
917
918 if (accel_is_enabled()) then
919#ifdef HAVE_CUDA
921#endif
922 end if
923 end subroutine accel_finish
924
925 ! ------------------------------------------
926
927 subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
928 type(accel_kernel_t), intent(inout) :: kernel
929 integer, intent(in) :: narg
930 type(accel_mem_t), intent(in) :: buffer
931
932 assert(accel_buffer_is_allocated(buffer))
933
934 ! no push_sub, called too frequently
935#ifdef HAVE_CUDA
936 call cuda_kernel_set_arg_buffer(kernel%arguments, buffer%mem, narg)
937#endif
938
939 end subroutine accel_set_kernel_arg_buffer
940
941 ! ------------------------------------------
942
943 subroutine accel_set_kernel_arg_local(kernel, narg, type, size)
944 type(accel_kernel_t), intent(inout) :: kernel
945 integer, intent(in) :: narg
946 type(type_t), intent(in) :: type
947 integer, intent(in) :: size
948
949 integer(int64) :: size_in_bytes
950
952
953
954 size_in_bytes = int(size, int64)*types_get_size(type)
955
956 if (size_in_bytes > accel%local_memory_size) then
957 write(message(1), '(a,f12.6,a)') "CL Error: requested local memory: ", real(size_in_bytes, real64) /1024.0, " Kb"
958 write(message(2), '(a,f12.6,a)') " available local memory: ", real(accel%local_memory_size, real64) /1024.0, " Kb"
959 call messages_fatal(2)
960 else if (size_in_bytes <= 0) then
961 write(message(1), '(a,i10)') "CL Error: invalid local memory size: ", size_in_bytes
962 call messages_fatal(1)
963 end if
964
965#ifdef HAVE_CUDA
966 kernel%cuda_shared_mem = size_in_bytes
967#endif
968
970 end subroutine accel_set_kernel_arg_local
971
972 ! ------------------------------------------
973
974 subroutine accel_kernel_run_8(kernel, globalsizes, localsizes)
975 type(accel_kernel_t), intent(inout) :: kernel
976 integer(int64), intent(in) :: globalsizes(:)
977 integer(int64), intent(in) :: localsizes(:)
978
979 integer :: dim
980 integer(int64) :: gsizes(1:3)
981 integer(int64) :: lsizes(1:3)
983 ! no push_sub, called too frequently
984
985 ! cuda needs all dimensions
986 gsizes = 1
987 lsizes = 1
988
989 dim = ubound(globalsizes, dim=1)
990
991 assert(dim == ubound(localsizes, dim=1))
992
993 ! if one size is zero, there is nothing to do
994 if (any(globalsizes == 0)) return
995
996 assert(all(localsizes > 0))
997 assert(all(localsizes <= accel_max_workgroup_size()))
998 assert(all(mod(globalsizes, localsizes) == 0))
999
1000 gsizes(1:dim) = globalsizes(1:dim)
1001 lsizes(1:dim) = localsizes(1:dim)
1002
1003#ifdef HAVE_CUDA
1004 ! Maximum dimension of a block
1005 if (any(lsizes(1:3) > accel%max_block_dim(1:3))) then
1006 message(1) = "Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
1007 message(2) = "The following conditions should be fulfilled:"
1008 write(message(3), "(A, I8, A, I8)") "Dim 1: ", lsizes(1), " <= ", accel%max_block_dim(1)
1009 write(message(4), "(A, I8, A, I8)") "Dim 2: ", lsizes(2), " <= ", accel%max_block_dim(2)
1010 write(message(5), "(A, I8, A, I8)") "Dim 3: ", lsizes(3), " <= ", accel%max_block_dim(3)
1011 message(6) = "This is an internal error, please contact the developers."
1012 call messages_fatal(6)
1013 end if
1014
1015
1016 ! Maximum number of threads per block
1017 if (product(lsizes) > accel_max_workgroup_size()) then
1018 message(1) = "Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
1019 message(2) = "The following condition should be fulfilled:"
1020 write(message(3), "(I8, A, I8)") product(lsizes), " <= ", accel_max_workgroup_size()
1021 message(4) = "This is an internal error, please contact the developers."
1022 call messages_fatal(4)
1023 end if
1024
1025 gsizes(1:3) = gsizes(1:3)/lsizes(1:3)
1026
1027 ! Maximum dimensions of the grid of thread block
1028 if (any(gsizes(1:3) > accel%max_grid_dim(1:3))) then
1029 message(1) = "Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
1030 message(2) = "The following conditions should be fulfilled:"
1031 write(message(3), "(A, I8, A, I10)") "Dim 1: ", gsizes(1), " <= ", accel%max_grid_dim(1)
1032 write(message(4), "(A, I8, A, I10)") "Dim 2: ", gsizes(2), " <= ", accel%max_grid_dim(2)
1033 write(message(5), "(A, I8, A, I10)") "Dim 3: ", gsizes(3), " <= ", accel%max_grid_dim(3)
1034 message(6) = "This is an internal error, please contact the developers."
1035 call messages_fatal(6)
1036 end if
1037
1038 call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), lsizes(1), kernel%cuda_shared_mem, kernel%arguments)
1039
1040 kernel%cuda_shared_mem = 0
1041#endif
1042
1043 end subroutine accel_kernel_run_8
1044
1045 ! -----------------------------------------------
1047 subroutine accel_kernel_run_4(kernel, globalsizes, localsizes)
1048 type(accel_kernel_t), intent(inout) :: kernel
1049 integer, intent(in) :: globalsizes(:)
1050 integer, intent(in) :: localsizes(:)
1051
1052 call accel_kernel_run_8(kernel, int(globalsizes, int64), int(localsizes, int64))
1053
1054 end subroutine accel_kernel_run_4
1055
1056 ! -----------------------------------------------
1058 integer pure function accel_max_workgroup_size() result(max_workgroup_size)
1059 max_workgroup_size = accel%max_workgroup_size
1060 end function accel_max_workgroup_size
1061
1062 ! -----------------------------------------------
1064 integer function accel_kernel_workgroup_size(kernel) result(workgroup_size)
1065 type(accel_kernel_t), intent(inout) :: kernel
1066
1067#ifdef HAVE_CUDA
1068 integer :: max_workgroup_size
1069#endif
1070
1071 workgroup_size = 0
1072
1073#ifdef HAVE_CUDA
1074 call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_workgroup_size)
1075 if (debug%info .and. max_workgroup_size /= accel%max_workgroup_size) then
1076 write(message(1), "(A, I5, A)") "A kernel can use only less threads per block (", workgroup_size, ")", &
1077 "than available on the device (", accel%max_workgroup_size, ")"
1078 call messages_info(1)
1079 end if
1080 ! recommended number of threads per block is 256 according to the CUDA best practice guide
1081 ! see https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#thread-and-block-heuristics
1082 workgroup_size = 256
1083 ! make sure we do not use more threads per block than available for this kernel
1084 workgroup_size = min(workgroup_size, max_workgroup_size)
1085#endif
1086
1087 end function accel_kernel_workgroup_size
1088
1089 ! ----------------------------------------------------
1090
1091 subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
1092 type(accel_mem_t), intent(inout) :: buffer
1093 type(type_t), intent(in) :: type
1094 integer(int8), intent(in) :: val
1095 integer(int64), intent(in) :: nval
1096 integer(int64), optional, intent(in) :: offset
1097 logical, optional, intent(in) :: async
1098
1099 integer(int64) :: nval_, offset_, type_size
1100
1101 push_sub(accel_set_buffer_to)
1102
1103 if (nval == 0) then
1104 pop_sub(accel_set_buffer_to)
1105 return
1106 end if
1107 assert(nval > 0)
1108
1109 if (present(offset)) then
1110 assert(offset >= 0)
1111 if(offset > buffer%size) then
1112 pop_sub(accel_set_buffer_to)
1113 return
1114 end if
1115 end if
1116
1117 type_size = types_get_size(type)
1118
1119 nval_ = nval*type_size
1120
1121 offset_ = 0_int64
1122 if (present(offset)) offset_ = offset*type_size
1123
1124 call cuda_mem_set_async(buffer%mem, val, nval_, offset_)
1125 if(.not. optional_default(async, .false.)) call accel_finish()
1126
1127 pop_sub(accel_set_buffer_to)
1128 end subroutine accel_set_buffer_to
1129
1130 ! ----------------------------------------------------
1131
1132 subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
1133 type(accel_mem_t), intent(inout) :: buffer
1134 type(type_t), intent(in) :: type
1135 integer(int64), intent(in) :: nval
1136 integer(int64), optional, intent(in) :: offset
1137 logical, optional, intent(in) :: async
1138
1140
1141 call accel_set_buffer_to(buffer, type, int(z'00', int8), nval, offset, async)
1142
1144 end subroutine accel_set_buffer_to_zero_i8
1145
1146 ! ----------------------------------------------------
1147
1148 subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
1149 type(accel_mem_t), intent(inout) :: buffer
1150 type(type_t), intent(in) :: type
1151 integer(int32), intent(in) :: nval
1152 integer(int32), optional, intent(in) :: offset
1153 logical, optional, intent(in) :: async
1154
1156
1157 if (present(offset)) then
1158 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), int(offset, int64), async=async)
1159 else
1160 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), async=async)
1161 end if
1162
1164 end subroutine accel_set_buffer_to_zero_i4
1165
1166 ! ----------------------------------------------------
1167
1168 subroutine accel_check_bandwidth()
1169 integer :: itime
1170 integer, parameter :: times = 10
1171 integer :: size
1172 real(real64) :: time, stime
1173 real(real64) :: read_bw, write_bw
1174 type(accel_mem_t) :: buff
1175 real(real64), allocatable :: data(:)
1176
1177 call messages_new_line()
1178 call messages_write('Info: Benchmarking the bandwidth between main memory and device memory')
1179 call messages_new_line()
1180 call messages_info()
1181
1182 call messages_write(' Buffer size Read bw Write bw')
1183 call messages_new_line()
1184 call messages_write(' [MiB] [MiB/s] [MiB/s]')
1185 call messages_info()
1186
1187 size = 15000
1188 do
1189 safe_allocate(data(1:size))
1190 call accel_create_buffer(buff, accel_mem_read_write, type_float, size)
1191
1192 stime = loct_clock()
1193 do itime = 1, times
1194 call accel_write_buffer(buff, size, data)
1195 call accel_finish()
1196 end do
1197 time = (loct_clock() - stime)/real(times, real64)
1198
1199 write_bw = real(size, real64) *8.0_real64/time
1200
1201 stime = loct_clock()
1202 do itime = 1, times
1203 call accel_read_buffer(buff, size, data)
1204 end do
1205 call accel_finish()
1206
1207 time = (loct_clock() - stime)/real(times, real64)
1208 read_bw = real(size, real64) *8.0_real64/time
1209
1210 call messages_write(size*8.0_real64/1024.0_real64**2)
1211 call messages_write(write_bw/1024.0_real64**2, fmt = '(f10.1)')
1212 call messages_write(read_bw/1024.0_real64**2, fmt = '(f10.1)')
1213 call messages_info()
1214
1215 call accel_release_buffer(buff)
1216
1217 safe_deallocate_a(data)
1218
1219 size = int(size*2.0)
1221 if (size > 50000000) exit
1222 end do
1223 end subroutine accel_check_bandwidth
1224
1225 ! ----------------------------------------------------
1226
1227 logical pure function accel_use_shared_mem() result(use_shared_mem)
1228
1229 use_shared_mem = accel%shared_mem
1230
1231 end function accel_use_shared_mem
1232
1233 !------------------------------------------------------------
1234
1235 subroutine accel_kernel_global_init()
1236
1237 push_sub(accel_kernel_global_init)
1238
1239 nullify(head)
1240
1241 call cuda_module_map_init(accel%module_map)
1242
1244 end subroutine accel_kernel_global_init
1245
1246 !------------------------------------------------------------
1247
1248 subroutine accel_kernel_global_end()
1249 type(accel_kernel_t), pointer :: next_head
1250
1251 push_sub(accel_kernel_global_end)
1252
1253 do
1254 if (.not. associated(head)) exit
1255 next_head => head%next
1257 head => next_head
1258 end do
1259
1260 if (accel_is_enabled()) then
1261 call cuda_module_map_end(accel%module_map)
1262 end if
1263
1265 end subroutine accel_kernel_global_end
1266
1267 !------------------------------------------------------------
1268
1269 subroutine accel_kernel_build(this, file_name, kernel_name, flags)
1270 type(accel_kernel_t), intent(inout) :: this
1271 character(len=*), intent(in) :: file_name
1272 character(len=*), intent(in) :: kernel_name
1273 character(len=*), optional, intent(in) :: flags
1274
1275#ifdef HAVE_CUDA
1276 character(len=1000) :: all_flags
1277#endif
1279 push_sub(accel_kernel_build)
1280
1281 call profiling_in("ACCEL_COMPILE", exclude = .true.)
1282
1283#ifdef HAVE_CUDA
1284 all_flags = '-I'//trim(conf%share)//'/kernels/'//" "//trim(accel%debug_flag)
1285
1286 if (accel_use_shared_mem()) then
1287 all_flags = trim(all_flags)//' -DSHARED_MEM'
1288 end if
1289
1290 if (present(flags)) then
1291 all_flags = trim(all_flags)//' '//trim(flags)
1292 end if
1293
1294 call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, &
1295 string_f_to_c(trim(file_name)), string_f_to_c(trim(all_flags)))
1297 call cuda_create_kernel(this%cuda_kernel, this%cuda_module, string_f_to_c(trim(kernel_name)))
1298 call cuda_alloc_arg_array(this%arguments)
1299
1300 this%cuda_shared_mem = 0
1301#endif
1302
1303 this%initialized = .true.
1304 this%kernel_name = trim(kernel_name)
1305
1306 call profiling_out("ACCEL_COMPILE")
1307
1308 pop_sub(accel_kernel_build)
1309 end subroutine accel_kernel_build
1310
1311 !------------------------------------------------------------
1312
1313 subroutine accel_kernel_end(this)
1314 type(accel_kernel_t), intent(inout) :: this
1316 push_sub(accel_kernel_end)
1317
1318#ifdef HAVE_CUDA
1319 call cuda_free_arg_array(this%arguments)
1320 call cuda_release_kernel(this%cuda_kernel)
1321 ! modules are not released here, since they are not associated to a kernel
1322#endif
1324 this%initialized = .false.
1325
1326 pop_sub(accel_kernel_end)
1327 end subroutine accel_kernel_end
1328
1329 !------------------------------------------------------------
1330
1331 subroutine accel_kernel_start_call(this, file_name, kernel_name, flags)
1332 type(accel_kernel_t), target, intent(inout) :: this
1333 character(len=*), intent(in) :: file_name
1334 character(len=*), intent(in) :: kernel_name
1335 character(len=*), optional, intent(in) :: flags
1336
1337 push_sub(accel_kernel_start_call)
1338
1339 if (.not. this%initialized) then
1340 call accel_kernel_build(this, file_name, kernel_name, flags)
1341 this%next => head
1342 head => this
1343 end if
1344
1346 end subroutine accel_kernel_start_call
1347
1348 !--------------------------------------------------------------
1349
1350 integer(int64) pure function accel_global_memory_size() result(size)
1351
1352 size = accel%global_memory_size
1353
1354 end function accel_global_memory_size
1355
1356 !--------------------------------------------------------------
1357
1358 integer(int64) pure function accel_local_memory_size() result(size)
1359
1360 size = accel%local_memory_size
1361
1362 end function accel_local_memory_size
1363
1364 !--------------------------------------------------------------
1365
1366 integer pure function accel_max_size_per_dim(dim) result(size)
1367 integer, intent(in) :: dim
1368
1369 size = 0
1370#ifdef HAVE_CUDA
1371 size = 32768
1372 if (dim == 1) size = 2**30
1373#endif
1374 end function accel_max_size_per_dim
1375
1376 ! ------------------------------------------------------
1377
1378 subroutine accel_set_stream(stream_number)
1379 integer, intent(in) :: stream_number
1380
1381 push_sub(accel_set_stream)
1382
1383 if (accel_is_enabled()) then
1384#ifdef HAVE_CUDA
1385 call cuda_set_stream(accel%cuda_stream, stream_number)
1386 call cublas_set_stream(accel%cublas_handle, accel%cuda_stream)
1387#endif
1388 end if
1389
1390 pop_sub(accel_set_stream)
1391 end subroutine accel_set_stream
1392
1393 ! ------------------------------------------------------
1394
1395 subroutine accel_get_stream(stream_number)
1396 integer, intent(inout) :: stream_number
1397
1398 push_sub(accel_get_stream)
1399
1400 if (accel_is_enabled()) then
1401#ifdef HAVE_CUDA
1402 call cuda_get_stream(stream_number)
1403#endif
1404 end if
1405
1406 pop_sub(accel_get_stream)
1407 end subroutine accel_get_stream
1408
1409 ! ------------------------------------------------------
1410
1413
1414 if (accel_is_enabled()) then
1415#ifdef HAVE_CUDA
1416 call cuda_synchronize_all_streams()
1417#endif
1418 end if
1419
1421 end subroutine accel_synchronize_all_streams
1422
1423 function daccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
1424 type(c_ptr), intent(in) :: buffer
1425 integer(int64), intent(in) :: offset
1426 type(c_ptr) :: buffer_offset
1427
1429#ifdef HAVE_CUDA
1430 call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)
1431#else
1432 ! this is needed to make the compiler happy for non-GPU compilations
1433 buffer_offset = buffer
1434#endif
1437
1438 function zaccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
1439 type(c_ptr), intent(in) :: buffer
1440 integer(int64), intent(in) :: offset
1441 type(c_ptr) :: buffer_offset
1442
1444#ifdef HAVE_CUDA
1445 call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)
1446#else
1447 ! this is needed to make the compiler happy for non-GPU compilations
1448 buffer_offset = buffer
1449#endif
1452
1453 subroutine accel_clean_pointer(buffer)
1454 type(c_ptr), intent(in) :: buffer
1455
1456 push_sub(accel_clean_pointer)
1457#ifdef HAVE_CUDA
1458 call cuda_clean_pointer(buffer)
1459#endif
1460 pop_sub(accel_clean_pointer)
1461 end subroutine accel_clean_pointer
1462
1466 subroutine accel_get_unfolded_size(size, grid_size, thread_block_size)
1467 integer(int64), intent(in) :: size
1468 integer(int64), intent(out) :: grid_size
1469 integer(int64), intent(out) :: thread_block_size
1470
1471 push_sub(accel_get_unfolded_size)
1472#ifdef __HIP_PLATFORM_AMD__
1473 ! not benefitial for AMD chips
1474 grid_size = size
1475 thread_block_size = size
1476#else
1477 grid_size = size * accel%warp_size
1478 thread_block_size = accel%warp_size
1479#endif
1481 end subroutine accel_get_unfolded_size
1482
1483#include "undef.F90"
1484#include "real.F90"
1485#include "accel_inc.F90"
1486
1487#include "undef.F90"
1488#include "complex.F90"
1489#include "accel_inc.F90"
1490
1491#include "undef.F90"
1492#include "integer.F90"
1493#include "accel_inc.F90"
1494
1495#include "undef.F90"
1496#include "integer8.F90"
1497#include "accel_inc.F90"
1498
1499end module accel_oct_m
1500
1501!! Local Variables:
1502!! mode: f90
1503!! coding: utf-8
1504!! End:
subroutine device_info()
Definition: accel.F90:643
subroutine laccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5756
subroutine zaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3543
type(accel_kernel_t), target, save, public dkernel_batch_axpy
Definition: accel.F90:279
subroutine, public accel_clean_pointer(buffer)
Definition: accel.F90:1419
subroutine accel_kernel_global_end()
Definition: accel.F90:1234
subroutine zaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2715
subroutine, public accel_get_unfolded_size(size, grid_size, thread_block_size)
Get unfolded size: some kernels (e.g. projectors) unfold the array across warps as an optimization....
Definition: accel.F90:1432
subroutine laccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:5556
subroutine laccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:5311
subroutine iaccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:3779
pure logical function, public accel_allow_cpu_only()
Definition: accel.F90:405
subroutine daccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2357
subroutine zaccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:3365
logical pure function, public accel_use_shared_mem()
Definition: accel.F90:1213
subroutine daccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:2075
subroutine zaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3452
subroutine laccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:5335
subroutine zaccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:3588
subroutine daccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:1833
subroutine laccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:4906
subroutine daccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:1805
subroutine laccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5643
subroutine zaccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:3198
subroutine zaccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3030
subroutine zaccel_write_buffer_single(this, data, async)
Definition: accel.F90:2612
subroutine daccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:1994
subroutine iaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4660
subroutine zaccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:3089
subroutine laccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5667
subroutine iaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3995
type(accel_kernel_t), target, save, public zkernel_batch_axpy
Definition: accel.F90:280
subroutine zaccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:2994
subroutine laccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:5151
subroutine zaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3120
subroutine iaccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:3760
subroutine zaccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:3615
subroutine laccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:4984
subroutine daccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:1644
subroutine iaccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:3918
subroutine, public accel_kernel_start_call(this, file_name, kernel_name, flags)
Definition: accel.F90:1297
subroutine iaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3810
subroutine zaccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:2765
subroutine iaccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:4710
subroutine iaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:4400
subroutine zaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:2861
subroutine, public accel_get_stream(stream_number)
Definition: accel.F90:1361
subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
Definition: accel.F90:813
subroutine zaccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:3247
subroutine iaccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:3888
integer(int64) pure function, public accel_global_memory_size()
Definition: accel.F90:1316
subroutine daccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:2049
subroutine laccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:4856
type(accel_kernel_t), target, save, public zkernel_dot_matrix_spinors
Definition: accel.F90:278
type(accel_kernel_t), target, save, public zkernel_ax_function_py
Definition: accel.F90:282
subroutine daccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:1975
subroutine daccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:1589
subroutine zaccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:3436
subroutine daccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2470
subroutine iaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:4215
subroutine accel_set_kernel_arg_local(kernel, narg, type, size)
Definition: accel.F90:983
subroutine daccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2448
integer(int64) function accel_padded_size_i8(nn)
Definition: accel.F90:783
subroutine accel_check_bandwidth()
Definition: accel.F90:1154
subroutine iaccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:4023
subroutine daccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:2304
subroutine laccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:5221
subroutine daccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:1530
subroutine iaccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:4683
subroutine zaccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:3228
subroutine iaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4615
subroutine, public accel_finish()
Definition: accel.F90:955
subroutine laccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:5119
subroutine accel_kernel_global_init()
Definition: accel.F90:1221
subroutine zaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3305
subroutine daccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:2103
subroutine zaccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:2665
subroutine iaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4547
subroutine laccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:5590
subroutine laccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:5779
subroutine, public accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
Definition: accel.F90:923
subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
Definition: accel.F90:1077
subroutine laccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:5627
subroutine iaccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:3860
subroutine daccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:1570
subroutine daccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:2133
subroutine zaccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:3070
subroutine iaccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:4055
subroutine daccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2025
subroutine laccel_write_buffer_single(this, data, async)
Definition: accel.F90:4803
subroutine laccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:5389
subroutine daccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2381
subroutine iaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:4361
subroutine zaccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:2823
subroutine zaccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:2842
subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
Definition: accel.F90:1118
subroutine zaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3520
logical pure function, public accel_buffer_is_allocated(this)
Definition: accel.F90:947
integer, parameter, public accel_mem_read_write
Definition: accel.F90:183
subroutine daccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2493
subroutine accel_kernel_end(this)
Definition: accel.F90:1279
type(accel_kernel_t), target, save, public dkernel_ax_function_py
Definition: accel.F90:281
subroutine laccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:5091
subroutine zaccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:2960
subroutine zaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3476
subroutine daccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:1620
subroutine laccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:5457
type(c_ptr) function, public daccel_get_pointer_with_offset(buffer, offset)
Definition: accel.F90:1389
subroutine iaccel_write_buffer_single(this, data, async)
Definition: accel.F90:3707
subroutine iaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4571
integer pure function, public accel_max_size_per_dim(dim)
Definition: accel.F90:1332
subroutine zaccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:3144
subroutine iaccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:4125
subroutine daccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:1935
subroutine iaccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:4342
subroutine iaccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:4460
subroutine laccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:4930
subroutine laccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:5014
subroutine zaccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:2684
subroutine laccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:5419
subroutine laccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5734
subroutine iaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4593
subroutine zaccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:2739
subroutine zaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2900
subroutine daccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:2341
subroutine iaccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:4165
subroutine accel_kernel_run_8(kernel, globalsizes, localsizes)
Definition: accel.F90:1014
subroutine, public accel_kernel_build(this, file_name, kernel_name, flags)
Definition: accel.F90:1255
subroutine, public accel_init(base_grp, namespace)
Definition: accel.F90:415
subroutine, public accel_end(namespace)
Definition: accel.F90:700
subroutine laccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:4816
subroutine daccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:1698
subroutine zaccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:3170
subroutine, public accel_synchronize_all_streams()
Definition: accel.F90:1377
subroutine, public accel_set_stream(stream_number)
Definition: accel.F90:1344
subroutine, public accel_release_buffer(this, async)
Definition: accel.F90:882
subroutine laccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:5280
subroutine daccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2520
subroutine iaccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:4428
subroutine iaccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:4239
subroutine laccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5711
subroutine iaccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:4323
subroutine iaccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:4089
integer(int32) function accel_padded_size_i4(nn)
Definition: accel.F90:804
subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
Definition: accel.F90:1134
subroutine daccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:1899
subroutine laccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:5033
subroutine iaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4638
subroutine iaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:3956
pure logical function, public accel_is_enabled()
Definition: accel.F90:395
subroutine zaccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:3399
subroutine daccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:1728
subroutine iaccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:3834
subroutine daccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:1766
subroutine iaccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:3937
integer, parameter, public accel_mem_write_only
Definition: accel.F90:183
subroutine zaccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:2928
subroutine daccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2210
subroutine laccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:5185
subroutine laccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:5438
subroutine daccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:2152
subroutine laccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:5524
subroutine daccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:1670
subroutine daccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2425
subroutine accel_kernel_run_4(kernel, globalsizes, localsizes)
Definition: accel.F90:1047
subroutine laccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:5806
subroutine iaccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:4184
subroutine laccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:4956
subroutine laccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:4875
subroutine daccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:1865
subroutine laccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:5052
subroutine zaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3498
type(c_ptr) function, public zaccel_get_pointer_with_offset(buffer, offset)
Definition: accel.F90:1404
subroutine iaccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:4494
subroutine laccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:5496
subroutine daccel_write_buffer_single(this, data, async)
Definition: accel.F90:1517
subroutine daccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:1747
integer function, public accel_kernel_workgroup_size(kernel)
Definition: accel.F90:1064
subroutine zaccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:2625
type(accel_t), public accel
Definition: accel.F90:250
subroutine laccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5689
subroutine iaccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:4293
subroutine iaccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:4531
subroutine daccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:2171
subroutine iaccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:4265
subroutine iaccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3720
subroutine zaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3565
subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
Definition: accel.F90:826
subroutine laccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:5261
subroutine daccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:2270
subroutine daccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:2238
integer(int64) pure function, public accel_local_memory_size()
Definition: accel.F90:1324
subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
Definition: accel.F90:967
integer pure function, public accel_max_workgroup_size()
Definition: accel.F90:1058
subroutine laccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:5361
subroutine zaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:3266
subroutine zaccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:2793
subroutine daccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2403
type(accel_kernel_t), pointer head
Definition: accel.F90:389
subroutine zaccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:3333
subroutine, public alloc_cache_put(alloc_cache, size, loc, put)
subroutine, public alloc_cache_get(alloc_cache, size, found, loc)
integer(int64), parameter, public alloc_cache_any_size
real(real64), parameter, public m_zero
Definition: global.F90:191
complex(real64), parameter, public m_z0
Definition: global.F90:201
complex(real64), parameter, public m_z1
Definition: global.F90:202
real(real64), parameter, public m_one
Definition: global.F90:192
System information (time, memory, sysname)
Definition: loct.F90:117
subroutine string_c_to_f(c_string, f_string)
convert a C string to a Fortran string
Definition: loct.F90:258
subroutine, public loct_sysname(name)
Definition: loct.F90:332
This module is intended to contain "only mathematical" functions and procedures.
Definition: math.F90:117
subroutine, public messages_print_with_emphasis(msg, iunit, namespace)
Definition: messages.F90:898
character(len=512), private msg
Definition: messages.F90:167
subroutine, public messages_warning(no_lines, all_nodes, namespace)
Definition: messages.F90:525
subroutine, public messages_obsolete_variable(namespace, name, rep)
Definition: messages.F90:1023
subroutine, public messages_new_line()
Definition: messages.F90:1112
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
Definition: messages.F90:162
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
Definition: messages.F90:410
subroutine, public messages_input_error(namespace, var, details, row, column)
Definition: messages.F90:691
subroutine, public messages_info(no_lines, iunit, debug_only, stress, all_nodes, namespace)
Definition: messages.F90:594
type(type_t), public type_float
Definition: types.F90:135
type(type_t), public type_cmplx
Definition: types.F90:136
integer pure function, public types_get_size(this)
Definition: types.F90:154
This module defines the unit system, used for input and output.
type(unit_t), public unit_gigabytes
For larger amounts of data (natural code units are bytes)
type(unit_t), public unit_megabytes
For large amounts of data (natural code units are bytes)
type(unit_t), public unit_kilobytes
For small amounts of data (natural code units are bytes)
int true(void)