Octopus
accel.F90
1!! Copyright (C) 2010-2016 X. Andrade
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17!!
18
19#include "global.h"
20
21#if defined(HAVE_CUDA)
22#define HAVE_ACCEL 1
23#endif
24
25 module accel_oct_m
27 use cuda_oct_m
28 use debug_oct_m
29 use global_oct_m
30 use iso_c_binding, only: c_size_t
31 use, intrinsic :: iso_fortran_env
32 use loct_oct_m
33 use math_oct_m
35 use mpi_oct_m
37 use types_oct_m
38 use parser_oct_m
42 use string_oct_m
43
44 implicit none
45
46 private
47
48 public :: &
53 accel_t, &
56 accel_init, &
57 accel_end, &
86
87 integer, public, parameter :: &
88 ACCEL_MEM_READ_ONLY = 0, &
91
92 type accel_context_t
93 ! Components are public by default
94#if defined(HAVE_CUDA)
95 type(c_ptr) :: cuda_context
96#else
97 integer :: dummy
98#endif
99 end type accel_context_t
100
101 type accel_device_t
102 ! Components are public by default
103#if defined(HAVE_CUDA)
104 type(c_ptr) :: cuda_device
105#else
106 integer :: dummy
107#endif
108 end type accel_device_t
109
110 type accel_t
111 ! Components are public by default
112 type(accel_context_t) :: context
113 type(accel_device_t) :: device
114 type(c_ptr) :: cublas_handle
115 type(c_ptr) :: cuda_stream
116 type(c_ptr) :: module_map
117 integer :: max_workgroup_size
118 integer(int64) :: shared_memory_size
119 integer(int64) :: global_memory_size
120 logical :: enabled
121 logical :: allow_CPU_only
122 logical :: cuda_mpi
123 integer :: warp_size
124 integer(int64) :: initialize_buffers
125 character(len=32) :: debug_flag
126 integer(int64) :: max_block_dim(3)
127 integer(int64) :: max_grid_dim(3)
128 end type accel_t
129
130 type accel_mem_t
131 ! Components are public by default
132 type(c_ptr) :: mem
133 integer(c_size_t) :: size = 0
134 type(type_t) :: type
135 integer :: flags = 0
136 logical :: allocated = .false.
137 end type accel_mem_t
138
139 type accel_kernel_t
140 ! Components are public by default
141#ifdef HAVE_CUDA
142 type(c_ptr) :: cuda_kernel
143 type(c_ptr) :: cuda_module
144 type(c_ptr) :: arguments
145#endif
146 integer(int64) :: cuda_shared_mem
147 logical :: initialized = .false.
148 type(accel_kernel_t), pointer :: next
149 integer :: arg_count
150 character(len=128) :: kernel_name
151 end type accel_kernel_t
152
153 type(accel_t), public :: accel
154
155 ! Global variables defined on device
156 type(accel_mem_t), public, save :: zM_0_buffer, zM_1_buffer
157 type(accel_mem_t), public, save :: dM_0_buffer, dM_1_buffer
158
159 ! the kernels
160 type(accel_kernel_t), public, target, save :: kernel_vpsi
161 type(accel_kernel_t), public, target, save :: kernel_vpsi_complex
162 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
163 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors_complex
164 type(accel_kernel_t), public, target, save :: kernel_daxpy
165 type(accel_kernel_t), public, target, save :: kernel_zaxpy
166 type(accel_kernel_t), public, target, save :: kernel_copy
167 type(accel_kernel_t), public, target, save :: kernel_copy_complex_to_real
168 type(accel_kernel_t), public, target, save :: kernel_copy_real_to_complex
169 type(accel_kernel_t), public, target, save :: dpack
170 type(accel_kernel_t), public, target, save :: zpack
171 type(accel_kernel_t), public, target, save :: dunpack
172 type(accel_kernel_t), public, target, save :: zunpack
173 type(accel_kernel_t), public, target, save :: kernel_ghost_reorder
174 type(accel_kernel_t), public, target, save :: kernel_density_real
175 type(accel_kernel_t), public, target, save :: kernel_density_complex
176 type(accel_kernel_t), public, target, save :: kernel_density_spinors
177 type(accel_kernel_t), public, target, save :: kernel_phase
178 type(accel_kernel_t), public, target, save :: kernel_phase_spiral
179 type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
180 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
181 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors
182 type(accel_kernel_t), public, target, save :: dkernel_batch_axpy
183 type(accel_kernel_t), public, target, save :: zkernel_batch_axpy
184 type(accel_kernel_t), public, target, save :: dkernel_ax_function_py
185 type(accel_kernel_t), public, target, save :: zkernel_ax_function_py
186 type(accel_kernel_t), public, target, save :: dkernel_batch_dotp
187 type(accel_kernel_t), public, target, save :: zkernel_batch_dotp
188 type(accel_kernel_t), public, target, save :: dzmul
189 type(accel_kernel_t), public, target, save :: zzmul
190
191 interface accel_padded_size
192 module procedure accel_padded_size_i4, accel_padded_size_i8
193 end interface accel_padded_size
194
195 interface accel_create_buffer
196 module procedure accel_create_buffer_4, accel_create_buffer_8
197 end interface accel_create_buffer
198
199 interface accel_kernel_run
200 module procedure accel_kernel_run_4, accel_kernel_run_8
201 end interface accel_kernel_run
202
206 interface accel_write_buffer
230 end interface accel_write_buffer
232 interface accel_read_buffer
247 end interface accel_read_buffer
249 interface accel_set_kernel_arg
250 module procedure &
274 module procedure &
282 module procedure &
288
289 integer :: buffer_alloc_count
290 integer(int64) :: allocated_mem
291 type(accel_kernel_t), pointer :: head
292 type(alloc_cache_t) :: memcache
293
294contains
295
296 pure logical function accel_is_enabled() result(enabled)
297#ifdef HAVE_ACCEL
298 enabled = accel%enabled
299#else
300 enabled = .false.
301#endif
302 end function accel_is_enabled
303
304 ! ------------------------------------------
305
306 pure logical function accel_allow_cpu_only() result(allow)
307#ifdef HAVE_ACCEL
308 allow = accel%allow_CPU_only
309#else
310 allow = .true.
311#endif
312 end function accel_allow_cpu_only
313
314 ! ------------------------------------------
315
316 subroutine accel_init(base_grp, namespace)
317 type(mpi_grp_t), intent(inout) :: base_grp
318 type(namespace_t), intent(in) :: namespace
319
320 logical :: disable, default, run_benchmark
321 integer :: idevice
322#ifdef HAVE_CUDA
323 integer :: dim
324#ifdef HAVE_MPI
325 character(len=256) :: sys_name
326#endif
327#endif
328
329 push_sub(accel_init)
330
331 buffer_alloc_count = 0
332
333 !%Variable DisableAccel
334 !%Type logical
335 !%Default yes
336 !%Section Execution::Accel
337 !%Description
338 !% If Octopus was compiled with CUDA support, it will
339 !% try to initialize and use an accelerator device. By setting this
340 !% variable to <tt>yes</tt> you force Octopus not to use an accelerator even if it is available.
341 !%End
342 call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel')
343#ifdef HAVE_ACCEL
344 default = .false.
345#else
346 default = .true.
347#endif
348 call parse_variable(namespace, 'DisableAccel', default, disable)
349 accel%enabled = .not. disable
350
351#ifndef HAVE_ACCEL
352 if (accel%enabled) then
353 message(1) = 'Octopus was compiled without CUDA support.'
354 call messages_fatal(1)
355 end if
356#endif
357
358 if (.not. accel_is_enabled()) then
359 pop_sub(accel_init)
360 return
361 end if
362
363 call messages_obsolete_variable(namespace, 'AccelPlatform')
364 call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform')
365
366 !%Variable AccelDevice
367 !%Type integer
368 !%Default 0
369 !%Section Execution::Accel
370 !%Description
371 !% This variable selects the GPU that Octopus will use. You can specify a
372 !% numerical id to select a specific device.
373 !%
374 !% In the case of MPI-enabled runs, devices are distributed in a round-robin fashion,
375 !% starting at this value.
376 !%End
377 call parse_variable(namespace, 'AccelDevice', 0, idevice)
378
379 call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice')
380
381 if (idevice < 0) then
382 call messages_write('Invalid AccelDevice')
383 call messages_fatal()
384 end if
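 ! Input-file sketch (illustrative, not part of the source): to pin a run to the second GPU
 ! of each node one would set
 !   DisableAccel = no
 !   AccelDevice = 1
 ! in the Octopus input file; with MPI, further ranks pick devices round robin from there.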
386 call messages_print_with_emphasis(msg="GPU acceleration", namespace=namespace)
388#ifdef HAVE_CUDA
389 if (idevice<0) idevice = 0
390 call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
391 idevice, base_grp%rank)
392#ifdef HAVE_MPI
393 call loct_sysname(sys_name)
394 write(message(1), '(A,I5,A,I5,2A)') "Rank ", base_grp%rank, " uses device number ", idevice, &
395 " on ", trim(sys_name)
396 call messages_info(1, all_nodes = .true.)
397#endif
398
399 call cublas_init(accel%cublas_handle, accel%cuda_stream)
400#endif
402
403 ! Get some device information that we will need later
404#ifdef HAVE_CUDA
405 call cuda_device_total_memory(accel%device%cuda_device, accel%global_memory_size)
406 call cuda_device_shared_memory(accel%device%cuda_device, accel%shared_memory_size)
407 call cuda_device_max_threads_per_block(accel%device%cuda_device, accel%max_workgroup_size)
408 call cuda_device_get_warpsize(accel%device%cuda_device, accel%warp_size)
409 call cuda_device_max_block_dim_x(accel%device%cuda_device, dim)
410 accel%max_block_dim(1) = int(dim, int64)
411 call cuda_device_max_block_dim_y(accel%device%cuda_device, dim)
412 accel%max_block_dim(2) = int(dim, int64)
413 call cuda_device_max_block_dim_z(accel%device%cuda_device, dim)
414 accel%max_block_dim(3) = int(dim, int64)
415 call cuda_device_max_grid_dim_x(accel%device%cuda_device, dim)
416 accel%max_grid_dim(1) = int(dim, int64)
417 call cuda_device_max_grid_dim_y(accel%device%cuda_device, dim)
418 accel%max_grid_dim(2) = int(dim, int64)
419 call cuda_device_max_grid_dim_z(accel%device%cuda_device, dim)
420 accel%max_grid_dim(3) = int(dim, int64)
421#endif
422
423 if (base_grp%is_root()) call device_info()
424
425 ! initialize the cache used to speed up allocations
426 call alloc_cache_init(memcache, nint(0.25_real64*accel%global_memory_size, int64))
427
428 ! now initialize the kernels
429 call accel_kernel_global_init()
430
431#if defined(HAVE_HIP)
432 accel%debug_flag = "-g"
433#elif defined(HAVE_CUDA)
434 accel%debug_flag = "-lineinfo"
435#endif
436
437 call accel_kernel_start_call(kernel_vpsi, 'vpsi.cu', "vpsi")
438 call accel_kernel_start_call(kernel_vpsi_complex, 'vpsi.cu', "vpsi_complex")
439 call accel_kernel_start_call(kernel_vpsi_spinors, 'vpsi.cu', "vpsi_spinors")
440 call accel_kernel_start_call(kernel_vpsi_spinors_complex, 'vpsi.cu', "vpsi_spinors_complex")
441 call accel_kernel_start_call(kernel_daxpy, 'axpy.cu', "daxpy", flags = '-DRTYPE_DOUBLE')
442 call accel_kernel_start_call(kernel_zaxpy, 'axpy.cu', "zaxpy", flags = '-DRTYPE_COMPLEX')
443 call accel_kernel_start_call(dkernel_batch_axpy, 'axpy.cu', "dbatch_axpy_function", &
444 flags = ' -DRTYPE_DOUBLE')
445 call accel_kernel_start_call(zkernel_batch_axpy, 'axpy.cu', "zbatch_axpy_function", &
446 flags = '-DRTYPE_COMPLEX')
447 call accel_kernel_start_call(dkernel_ax_function_py, 'axpy.cu', "dbatch_ax_function_py", &
448 flags = '-DRTYPE_DOUBLE')
449 call accel_kernel_start_call(zkernel_ax_function_py, 'axpy.cu', "zbatch_ax_function_py", &
450 flags = '-DRTYPE_COMPLEX')
451 call accel_kernel_start_call(dkernel_batch_dotp, 'mesh_batch_single.cu', "dbatch_mf_dotp")
452 call accel_kernel_start_call(zkernel_batch_dotp, 'mesh_batch_single.cu', "zbatch_mf_dotp")
453 call accel_kernel_start_call(dpack, 'pack.cu', "dpack")
454 call accel_kernel_start_call(zpack, 'pack.cu', "zpack")
455 call accel_kernel_start_call(dunpack, 'pack.cu', "dunpack")
456 call accel_kernel_start_call(zunpack, 'pack.cu', "zunpack")
457 call accel_kernel_start_call(kernel_copy, 'copy.cu', "copy")
458 call accel_kernel_start_call(kernel_copy_complex_to_real, 'copy.cu', "copy_complex_to_real")
459 call accel_kernel_start_call(kernel_copy_real_to_complex, 'copy.cu', "copy_real_to_complex")
460 call accel_kernel_start_call(kernel_ghost_reorder, 'ghost.cu', "ghost_reorder")
461 call accel_kernel_start_call(kernel_density_real, 'density.cu', "density_real")
462 call accel_kernel_start_call(kernel_density_complex, 'density.cu', "density_complex")
463 call accel_kernel_start_call(kernel_density_spinors, 'density.cu', "density_spinors")
464 call accel_kernel_start_call(kernel_phase, 'phase.cu', "phase")
465 call accel_kernel_start_call(dkernel_dot_matrix, 'mesh_batch.cu', "ddot_matrix")
466 call accel_kernel_start_call(zkernel_dot_matrix, 'mesh_batch.cu', "zdot_matrix")
467 call accel_kernel_start_call(zkernel_dot_matrix_spinors, 'mesh_batch.cu', "zdot_matrix_spinors")
468
469
470 call accel_kernel_start_call(dzmul, 'mul.cu', "dzmul", flags = '-DRTYPE_DOUBLE')
471 call accel_kernel_start_call(zzmul, 'mul.cu', "zzmul", flags = '-DRTYPE_COMPLEX')
472
473 ! Define global buffers
474 if(.not. accel_buffer_is_allocated(zm_0_buffer)) then
475 call accel_create_buffer(zm_0_buffer, accel_mem_read_only, type_cmplx, 1)
476 call accel_write_buffer(zm_0_buffer, m_z0)
477 end if
478 if(.not. accel_buffer_is_allocated(zm_1_buffer)) then
479 call accel_create_buffer(zm_1_buffer, accel_mem_read_only, type_cmplx, 1)
480 call accel_write_buffer(zm_1_buffer, m_z1)
481 end if
482 if(.not. accel_buffer_is_allocated(dm_0_buffer)) then
483 call accel_create_buffer(dm_0_buffer, accel_mem_read_only, type_float, 1)
484 call accel_write_buffer(dm_0_buffer, m_zero)
485 end if
486 if(.not. accel_buffer_is_allocated(dm_1_buffer)) then
487 call accel_create_buffer(dm_1_buffer, accel_mem_read_only, type_float, 1)
488 call accel_write_buffer(dm_1_buffer, m_one)
489 end if
490
491
492 !%Variable AccelBenchmark
493 !%Type logical
494 !%Default no
495 !%Section Execution::Accel
496 !%Description
497 !% If this variable is set to yes, Octopus will run some
498 !% routines to benchmark the performance of the accelerator device.
499 !%End
500 call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark)
501
502 call messages_obsolete_variable(namespace, 'OpenCLBenchmark', 'AccelBenchmark')
503
504 if (run_benchmark) then
505 call accel_check_bandwidth()
506 end if
507
508 !%Variable GPUAwareMPI
509 !%Type logical
510 !%Section Execution::Accel
511 !%Description
512 !% If Octopus was compiled with GPU and MPI support, and if the MPI
513 !% implementation is GPU-aware (i.e., it supports communication using device pointers),
514 !% this switch can be set to true to use the GPU-aware MPI features. The advantage
515 !% of this approach is that it can do, e.g., peer-to-peer copies between devices without
516 !% going through the host memory.
517 !% The default is false, except when the configure switch --enable-cudampi is set, in which
518 !% case this variable is set to true.
519 !%End
520#ifdef HAVE_CUDA_MPI
521 default = .true.
522#else
523 default = .false.
524#endif
525 call parse_variable(namespace, 'GPUAwareMPI', default, accel%cuda_mpi)
526 if (accel%cuda_mpi) then
527#ifndef HAVE_CUDA_MPI
528 call messages_write("Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")
529 call messages_warning()
530#endif
531 call messages_write("Using GPU-aware MPI.")
532 call messages_info()
533 end if
534
535
536 !%Variable AllowCPUonly
537 !%Type logical
538 !%Section Execution::Accel
539 !%Description
540 !% To prevent a waste of resources, the code will normally stop when the GPU is disabled due to
541 !% incomplete implementations or incompatibilities. Setting AllowCPUonly = yes overrides this and
542 !% allows the code to run on the CPU in these cases.
543 !%End
544#if defined (HAVE_ACCEL)
545 default = .false.
546#else
547 default = .true.
548#endif
549 call parse_variable(namespace, 'AllowCPUonly', default, accel%allow_CPU_only)
550
551
552 !%Variable InitializeGPUBuffers
553 !%Type integer
554 !%Default no
555 !%Section Execution::Accel
556 !%Description
557 !% Initialize new GPU buffers to zero on creation (use only for debugging, as it has a performance impact!).
558 !%Option no 0
559 !% Do not initialize GPU buffers.
560 !%Option yes 1
561 !% Initialize GPU buffers to zero.
562 !%Option nan 2
563 !% Initialize GPU buffers to NaN.
564 !%End
565 call parse_variable(namespace, 'InitializeGPUBuffers', option__initializegpubuffers__no, accel%initialize_buffers)
566 if (.not. varinfo_valid_option('InitializeGPUBuffers', accel%initialize_buffers)) then
567 call messages_input_error(namespace, 'InitializeGPUBuffers')
568 end if
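 ! Input-file sketch (illustrative): when chasing reads of uninitialized device memory one
 ! might set
 !   InitializeGPUBuffers = nan
 ! so that any use of an unwritten buffer shows up as NaNs in the output.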
569
570
571 call messages_print_with_emphasis(namespace=namespace)
572
573 pop_sub(accel_init)
574
575 contains
576
577 subroutine device_info()
578#ifdef HAVE_CUDA
579 integer :: version
580 character(kind=c_char) :: cval_str(257)
581#endif
582 integer :: major, minor
583 character(len=256) :: val_str
584
585 push_sub(accel_init.device_info)
586
587 call messages_new_line()
588 call messages_write('Selected device:')
589 call messages_new_line()
590
591#ifdef HAVE_CUDA
592#ifdef __HIP_PLATFORM_AMD__
593 call messages_write(' Framework : ROCm')
594#else
595 call messages_write(' Framework : CUDA')
596#endif
597#endif
598 call messages_info()
599
600#ifdef HAVE_CUDA
601 call messages_write(' Device type : GPU', new_line = .true.)
602#ifdef __HIP_PLATFORM_AMD__
603 call messages_write(' Device vendor : AMD Corporation', new_line = .true.)
604#else
605 call messages_write(' Device vendor : NVIDIA Corporation', new_line = .true.)
606#endif
607#endif
608
609#ifdef HAVE_CUDA
610 cval_str = c_null_char
611 call cuda_device_name(accel%device%cuda_device, cval_str)
612 call string_c_to_f(cval_str, val_str)
613#endif
614 call messages_write(' Device name : '//trim(val_str))
615 call messages_new_line()
616
617#ifdef HAVE_CUDA
618 call cuda_device_capability(accel%device%cuda_device, major, minor)
619#endif
620 call messages_write(' Cuda capabilities :')
621 call messages_write(major, fmt = '(i2)')
622 call messages_write('.')
623 call messages_write(minor, fmt = '(i1)')
624 call messages_new_line()
625
626 ! VERSION
627#ifdef HAVE_CUDA
628 call cuda_driver_version(version)
629 call messages_write(' Driver version : ')
630 call messages_write(version)
631#endif
632 call messages_new_line()
633
634
635 call messages_write(' Device memory :')
636 call messages_write(accel%global_memory_size, units=unit_megabytes)
637 call messages_new_line()
638
639 call messages_write(' Shared memory :')
640 call messages_write(accel%shared_memory_size, units=unit_kilobytes)
641 call messages_new_line()
642
643 call messages_write(' Max. group/block size :')
644 call messages_write(accel%max_workgroup_size)
645 call messages_new_line()
646
647 call messages_info()
648
649 pop_sub(accel_init.device_info)
650 end subroutine device_info
651
652 end subroutine accel_init
653
654 ! ------------------------------------------
655 subroutine accel_end(namespace)
656 type(namespace_t), intent(in) :: namespace
657
658 integer(int64) :: hits, misses
659 real(real64) :: volume_hits, volume_misses
660 logical :: found
661 type(accel_mem_t) :: tmp
662
663 push_sub(accel_end)
664
665 if (accel_is_enabled()) then
666
667 ! Release global buffers
668 call accel_release_buffer(zm_0_buffer)
669 call accel_release_buffer(zm_1_buffer)
670 call accel_release_buffer(dm_0_buffer)
671 call accel_release_buffer(dm_1_buffer)
672
673 do
674 call alloc_cache_get(memcache, alloc_cache_any_size, found, tmp%mem)
675 if (.not. found) exit
676
677#ifdef HAVE_CUDA
678 call cuda_mem_free(tmp%mem)
679#endif
680 end do
681
682 call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)
683
684 call messages_print_with_emphasis(msg="Acceleration-device allocation cache", namespace=namespace)
685
686 call messages_new_line()
687 call messages_write(' Number of allocations =')
688 call messages_write(hits + misses, new_line = .true.)
689 call messages_write(' Volume of allocations =')
690 call messages_write(volume_hits + volume_misses, fmt = 'f18.1', units = unit_gigabytes, align_left = .true., &
691 new_line = .true.)
692 call messages_write(' Hit ratio =')
693 if (hits + misses > 0) then
694 call messages_write(hits/real(hits + misses, real64)*100, fmt='(f6.1)', align_left = .true.)
695 else
696 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
697 end if
698 call messages_write('%', new_line = .true.)
699 call messages_write(' Volume hit ratio =')
700 if (volume_hits + volume_misses > 0) then
701 call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt='(f6.1)', align_left = .true.)
702 else
703 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
704 end if
705 call messages_write('%')
706 call messages_new_line()
707 call messages_info()
708
709 call messages_print_with_emphasis(namespace=namespace)
710 end if
711
712 call accel_kernel_global_end()
713
714 if (accel_is_enabled()) then
715#ifdef HAVE_CUDA
716 call cublas_end(accel%cublas_handle)
717 if (.not. accel%cuda_mpi) then ! CUDA aware MPI finalize will do the cleanup
718 call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
719 end if
720#endif
721
722 if (buffer_alloc_count /= 0) then
723 call messages_write('Accel:')
724 call messages_write(real(allocated_mem, real64) , fmt = 'f12.1', units = unit_megabytes, align_left = .true.)
725 call messages_write(' in ')
726 call messages_write(buffer_alloc_count)
727 call messages_write(' buffers were not deallocated.')
728 call messages_fatal()
729 end if
730
731 end if
732
733 pop_sub(accel_end)
734 end subroutine accel_end
735
736 ! ------------------------------------------
737
738 integer(int64) function accel_padded_size_i8(nn) result(psize)
739 integer(int64), intent(in) :: nn
740
741 integer(int64) :: modnn, bsize
742
743 psize = nn
744
745 if (accel_is_enabled()) then
746
748
749 psize = nn
750 modnn = mod(nn, bsize)
751 if (modnn /= 0) psize = psize + bsize - modnn
752
753 end if
754
755 end function accel_padded_size_i8
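 ! Worked example (illustrative, assuming a padding block size bsize = 32):
 ! nn = 1000 gives modnn = 8, so psize = 1000 + 32 - 8 = 1024, the next multiple of 32;
 ! an nn that is already a multiple of bsize is returned unchanged.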
756
757 ! ------------------------------------------
758
759 integer(int32) function accel_padded_size_i4(nn) result(psize)
760 integer(int32), intent(in) :: nn
761
762 psize = int(accel_padded_size_i8(int(nn, int64)), int32)
763
764 end function accel_padded_size_i4
765
766 ! ------------------------------------------
767
768 subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
769 type(accel_mem_t), intent(inout) :: this
770 integer, intent(in) :: flags
771 type(type_t), intent(in) :: type
772 integer, intent(in) :: size
773 logical, optional, intent(in) :: set_zero
774 logical, optional, intent(in) :: async
775
776 call accel_create_buffer_8(this, flags, type, int(size, int64), set_zero, async)
777 end subroutine accel_create_buffer_4
778
779 ! ------------------------------------------
780
781 subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
782 type(accel_mem_t), intent(inout) :: this
783 integer, intent(in) :: flags
784 type(type_t), intent(in) :: type
785 integer(int64), intent(in) :: size
786 logical, optional, intent(in) :: set_zero
787 logical, optional, intent(in) :: async
788
789 integer(int64) :: fsize
790 logical :: found
791 integer(int64) :: initialize_buffers
792
793 push_sub(accel_create_buffer_8)
794
795 this%type = type
796 this%size = size
797 this%flags = flags
798 fsize = int(size, int64)*types_get_size(type)
799 this%allocated = .true.
801 if (fsize > 0) then
802
803 call alloc_cache_get(memcache, fsize, found, this%mem)
804
805 if (.not. found) then
806#ifdef HAVE_CUDA
807 if(optional_default(async, .false.)) then
808 call cuda_mem_alloc_async(this%mem, fsize)
809 else
810 call cuda_mem_alloc(this%mem, fsize)
811 end if
812#endif
813 end if
814
815 buffer_alloc_count = buffer_alloc_count + 1
816 allocated_mem = allocated_mem + fsize
817
818 end if
819
820 if (present(set_zero)) then
821 initialize_buffers = merge(option__initializegpubuffers__yes, option__initializegpubuffers__no, set_zero)
822 else
823 initialize_buffers = accel%initialize_buffers
824 end if
825 select case (initialize_buffers)
826 case (option__initializegpubuffers__yes)
827 call accel_set_buffer_to(this, type, int(z'00', int8), size)
828 case (option__initializegpubuffers__nan)
829 call accel_set_buffer_to(this, type, int(z'FF', int8), size)
830 end select
831
832 pop_sub(accel_create_buffer_8)
833 end subroutine accel_create_buffer_8
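 ! Usage sketch (illustrative; buf and data are hypothetical caller variables):
 !   type(accel_mem_t) :: buf
 !   real(real64) :: data(1000)
 !   call accel_create_buffer(buf, ACCEL_MEM_READ_ONLY, TYPE_FLOAT, size(data))
 !   call accel_write_buffer(buf, size(data), data)
 !   ! ... use buf as a kernel argument ...
 !   call accel_release_buffer(buf)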
834
835 ! ------------------------------------------
836
837 subroutine accel_release_buffer(this, async)
838 type(accel_mem_t), intent(inout) :: this
839 logical, optional, intent(in) :: async
840
841 logical :: put
842 integer(int64) :: fsize
843
844 push_sub(accel_release_buffer)
845
846 if (this%size > 0) then
847
848 fsize = int(this%size, int64)*types_get_size(this%type)
849
850 call alloc_cache_put(memcache, fsize, this%mem, put)
851
852 if (.not. put) then
853#ifdef HAVE_CUDA
854 if (optional_default(async, .false.)) then
855 call cuda_mem_free_async(this%mem)
856 else
857 call cuda_mem_free(this%mem)
858 end if
859#endif
860 end if
861
862 buffer_alloc_count = buffer_alloc_count - 1
863 allocated_mem = allocated_mem - fsize
864
865 end if
866
867 this%size = 0
868 this%flags = 0
869
870 this%allocated = .false.
871
872 pop_sub(accel_release_buffer)
873 end subroutine accel_release_buffer
874
875 ! ------------------------------------------------------
876
877 ! Check whether the temporary buffer is the right size; if not, reallocate it
878 subroutine accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
879 type(accel_mem_t), intent(inout) :: buffer
880 integer, intent(in) :: flags
881 type(type_t), intent(in) :: type
882 integer, intent(in) :: required_size
883 logical, intent(in) :: set_zero
884 logical, optional, intent(in) :: async
885
887
888
889 if (accel_buffer_is_allocated(buffer) .and. buffer%size < required_size) then
890 call accel_release_buffer(buffer, async=optional_default(async, .false.))
891 end if
892
893 if (.not. accel_buffer_is_allocated(buffer)) then
894 call accel_create_buffer(buffer, flags, type, required_size, set_zero=set_zero, async=optional_default(async, .false.))
895 end if
896
898 end subroutine accel_ensure_buffer_size
899
900 ! ------------------------------------------
901
902 logical pure function accel_buffer_is_allocated(this) result(allocated)
903 type(accel_mem_t), intent(in) :: this
904
905 allocated = this%allocated
906 end function accel_buffer_is_allocated
907
908 ! -----------------------------------------
909
910 subroutine accel_finish()
911 ! no push_sub, called too frequently
912
913 if (accel_is_enabled()) then
914#ifdef HAVE_CUDA
916#endif
917 end if
918 end subroutine accel_finish
920 ! ------------------------------------------
921
922 subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
923 type(accel_kernel_t), intent(inout) :: kernel
924 integer, intent(in) :: narg
925 type(accel_mem_t), intent(in) :: buffer
926
927 assert(accel_buffer_is_allocated(buffer))
928
929 ! no push_sub, called too frequently
930#ifdef HAVE_CUDA
931 call cuda_kernel_set_arg_buffer(kernel%arguments, buffer%mem, narg)
932#endif
933
934 end subroutine accel_set_kernel_arg_buffer
935
936 ! ------------------------------------------
937
944 subroutine accel_kernel_run_8(kernel, globalsizes, localsizes, shared_memory_size)
945 type(accel_kernel_t), intent(inout) :: kernel
946 integer(int64), intent(in) :: globalsizes(:)
947 integer(int64), intent(in) :: localsizes(:)
948 integer(int64), optional, intent(in) :: shared_memory_size
949
950 integer :: dim
951 integer(int64) :: gsizes(1:3)
952 integer(int64) :: lsizes(1:3)
953
954 ! no push_sub, called too frequently
955
956 ! CUDA needs all dimensions
957 gsizes = 1
958 lsizes = 1
959
960 dim = ubound(globalsizes, dim=1)
961
962 assert(dim == ubound(localsizes, dim=1))
964 ! if one size is zero, there is nothing to do
965 if (any(globalsizes == 0)) return
966
967 assert(all(localsizes > 0))
968 assert(all(localsizes <= accel_max_workgroup_size()))
969 assert(all(mod(globalsizes, localsizes) == 0))
970
971 gsizes(1:dim) = globalsizes(1:dim)
972 lsizes(1:dim) = localsizes(1:dim)
973
974#ifdef HAVE_CUDA
975 ! Maximum dimension of a block
976 if (any(lsizes(1:3) > accel%max_block_dim(1:3))) then
977 message(1) = "Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
978 message(2) = "The following conditions should be fulfilled:"
979 write(message(3), "(A, I8, A, I8)") "Dim 1: ", lsizes(1), " <= ", accel%max_block_dim(1)
980 write(message(4), "(A, I8, A, I8)") "Dim 2: ", lsizes(2), " <= ", accel%max_block_dim(2)
981 write(message(5), "(A, I8, A, I8)") "Dim 3: ", lsizes(3), " <= ", accel%max_block_dim(3)
982 message(6) = "This is an internal error, please contact the developers."
983 call messages_fatal(6)
984 end if
986
987 ! Maximum number of threads per block
988 if (product(lsizes) > accel_max_workgroup_size()) then
989 message(1) = "Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
990 message(2) = "The following condition should be fulfilled:"
991 write(message(3), "(I8, A, I8)") product(lsizes), " <= ", accel_max_workgroup_size()
992 message(4) = "This is an internal error, please contact the developers."
993 call messages_fatal(4)
994 end if
995
996 gsizes(1:3) = gsizes(1:3)/lsizes(1:3)
997
998 ! Maximum dimensions of the grid of thread blocks
999 if (any(gsizes(1:3) > accel%max_grid_dim(1:3))) then
1000 message(1) = "Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
1001 message(2) = "The following conditions should be fulfilled:"
1002 write(message(3), "(A, I8, A, I10)") "Dim 1: ", gsizes(1), " <= ", accel%max_grid_dim(1)
1003 write(message(4), "(A, I8, A, I10)") "Dim 2: ", gsizes(2), " <= ", accel%max_grid_dim(2)
1004 write(message(5), "(A, I8, A, I10)") "Dim 3: ", gsizes(3), " <= ", accel%max_grid_dim(3)
1005 message(6) = "This is an internal error, please contact the developers."
1006 call messages_fatal(6)
1007 end if
1008
1009 if(present(shared_memory_size)) then
1010
1011 if (shared_memory_size > accel%shared_memory_size) then
1012 message(1) = "Shared memory too large in kernel "//trim(kernel%kernel_name)
1013 message(2) = "The following conditions should be fulfilled:"
1014 message(3) = "Requested shared memory <= Available shared memory"
1015 write(message(4), '(a,f12.6,a)') "Requested shared memory: ", real(shared_memory_size, real64) /1024.0, " KiB"
1016 write(message(5), '(a,f12.6,a)') "Available shared memory: ", real(accel%shared_memory_size, real64) /1024.0, " KiB"
1017 message(6) = "This is an internal error, please contact the developers."
1018 call messages_fatal(6)
1019 else if (shared_memory_size <= 0) then
1020 message(1) = "Invalid shared memory size in kernel "//trim(kernel%kernel_name)
1021 write(message(2), '(a,i10)') "Shared memory size requested: ", shared_memory_size
1022 message(3) = "This is an internal error, please contact the developers."
1023 call messages_fatal(3)
1024 end if
1026 kernel%cuda_shared_mem = shared_memory_size
1027 else
1028 kernel%cuda_shared_mem = 0
1029 end if
1030
1031 call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), lsizes(1), kernel%cuda_shared_mem, kernel%arguments)
1032
1033 kernel%cuda_shared_mem = 0
1034#endif
1035
1036 end subroutine accel_kernel_run_8
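 ! Launch sketch (illustrative; np, wgsize, gsize and psi_buffer are hypothetical caller
 ! variables, and the scalar argument relies on the accel_set_kernel_arg interface):
 !   wgsize = accel_kernel_workgroup_size(kernel_vpsi)
 !   gsize = wgsize*((np + wgsize - 1)/wgsize)   ! round np up to a multiple of wgsize
 !   call accel_set_kernel_arg(kernel_vpsi, 0, np)
 !   call accel_set_kernel_arg_buffer(kernel_vpsi, 1, psi_buffer)
 !   call accel_kernel_run(kernel_vpsi, (/gsize/), (/wgsize/))
 !   call accel_finish()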
1038 ! -----------------------------------------------
1039
1045
1046 subroutine accel_kernel_run_4(kernel, globalsizes, localsizes, shared_memory_size)
1047 type(accel_kernel_t), intent(inout) :: kernel
1048 integer, intent(in) :: globalsizes(:)
1049 integer, intent(in) :: localsizes(:)
1050 integer(int64), optional, intent(in) :: shared_memory_size
1051
1052 call accel_kernel_run_8(kernel, int(globalsizes, int64), int(localsizes, int64), shared_memory_size)
1053
1054 end subroutine accel_kernel_run_4
1055
1056 ! -----------------------------------------------
1057
1058 integer pure function accel_max_workgroup_size() result(max_workgroup_size)
1059 max_workgroup_size = accel%max_workgroup_size
1060 end function accel_max_workgroup_size
1061
1062 ! -----------------------------------------------
1063
1064 integer function accel_kernel_workgroup_size(kernel) result(workgroup_size)
1065 type(accel_kernel_t), intent(inout) :: kernel
1066
1067#ifdef HAVE_CUDA
1068 integer :: max_workgroup_size
1069#endif
1070
1071 workgroup_size = 0
1072
1073#ifdef HAVE_CUDA
1074 call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_workgroup_size)
1075 if (debug%info .and. max_workgroup_size /= accel%max_workgroup_size) then
1076 write(message(1), "(A, I5, A, I5, A)") "This kernel can use fewer threads per block (", max_workgroup_size, &
1077 ") than available on the device (", accel%max_workgroup_size, ")"
1078 call messages_info(1)
1079 end if
1080 ! recommended number of threads per block is 256 according to the CUDA best practice guide
1081 ! see https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#thread-and-block-heuristics
1082 workgroup_size = 256
1083 ! make sure we do not use more threads per block than available for this kernel
1084 workgroup_size = min(workgroup_size, max_workgroup_size)
1085#endif
1086
1087 end function accel_kernel_workgroup_size
1088
1089 ! ----------------------------------------------------
1090
1091 subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
1092 type(accel_mem_t), intent(inout) :: buffer
1093 type(type_t), intent(in) :: type
1094 integer(int8), intent(in) :: val
1095 integer(int64), intent(in) :: nval
1096 integer(int64), optional, intent(in) :: offset
1097 logical, optional, intent(in) :: async
1098
1099 integer(int64) :: nval_, offset_, type_size
1100
1101 push_sub(accel_set_buffer_to)
1102
1103 if (nval == 0) then
1104 pop_sub(accel_set_buffer_to)
1105 return
1106 end if
1107 assert(nval > 0)
1108
1109 if (present(offset)) then
1110 assert(offset >= 0)
1111 if(offset > buffer%size) then
1112 pop_sub(accel_set_buffer_to)
1113 return
1114 end if
1115 end if
1116
1117 type_size = types_get_size(type)
1118
1119 nval_ = nval*type_size
1120
1121 offset_ = 0_int64
1122 if (present(offset)) offset_ = offset*type_size
1123
1124 call cuda_mem_set_async(buffer%mem, val, nval_, offset_)
1125 if(.not. optional_default(async, .false.)) call accel_finish()
1126
1127 pop_sub(accel_set_buffer_to)
1128 end subroutine accel_set_buffer_to
1129
1130 ! ----------------------------------------------------
1131
1132 subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
1133 type(accel_mem_t), intent(inout) :: buffer
1134 type(type_t), intent(in) :: type
1135 integer(int64), intent(in) :: nval
1136 integer(int64), optional, intent(in) :: offset
1137 logical, optional, intent(in) :: async
1138
1140
1141 call accel_set_buffer_to(buffer, type, int(z'00', int8), nval, offset, async)
1142
1144 end subroutine accel_set_buffer_to_zero_i8
1145
1146 ! ----------------------------------------------------
1147
1148 subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
1149 type(accel_mem_t), intent(inout) :: buffer
1150 type(type_t), intent(in) :: type
1151 integer(int32), intent(in) :: nval
1152 integer(int32), optional, intent(in) :: offset
1153 logical, optional, intent(in) :: async
1154
1156
1157 if (present(offset)) then
1158 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), int(offset, int64), async=async)
1159 else
1160 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), async=async)
1161 end if
1162
1164 end subroutine accel_set_buffer_to_zero_i4
1165
1166 ! ----------------------------------------------------
1167
1168 subroutine accel_check_bandwidth()
1169 integer :: itime
1170 integer, parameter :: times = 10
1171 integer :: size
1172 real(real64) :: time, stime
1173 real(real64) :: read_bw, write_bw
1174 type(accel_mem_t) :: buff
1175 real(real64), allocatable :: data(:)
1176
1177 call messages_new_line()
1178 call messages_write('Info: Benchmarking the bandwidth between main memory and device memory')
1179 call messages_new_line()
1180 call messages_info()
1181
1182 call messages_write(' Buffer size Read bw Write bw')
1183 call messages_new_line()
1184 call messages_write(' [MiB] [MiB/s] [MiB/s]')
1185 call messages_info()
1186
1187 size = 15000
1188 do
1189 safe_allocate(data(1:size))
1190 call accel_create_buffer(buff, accel_mem_read_write, type_float, size)
1191
1192 stime = loct_clock()
1193 do itime = 1, times
1194 call accel_write_buffer(buff, size, data)
1195 call accel_finish()
1196 end do
1197 time = (loct_clock() - stime)/real(times, real64)
1198
1199 write_bw = real(size, real64) *8.0_real64/time
1200
1201 stime = loct_clock()
1202 do itime = 1, times
1203 call accel_read_buffer(buff, size, data)
1204 end do
1205 call accel_finish()
1206
1207 time = (loct_clock() - stime)/real(times, real64)
1208 read_bw = real(size, real64) *8.0_real64/time
1209
1210 call messages_write(size*8.0_real64/1024.0_real64**2)
1211 call messages_write(write_bw/1024.0_real64**2, fmt = '(f10.1)')
1212 call messages_write(read_bw/1024.0_real64**2, fmt = '(f10.1)')
1213 call messages_info()
1214
1215 call accel_release_buffer(buff)
1216
1217 safe_deallocate_a(data)
1218
1219 size = int(size*2.0)
1220
1221 if (size > 50000000) exit
1222 end do
1223 end subroutine accel_check_bandwidth
1224
1225 !------------------------------------------------------------
1227 subroutine accel_kernel_global_init()
1228
1229 push_sub(accel_kernel_global_init)
1230
1231 nullify(head)
1232
1233 call cuda_module_map_init(accel%module_map)
1234
1235 pop_sub(accel_kernel_global_init)
1236 end subroutine accel_kernel_global_init
1237
1238 !------------------------------------------------------------
1239
1240 subroutine accel_kernel_global_end()
1241 type(accel_kernel_t), pointer :: next_head
1242
1243 push_sub(accel_kernel_global_end)
1244
1245 do
1246 if (.not. associated(head)) exit
1247 next_head => head%next
1248 call accel_kernel_end(head)
1249 head => next_head
1250 end do
1251
1252 if (accel_is_enabled()) then
1253 call cuda_module_map_end(accel%module_map)
1254 end if
1255
1256 pop_sub(accel_kernel_global_end)
1257 end subroutine accel_kernel_global_end
1258
1259 !------------------------------------------------------------
1260
1261 subroutine accel_kernel_build(this, file_name, kernel_name, flags)
1262 type(accel_kernel_t), intent(inout) :: this
1263 character(len=*), intent(in) :: file_name
1264 character(len=*), intent(in) :: kernel_name
1265 character(len=*), optional, intent(in) :: flags
1266
1267#ifdef HAVE_CUDA
1268 character(len=1000) :: all_flags
1269#endif
1270
1271 push_sub(accel_kernel_build)
1272
1273 call profiling_in("ACCEL_COMPILE", exclude = .true.)
1274
1275#ifdef HAVE_CUDA
1276 all_flags = '-I'//trim(conf%share)//'/kernels/'//" "//trim(accel%debug_flag)
1277
1278 if (present(flags)) then
1279 all_flags = trim(all_flags)//' '//trim(flags)
1280 end if
1281
1282 call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, &
1283 string_f_to_c(trim(file_name)), string_f_to_c(trim(all_flags)))
1284
1285 call cuda_create_kernel(this%cuda_kernel, this%cuda_module, string_f_to_c(trim(kernel_name)))
1286 call cuda_alloc_arg_array(this%arguments)
1288 this%cuda_shared_mem = 0
1289#endif
1290
1291 this%initialized = .true.
1292 this%kernel_name = trim(kernel_name)
1293
1294 call profiling_out("ACCEL_COMPILE")
1296 pop_sub(accel_kernel_build)
1297 end subroutine accel_kernel_build
1298
1299 !------------------------------------------------------------
1300
1301 subroutine accel_kernel_end(this)
1302 type(accel_kernel_t), intent(inout) :: this
1303
1304 push_sub(accel_kernel_end)
1305
1306#ifdef HAVE_CUDA
1307 call cuda_free_arg_array(this%arguments)
1308 call cuda_release_kernel(this%cuda_kernel)
1309 ! modules are not released here, since they are not tied to a single kernel
1310#endif
1311
1312 this%initialized = .false.
1313
1314 pop_sub(accel_kernel_end)
1315 end subroutine accel_kernel_end
1316
1317 !------------------------------------------------------------
1318
1319 subroutine accel_kernel_start_call(this, file_name, kernel_name, flags)
1320 type(accel_kernel_t), target, intent(inout) :: this
1321 character(len=*), intent(in) :: file_name
1322 character(len=*), intent(in) :: kernel_name
1323 character(len=*), optional, intent(in) :: flags
1324
1325 push_sub(accel_kernel_start_call)
1326
1327 if (.not. this%initialized) then
1328 call accel_kernel_build(this, file_name, kernel_name, flags)
1329 this%next => head
1330 head => this
1331 end if
1332
1333 pop_sub(accel_kernel_start_call)
1334 end subroutine accel_kernel_start_call
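 ! Usage sketch (illustrative; kernel_my_op and my_op.cu are hypothetical):
 !   type(accel_kernel_t), save, target :: kernel_my_op
 !   call accel_kernel_start_call(kernel_my_op, 'my_op.cu', "my_op", flags = '-DRTYPE_DOUBLE')
 ! The first call builds the kernel and prepends it to the global list; later calls are
 ! no-ops, so it is cheap to invoke this right before every launch.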
1335
1336 !--------------------------------------------------------------
1337
1338 integer(int64) pure function accel_global_memory_size() result(size)
1339
1340 size = accel%global_memory_size
1341
1342 end function accel_global_memory_size
1343
1344 !--------------------------------------------------------------
1345
1346 integer(int64) pure function accel_shared_memory_size() result(size)
1348 size = accel%shared_memory_size
1349
1350 end function accel_shared_memory_size
1351 !--------------------------------------------------------------
1352
1353 integer pure function accel_max_size_per_dim(dim) result(size)
1354 integer, intent(in) :: dim
1355
1356 size = 0
1357#ifdef HAVE_CUDA
1358 size = 32768
1359 if (dim == 1) size = 2**30
1360#endif
1361 end function accel_max_size_per_dim
1362
1363 ! ------------------------------------------------------
1364
1365 subroutine accel_set_stream(stream_number)
1366 integer, intent(in) :: stream_number
1367
1368 push_sub(accel_set_stream)
1369
1370 if (accel_is_enabled()) then
1371#ifdef HAVE_CUDA
1372 call cuda_set_stream(accel%cuda_stream, stream_number)
1373 call cublas_set_stream(accel%cublas_handle, accel%cuda_stream)
1374#endif
1375 end if
1376
1377 pop_sub(accel_set_stream)
1378 end subroutine accel_set_stream
1379
1380 ! ------------------------------------------------------
1381
1382 subroutine accel_get_stream(stream_number)
1383 integer, intent(inout) :: stream_number
1384
1385 push_sub(accel_get_stream)
1386
1387 if (accel_is_enabled()) then
1388#ifdef HAVE_CUDA
1389 call cuda_get_stream(stream_number)
1390#endif
1391 end if
1392
1393 pop_sub(accel_get_stream)
1394 end subroutine accel_get_stream
1395
1396 ! ------------------------------------------------------
1397
1400 subroutine accel_synchronize_all_streams()
1401 if (accel_is_enabled()) then
1402#ifdef HAVE_CUDA
1403 call cuda_synchronize_all_streams()
1404#endif
1405 end if
1406
1408 end subroutine accel_synchronize_all_streams
1409
1410 function daccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
1411 type(c_ptr), intent(in) :: buffer
1412 integer(int64), intent(in) :: offset
1413 type(c_ptr) :: buffer_offset
1414
1416#ifdef HAVE_CUDA
1417 call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)
1418#else
1419 ! this is needed to make the compiler happy for non-GPU compilations
1420 buffer_offset = buffer
1421#endif
1423 end function daccel_get_pointer_with_offset
1424
1425 function zaccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
1426 type(c_ptr), intent(in) :: buffer
1427 integer(int64), intent(in) :: offset
1428 type(c_ptr) :: buffer_offset
1429
1431#ifdef HAVE_CUDA
1432 call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)
1433#else
1434 ! this is needed to make the compiler happy for non-GPU compilations
1435 buffer_offset = buffer
1436#endif
1438 end function zaccel_get_pointer_with_offset
1439
1440 subroutine accel_clean_pointer(buffer)
1441 type(c_ptr), intent(in) :: buffer
1442
1443 push_sub(accel_clean_pointer)
1444#ifdef HAVE_CUDA
1445 call cuda_clean_pointer(buffer)
1446#endif
1447 pop_sub(accel_clean_pointer)
1448 end subroutine accel_clean_pointer
1449
1453 subroutine accel_get_unfolded_size(size, grid_size, thread_block_size)
1454 integer(int64), intent(in) :: size
1455 integer(int64), intent(out) :: grid_size
1456 integer(int64), intent(out) :: thread_block_size
1457
1458 push_sub(accel_get_unfolded_size)
1459#ifdef __HIP_PLATFORM_AMD__
1460 ! not beneficial for AMD chips
1461 grid_size = size
1462 thread_block_size = size
1463#else
1464 grid_size = size * accel%warp_size
1465 thread_block_size = accel%warp_size
1466#endif
1467 pop_sub(accel_get_unfolded_size)
1468 end subroutine accel_get_unfolded_size
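 ! Worked example (illustrative): on a CUDA device with warp_size = 32, size = 1000 gives
 ! grid_size = 32000 and thread_block_size = 32, i.e. one warp per original element; on
 ! AMD/HIP builds both values are simply set to size.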
1469
1470#include "undef.F90"
1471#include "real.F90"
1472#include "accel_inc.F90"
1473
1474#include "undef.F90"
1475#include "complex.F90"
1476#include "accel_inc.F90"
1477
1478#include "undef.F90"
1479#include "integer.F90"
1480#include "accel_inc.F90"
1481
1482#include "undef.F90"
1483#include "integer8.F90"
1484#include "accel_inc.F90"
1485
1486end module accel_oct_m
1488!! Local Variables:
1489!! mode: f90
1490!! coding: utf-8
1491!! End:
Definition: accel.F90:1641
subroutine daccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2396
subroutine laccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:5777
subroutine iaccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:4155
subroutine laccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:4927
subroutine laccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:4846
subroutine daccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:1836
subroutine laccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:5023
subroutine zaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3469
type(c_ptr) function, public zaccel_get_pointer_with_offset(buffer, offset)
Definition: accel.F90:1375
subroutine iaccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:4465
subroutine laccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:5467
subroutine daccel_write_buffer_single(this, data, async)
Definition: accel.F90:1488
subroutine daccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:1718
integer function, public accel_kernel_workgroup_size(kernel)
Definition: accel.F90:1044
subroutine zaccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:2596
type(accel_t), public accel
Definition: accel.F90:248
subroutine laccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:5660
subroutine iaccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:4264
integer(int64) pure function, public accel_shared_memory_size()
Definition: accel.F90:1296
subroutine iaccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:4502
subroutine daccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:2142
subroutine iaccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:4236
subroutine iaccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3691
subroutine zaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3536
subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
Definition: accel.F90:823
subroutine laccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:5232
subroutine daccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:2241
subroutine daccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:2209
subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
Definition: accel.F90:964
integer pure function, public accel_max_workgroup_size()
Definition: accel.F90:1038
subroutine laccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
Definition: accel.F90:5332
subroutine zaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:3237
subroutine zaccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
Definition: accel.F90:2764
subroutine daccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2374
type(accel_kernel_t), pointer head
Definition: accel.F90:386
subroutine zaccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
Definition: accel.F90:3304
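The entries above make up the device-buffer API of accel.F90. The following is a minimal, hedged usage sketch, not taken from the source: it assumes the specific procedures listed here are reachable through generic interfaces named accel_create_buffer, accel_write_buffer, accel_read_buffer and accel_release_buffer (the naming convention of this module), and that accel_mem_t, ACCEL_MEM_READ_WRITE, type_float and accel_is_enabled are public as the listing suggests.
 ! Hedged sketch: copy a host array to the device and back.
 ! The generic interface names are assumptions; the buffer size is passed
 ! as int64 to match accel_create_buffer_8, while the default-integer
 ! counts resolve to the *_int32 write/read specifics listed above.
 subroutine buffer_roundtrip_example()
   use accel_oct_m
   use types_oct_m
   use, intrinsic :: iso_fortran_env, only: real64, int64
   implicit none

   integer, parameter :: nn = 128
   real(real64) :: host_in(nn), host_out(nn)
   type(accel_mem_t) :: buf

   if (.not. accel_is_enabled()) return   ! CPU-only build: nothing to do

   host_in = 1.0_real64

   call accel_create_buffer(buf, ACCEL_MEM_READ_WRITE, type_float, int(nn, int64))
   call accel_write_buffer(buf, nn, host_in)    ! host -> device
   call accel_read_buffer(buf, nn, host_out)    ! device -> host
   call accel_release_buffer(buf)
 end subroutine buffer_roundtrip_example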
subroutine, public alloc_cache_put(alloc_cache, size, loc, put)
subroutine, public alloc_cache_get(alloc_cache, size, found, loc)
integer(int64), parameter, public alloc_cache_any_size
real(real64), parameter, public m_zero
Definition: global.F90:191
complex(real64), parameter, public m_z0
Definition: global.F90:201
complex(real64), parameter, public m_z1
Definition: global.F90:202
real(real64), parameter, public m_one
Definition: global.F90:192
System information (time, memory, sysname)
Definition: loct.F90:117
subroutine string_c_to_f(c_string, f_string)
Convert a C string to a Fortran string.
Definition: loct.F90:258
subroutine, public loct_sysname(name)
Definition: loct.F90:332
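A small, hedged example of the loct wrappers listed above, assuming the module name loct_oct_m from the use list of accel.F90 and a 256-character buffer (the required length is not stated here):
 ! Hedged sketch: query the host system name through loct_sysname.
 subroutine sysname_example()
   use loct_oct_m
   implicit none
   character(len=256) :: sysname
   call loct_sysname(sysname)
   print *, 'Running on: ', trim(sysname)
 end subroutine sysname_example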
This module is intended to contain "only mathematical" functions and procedures.
Definition: math.F90:117
subroutine, public messages_print_with_emphasis(msg, iunit, namespace)
Definition: messages.F90:898
character(len=512), private msg
Definition: messages.F90:167
subroutine, public messages_warning(no_lines, all_nodes, namespace)
Definition: messages.F90:525
subroutine, public messages_obsolete_variable(namespace, name, rep)
Definition: messages.F90:1023
subroutine, public messages_new_line()
Definition: messages.F90:1112
character(len=256), dimension(max_lines), public message
Message lines to be output by messages_fatal and messages_warning.
Definition: messages.F90:162
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
Definition: messages.F90:410
subroutine, public messages_input_error(namespace, var, details, row, column)
Definition: messages.F90:691
subroutine, public messages_info(no_lines, iunit, debug_only, stress, all_nodes, namespace)
Definition: messages.F90:594
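The messages entries describe a module-level line buffer (message) and the routines that flush it. A hedged sketch of the usual pattern, assuming the module is named messages_oct_m and that the optional arguments of messages_info can be omitted:
 ! Hedged sketch: fill message(1..n), then pass the line count to one of
 ! the reporting routines (messages_info / messages_warning / messages_fatal).
 subroutine report_example(nbytes)
   use messages_oct_m
   use, intrinsic :: iso_fortran_env, only: int64
   implicit none

   integer(int64), intent(in) :: nbytes

   write(message(1), '(a,i0,a)') 'Allocated ', nbytes, ' bytes on the device.'
   call messages_info(1)
 end subroutine report_example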
type(type_t), public type_float
Definition: types.F90:135
type(type_t), public type_cmplx
Definition: types.F90:136
integer pure function, public types_get_size(this)
Definition: types.F90:154
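The type_t constants above are the element types carried by accel_mem_t buffers; types_get_size returns their size in bytes. A hedged sketch (the typical sizes in the comments are assumptions based on real64/complex(real64) storage):
 ! Hedged sketch: query the byte size of the stored element types.
 subroutine type_size_example()
   use types_oct_m
   implicit none
   integer :: bytes_real, bytes_cmplx
   bytes_real  = types_get_size(type_float)   ! typically 8  (real64)
   bytes_cmplx = types_get_size(type_cmplx)   ! typically 16 (complex(real64))
 end subroutine type_size_example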
This module defines the unit system, used for input and output.
type(unit_t), public unit_gigabytes
For larger amounts of data (natural code units are bytes)
type(unit_t), public unit_megabytes
For large amounts of data (natural code units are bytes)
type(unit_t), public unit_kilobytes
For small amounts of data (natural code units are bytes)
int true(void)