use iso_c_binding, only: c_size_t
use, intrinsic :: iso_fortran_env

integer, public, parameter :: &
  ACCEL_MEM_READ_ONLY = 0, &

! component of accel_context_t
type(c_ptr) :: cuda_context

#if defined(HAVE_CUDA)
! component of accel_device_t
type(c_ptr) :: cuda_device

! components of accel_t
type(accel_context_t) :: context
type(accel_device_t) :: device
type(c_ptr) :: cublas_handle
type(c_ptr) :: cuda_stream
type(c_ptr) :: module_map
integer :: max_workgroup_size
integer(int64) :: local_memory_size
integer(int64) :: global_memory_size
logical :: allow_CPU_only
logical :: shared_mem
integer(int64) :: initialize_buffers
character(len=32) :: debug_flag
integer(int64) :: max_block_dim(3)
integer(int64) :: max_grid_dim(3)

! components of accel_mem_t
integer(c_size_t) :: size = 0
logical :: allocated = .false.

! components of accel_kernel_t
type(c_ptr) :: cuda_kernel
type(c_ptr) :: cuda_module
type(c_ptr) :: arguments
integer(int64) :: cuda_shared_mem
logical :: initialized = .false.
type(accel_kernel_t), pointer :: next
character(len=128) :: kernel_name

! module variables
type(accel_t), public :: accel
type(accel_mem_t), public, save :: zM_0_buffer, zM_1_buffer
type(accel_mem_t), public, save :: dM_0_buffer, dM_1_buffer
type(accel_kernel_t), public, target, save :: kernel_vpsi
type(accel_kernel_t), public, target, save :: kernel_vpsi_complex
type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors_complex
type(accel_kernel_t), public, target, save :: kernel_daxpy
type(accel_kernel_t), public, target, save :: kernel_zaxpy
type(accel_kernel_t), public, target, save :: kernel_copy
type(accel_kernel_t), public, target, save :: kernel_copy_complex_to_real
type(accel_kernel_t), public, target, save :: kernel_copy_real_to_complex
type(accel_kernel_t), public, target, save :: dpack
type(accel_kernel_t), public, target, save :: zpack
type(accel_kernel_t), public, target, save :: dunpack
type(accel_kernel_t), public, target, save :: zunpack
type(accel_kernel_t), public, target, save :: kernel_ghost_reorder
type(accel_kernel_t), public, target, save :: kernel_density_real
type(accel_kernel_t), public, target, save :: kernel_density_complex
type(accel_kernel_t), public, target, save :: kernel_density_spinors
type(accel_kernel_t), public, target, save :: kernel_phase
type(accel_kernel_t), public, target, save :: kernel_phase_spiral
type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
type(accel_kernel_t), public, target, save :: dkernel_batch_dotp
type(accel_kernel_t), public, target, save :: zkernel_batch_dotp
type(accel_kernel_t), public, target, save :: dzmul
type(accel_kernel_t), public, target, save :: zzmul
integer :: buffer_alloc_count
integer(int64) :: allocated_mem
type(accel_kernel_t), pointer :: head
type(alloc_cache_t) :: memcache

! accel_is_enabled() / accel_allow_cpu_only()
enabled = accel%enabled
allow = accel%allow_CPU_only
! accel_init(base_grp, namespace)
type(mpi_grp_t), intent(inout) :: base_grp
type(namespace_t), intent(in) :: namespace

logical :: disable, default, run_benchmark
character(len=256) :: sys_name

buffer_alloc_count = 0
accel%enabled = .not. disable

if (accel%enabled) then
  message(1) = 'Octopus was compiled without Cuda support.'

if (idevice < 0) then
if (idevice < 0) idevice = 0
call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
  idevice, base_grp%rank)
write(message(1), '(A,I5,A,I5,2A)') "Rank ", base_grp%rank, " uses device number ", idevice, &
  " on ", trim(sys_name)
accel%shared_mem = .true.
call cublas_init(accel%cublas_handle, accel%cuda_stream)

accel%max_block_dim(1) = int(dim, int64)
accel%max_block_dim(2) = int(dim, int64)
accel%max_block_dim(3) = int(dim, int64)
accel%max_grid_dim(1) = int(dim, int64)
accel%max_grid_dim(2) = int(dim, int64)
accel%max_grid_dim(3) = int(dim, int64)

call alloc_cache_init(memcache, nint(0.25_real64*accel%global_memory_size, int64))

accel%debug_flag = "-g"
#elif defined(HAVE_CUDA)
accel%debug_flag = "-lineinfo"

! trailing fragments of kernel-compilation calls selecting real/complex variants
flags = ' -DRTYPE_DOUBLE')
flags = '-DRTYPE_COMPLEX')
flags = '-DRTYPE_DOUBLE')
flags = '-DRTYPE_COMPLEX')

call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark)
if (run_benchmark) then

call parse_variable(namespace, 'GPUAwareMPI', default, accel%cuda_mpi)
if (accel%cuda_mpi) then
  call messages_write("Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")

#if defined (HAVE_ACCEL)
call parse_variable(namespace, 'AllowCPUonly', default, accel%allow_CPU_only)
call parse_variable(namespace, 'InitializeGPUBuffers', option__initializegpubuffers__no, accel%initialize_buffers)
character(kind=c_char) :: cval_str(257)
integer :: major, minor
character(len=256) :: val_str

#ifdef __HIP_PLATFORM_AMD__
#ifdef __HIP_PLATFORM_AMD__
cval_str = c_null_char

! accel_end(namespace)
integer(int64) :: hits, misses
real(real64) :: volume_hits, volume_misses

if (.not. found) exit

call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)

if (hits + misses > 0) then
if (volume_hits + volume_misses > 0) then
  call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt='(f6.1)', align_left = .true.)
call cublas_end(accel%cublas_handle)
if (.not. accel%cuda_mpi) then
  call cuda_end(accel%context%cuda_context, accel%device%cuda_device)

if (buffer_alloc_count /= 0) then

! accel_padded_size_i8(nn)
integer(int64), intent(in) :: nn
integer(int64) :: modnn, bsize

modnn = mod(nn, bsize)
if (modnn /= 0) psize = psize + bsize - modnn

! accel_padded_size_i4(nn)
integer(int32), intent(in) :: nn
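The padding arithmetic above rounds a length up to the next multiple of an internal block size. A minimal sketch of that rounding, written as a standalone pure function; the name padded_size_sketch and the explicit bsize argument are illustrative, since the real accel_padded_size chooses its block size internally:

```fortran
! Illustrative sketch only; not the actual accel_padded_size implementation.
pure function padded_size_sketch(nn, bsize) result(psize)
  use, intrinsic :: iso_fortran_env, only: int64
  integer(int64), intent(in) :: nn     ! requested number of elements
  integer(int64), intent(in) :: bsize  ! block size to pad to (passed in here for clarity)
  integer(int64) :: psize
  integer(int64) :: modnn

  psize = nn
  modnn = mod(nn, bsize)
  if (modnn /= 0) psize = psize + bsize - modnn
end function padded_size_sketch
```

For example, padded_size_sketch(1000_int64, 32_int64) returns 1024.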
! accel_create_buffer_4(this, flags, type, size, set_zero, async)
integer, intent(in) :: flags
type(type_t), intent(in) :: type
integer, intent(in) :: size
logical, optional, intent(in) :: set_zero
logical, optional, intent(in) :: async

! accel_create_buffer_8(this, flags, type, size, set_zero, async)
integer, intent(in) :: flags
type(type_t), intent(in) :: type
integer(int64), intent(in) :: size
logical, optional, intent(in) :: set_zero
logical, optional, intent(in) :: async

integer(int64) :: fsize
integer(int64) :: initialize_buffers

this%allocated = .true.

if (.not. found) then
  call cuda_mem_alloc_async(this%mem, fsize)

buffer_alloc_count = buffer_alloc_count + 1
allocated_mem = allocated_mem + fsize

if (present(set_zero)) then
  initialize_buffers = merge(option__initializegpubuffers__yes, option__initializegpubuffers__no, set_zero)
else
  initialize_buffers = accel%initialize_buffers

select case (initialize_buffers)
case (option__initializegpubuffers__yes)
case (option__initializegpubuffers__nan)
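A hedged usage sketch for the buffer routines above. It assumes that generic names accel_create_buffer, accel_write_buffer and accel_release_buffer dispatch to the specifics listed in this module (accel_create_buffer_8, daccel_write_buffer_1, accel_release_buffer), and the module names and integer kinds in the use statements are assumptions, not taken from this listing:

```fortran
subroutine buffer_roundtrip_sketch()
  use, intrinsic :: iso_fortran_env, only: int64, real64
  use accel_oct_m     ! assumed module name for the accel API shown in this listing
  use types_oct_m     ! assumed home of type_float
  implicit none

  real(real64), allocatable :: host(:)
  type(accel_mem_t) :: buf
  integer(int64), parameter :: n = 1024_int64

  allocate(host(n), source=1.0_real64)

  ! read/write device buffer of n double-precision elements, zeroed on creation
  call accel_create_buffer(buf, ACCEL_MEM_READ_WRITE, type_float, n, set_zero=.true.)
  call accel_write_buffer(buf, n, host)   ! upload host(:) to the device
  ! ... kernels operating on buf would run here ...
  call accel_release_buffer(buf)

  deallocate(host)
end subroutine buffer_roundtrip_sketch
```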
! accel_release_buffer(this, async)
logical, optional, intent(in) :: async

integer(int64) :: fsize

if (this%size > 0) then
  call cuda_mem_free_async(this%mem)

buffer_alloc_count = buffer_alloc_count - 1
allocated_mem = allocated_mem + fsize

this%allocated = .false.
! accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
integer, intent(in) :: flags
type(type_t), intent(in) :: type
integer, intent(in) :: required_size
logical, intent(in) :: set_zero
logical, optional, intent(in) :: async

! accel_buffer_is_allocated(this)
allocated = this%allocated
integer, intent(in) :: narg

! accel_set_kernel_arg_local(kernel, narg, type, size)
integer, intent(in) :: narg
integer, intent(in) :: size

integer(int64) :: size_in_bytes

if (size_in_bytes > accel%local_memory_size) then
  write(message(1), '(a,f12.6,a)') "CL Error: requested local memory: ", real(size_in_bytes, real64)/1024.0, " Kb"
  write(message(2), '(a,f12.6,a)') " available local memory: ", real(accel%local_memory_size, real64)/1024.0, " Kb"
else if (size_in_bytes <= 0) then
  write(message(1), '(a,i10)') "CL Error: invalid local memory size: ", size_in_bytes

kernel%cuda_shared_mem = size_in_bytes
! accel_kernel_run_8(kernel, globalsizes, localsizes)
integer(int64), intent(in) :: globalsizes(:)
integer(int64), intent(in) :: localsizes(:)

integer(int64) :: gsizes(1:3)
integer(int64) :: lsizes(1:3)

dim = ubound(globalsizes, dim=1)
assert(dim == ubound(localsizes, dim=1))

if (any(globalsizes == 0)) return

assert(all(localsizes > 0))
assert(all(mod(globalsizes, localsizes) == 0))

gsizes(1:dim) = globalsizes(1:dim)
lsizes(1:dim) = localsizes(1:dim)

if (any(lsizes(1:3) > accel%max_block_dim(1:3))) then
  message(1) = "Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
  message(2) = "The following conditions should be fulfilled:"
  write(message(3), "(A, I8, A, I8)") "Dim 1: ", lsizes(1), " <= ", accel%max_block_dim(1)
  write(message(4), "(A, I8, A, I8)") "Dim 2: ", lsizes(2), " <= ", accel%max_block_dim(2)
  write(message(5), "(A, I8, A, I8)") "Dim 3: ", lsizes(3), " <= ", accel%max_block_dim(3)
  message(6) = "This is an internal error, please contact the developers."

  message(1) = "Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
  message(2) = "The following condition should be fulfilled:"
  message(4) = "This is an internal error, please contact the developers."

gsizes(1:3) = gsizes(1:3)/lsizes(1:3)

if (any(gsizes(1:3) > accel%max_grid_dim(1:3))) then
  message(1) = "Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
  message(2) = "The following conditions should be fulfilled:"
  write(message(3), "(A, I8, A, I10)") "Dim 1: ", gsizes(1), " <= ", accel%max_grid_dim(1)
  write(message(4), "(A, I8, A, I10)") "Dim 2: ", gsizes(2), " <= ", accel%max_grid_dim(2)
  write(message(5), "(A, I8, A, I10)") "Dim 3: ", gsizes(3), " <= ", accel%max_grid_dim(3)
  message(6) = "This is an internal error, please contact the developers."

call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), lsizes(1), kernel%cuda_shared_mem, kernel%arguments)
kernel%cuda_shared_mem = 0
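A hedged launch sketch built on the checks above: the local size is taken from accel_kernel_workgroup_size, and the global size is rounded up to an exact multiple of it so the mod(globalsizes, localsizes) == 0 assertion holds. A generic accel_kernel_run dispatching to accel_kernel_run_8 for int64 sizes is an assumption here, as are np and the choice of kernel_vpsi:

```fortran
! Sketch only: assumes an initialized accel state and already-bound kernel arguments.
integer(int64) :: gsize(1), lsize(1)
integer(int64) :: np

np = 100000_int64
lsize(1) = int(accel_kernel_workgroup_size(kernel_vpsi), int64)
gsize(1) = ((np + lsize(1) - 1_int64)/lsize(1))*lsize(1)   ! round up to a multiple of lsize

call accel_kernel_run(kernel_vpsi, gsize, lsize)
call accel_finish()
```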
! accel_kernel_run_4(kernel, globalsizes, localsizes)
integer, intent(in) :: globalsizes(:)
integer, intent(in) :: localsizes(:)

! accel_max_workgroup_size()
max_workgroup_size = accel%max_workgroup_size

! accel_kernel_workgroup_size(kernel)
integer :: max_workgroup_size

call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_workgroup_size)
if (debug%info .and. max_workgroup_size /= accel%max_workgroup_size) then
  write(message(1), "(A, I5, A)") "A kernel can only use fewer threads per block (", workgroup_size, ")", &
    "than available on the device (", accel%max_workgroup_size, ")"
  call messages_info(1)

workgroup_size = 256
workgroup_size = min(workgroup_size, max_workgroup_size)
! accel_set_buffer_to(buffer, type, val, nval, offset, async)
type(type_t), intent(in) :: type
integer(int8), intent(in) :: val
integer(int64), intent(in) :: nval
integer(int64), optional, intent(in) :: offset
logical, optional, intent(in) :: async

integer(int64) :: nval_, offset_, type_size

if (present(offset)) then
if (offset > buffer%size) then

type_size = types_get_size(type)
nval_ = nval*type_size
if (present(offset)) offset_ = offset*type_size

call cuda_mem_set_async(buffer%mem, val, nval_, offset_)
if (.not. optional_default(async, .false.)) call accel_finish()

! accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
type(type_t), intent(in) :: type
integer(int64), intent(in) :: nval
integer(int64), optional, intent(in) :: offset
logical, optional, intent(in) :: async

! accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
type(type_t), intent(in) :: type
integer(int32), intent(in) :: nval
integer(int32), optional, intent(in) :: offset
logical, optional, intent(in) :: async

if (present(offset)) then
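As the fragments above show, nval and offset are given in elements and converted to bytes internally via types_get_size, and the routine synchronizes on its own unless async=.true. is passed. A hedged usage sketch, assuming a generic accel_set_buffer_to_zero over the _i4/_i8 specifics and a previously created buffer buf:

```fortran
! Sketch: clear 1024 double-precision elements; synchronous by default.
call accel_set_buffer_to_zero(buf, type_float, 1024_int64)

! Asynchronous variant: enqueue the clear and synchronize explicitly later.
call accel_set_buffer_to_zero(buf, type_float, 1024_int64, async=.true.)
call accel_finish()
```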
! accel_check_bandwidth()
integer, parameter :: times = 10
real(real64) :: time, stime
real(real64) :: read_bw, write_bw
real(real64), allocatable :: data(:)

call messages_new_line()
call messages_write('Info: Benchmarking the bandwidth between main memory and device memory')
call messages_new_line()
call messages_info()

call messages_write(' Buffer size Read bw Write bw')
call messages_new_line()
call messages_write(' [MiB] [MiB/s] [MiB/s]')
call messages_info()

safe_allocate(data(1:size))

stime = loct_clock()
time = (loct_clock() - stime)/real(times, real64)
write_bw = real(size, real64)*8.0_real64/time

stime = loct_clock()
time = (loct_clock() - stime)/real(times, real64)
read_bw = real(size, real64)*8.0_real64/time

call messages_write(size*8.0_real64/1024.0_real64**2)
call messages_write(write_bw/1024.0_real64**2, fmt = '(f10.1)')
call messages_write(read_bw/1024.0_real64**2, fmt = '(f10.1)')
call messages_info()

safe_deallocate_a(data)

size = int(size*2.0)
if (size > 50000000) exit
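The reported numbers follow directly from size*8 bytes per transfer divided by the averaged time, then converted to MiB/s. A worked example with illustrative values:

```fortran
! 8388608 real64 elements = 64 MiB; an averaged 5 ms transfer then reports 12800 MiB/s.
integer(int64), parameter :: nelem = 8388608_int64
real(real64) :: t, bw
t  = 0.005_real64                                          ! averaged seconds per transfer
bw = real(nelem, real64)*8.0_real64/t/1024.0_real64**2     ! = 12800.0 MiB/s
```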
logical pure function accel_use_shared_mem() result(use_shared_mem)
  use_shared_mem = accel%shared_mem

! accel_kernel_global_init() / accel_kernel_global_end()
call cuda_module_map_init(accel%module_map)
next_head => head%next
call cuda_module_map_end(accel%module_map)
! accel_kernel_build(this, file_name, kernel_name, flags)
character(len=*), intent(in) :: file_name
character(len=*), intent(in) :: kernel_name
character(len=*), optional, intent(in) :: flags

character(len=1000) :: all_flags

call profiling_in("ACCEL_COMPILE", exclude = .true.)

all_flags = '-I'//trim(conf%share)//'/kernels/'//" "//trim(accel%debug_flag)

all_flags = trim(all_flags)//' -DSHARED_MEM'

if (present(flags)) then
  all_flags = trim(all_flags)//' '//trim(flags)

call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, &
  string_f_to_c(trim(file_name)), string_f_to_c(trim(all_flags)))

call cuda_create_kernel(this%cuda_kernel, this%cuda_module, string_f_to_c(trim(kernel_name)))
call cuda_alloc_arg_array(this%arguments)

this%cuda_shared_mem = 0
this%initialized = .true.
this%kernel_name = trim(kernel_name)

call profiling_out("ACCEL_COMPILE")

! accel_kernel_end(this)
call cuda_free_arg_array(this%arguments)
call cuda_release_kernel(this%cuda_kernel)
this%initialized = .false.
! accel_kernel_start_call(this, file_name, kernel_name, flags)
character(len=*), intent(in) :: file_name
character(len=*), intent(in) :: kernel_name
character(len=*), optional, intent(in) :: flags

if (.not. this%initialized) then
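A hedged sketch of the lazy-build pattern implied by the initialized check above: accel_kernel_start_call compiles the kernel on first use and is a no-op afterwards. The file name, kernel name, and 0-based argument numbering below are assumptions for illustration; buf, gsize and lsize are as in the earlier sketches:

```fortran
! Sketch: build on first call, then bind arguments and launch (names illustrative).
type(accel_kernel_t), save :: my_kernel

call accel_kernel_start_call(my_kernel, 'example.cl', 'example_kernel', flags='-DRTYPE_DOUBLE')
call accel_set_kernel_arg_buffer(my_kernel, 0, buf)   ! argument index assumed 0-based
call accel_kernel_run(my_kernel, gsize, lsize)
call accel_finish()
```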
! accel_global_memory_size()
size = accel%global_memory_size

! accel_max_size_per_dim(dim)
integer, intent(in) :: dim
if (dim == 1) size = 2**30

! accel_set_stream(stream_number)
integer, intent(in) :: stream_number
call cuda_set_stream(accel%cuda_stream, stream_number)
call cublas_set_stream(accel%cublas_handle, accel%cuda_stream)

! accel_get_stream(stream_number)
integer, intent(inout) :: stream_number
call cuda_get_stream(stream_number)
! accel_synchronize_all_streams()
call cuda_synchronize_all_streams()

! daccel_get_pointer_with_offset(buffer, offset)
type(c_ptr), intent(in) :: buffer
integer(int64), intent(in) :: offset
type(c_ptr) :: buffer_offset

call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)
buffer_offset = buffer

! zaccel_get_pointer_with_offset(buffer, offset): the offset is doubled because
! each complex(real64) element occupies two real64 words in the device buffer
type(c_ptr), intent(in) :: buffer
integer(int64), intent(in) :: offset
type(c_ptr) :: buffer_offset

call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)
buffer_offset = buffer

! accel_clean_pointer(buffer)
type(c_ptr), intent(in) :: buffer
call cuda_clean_pointer(buffer)
! accel_get_unfolded_size(size, grid_size, thread_block_size)
integer(int64), intent(in) :: size
integer(int64), intent(out) :: grid_size
integer(int64), intent(out) :: thread_block_size

#ifdef __HIP_PLATFORM_AMD__
thread_block_size = size

grid_size = size * accel%warp_size
thread_block_size = accel%warp_size
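Some kernels (e.g. projectors) unfold work across warps as an optimization; on CUDA the routine above returns a size of size*warp_size with warp_size threads per block (one warp per original element), while on AMD HIP the block simply covers size threads. A hedged usage sketch; the kernel choice is illustrative, and passing grid_size/thread_block_size as the global/local sizes of accel_kernel_run is an assumption:

```fortran
! Sketch: launch an "unfolded" kernel over np elements.
integer(int64) :: np, global_size, block_size

np = 4096_int64
call accel_get_unfolded_size(np, global_size, block_size)
call accel_kernel_run(kernel_phase, [global_size], [block_size])
call accel_finish()
```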
#include "accel_inc.F90"

#include "complex.F90"
#include "accel_inc.F90"

#include "integer.F90"
#include "accel_inc.F90"

#include "integer8.F90"
#include "accel_inc.F90"
subroutine laccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine zaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
type(accel_kernel_t), target, save, public dkernel_batch_axpy
subroutine, public accel_clean_pointer(buffer)
subroutine accel_kernel_global_end()
subroutine zaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine, public accel_get_unfolded_size(size, grid_size, thread_block_size)
Get unfolded size: some kernels (e.g. projectors) unfold the array across warps as an optimization....
subroutine laccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine laccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine iaccel_write_buffer_2(this, n1, n2, data, offset, async)
pure logical function, public accel_allow_cpu_only()
subroutine daccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine zaccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
logical pure function, public accel_use_shared_mem()
subroutine daccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine zaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine laccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine zaccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine daccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine laccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine daccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine laccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine zaccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine zaccel_read_buffer_0(this, n1, data, offset, async)
subroutine zaccel_write_buffer_single(this, data, async)
subroutine daccel_read_buffer_2(this, n1, n2, data, offset, async)
subroutine iaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine zaccel_read_buffer_2(this, n1, n2, data, offset, async)
subroutine laccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
subroutine iaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
type(accel_kernel_t), target, save, public zkernel_batch_axpy
subroutine zaccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine laccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine zaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine iaccel_write_buffer_1(this, n1, data, offset, async)
subroutine zaccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine laccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine daccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine iaccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine, public accel_kernel_start_call(this, file_name, kernel_name, flags)
subroutine iaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine zaccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine iaccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine iaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine zaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine, public accel_get_stream(stream_number)
subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
subroutine zaccel_read_buffer_1_int32(this, n1, data, offset, async)
subroutine iaccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
integer(int64) pure function, public accel_global_memory_size()
subroutine daccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine laccel_write_buffer_1(this, n1, data, offset, async)
type(accel_kernel_t), target, save, public zkernel_dot_matrix_spinors
type(accel_kernel_t), target, save, public zkernel_ax_function_py
subroutine daccel_read_buffer_1(this, n1, data, offset, async)
subroutine daccel_write_buffer_2(this, n1, n2, data, offset, async)
subroutine zaccel_set_kernel_arg_data(kernel, narg, data)
subroutine daccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine iaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine accel_set_kernel_arg_local(kernel, narg, type, size)
subroutine daccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
integer(int64) function accel_padded_size_i8(nn)
subroutine accel_check_bandwidth()
subroutine iaccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine daccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine laccel_read_buffer_0(this, n1, data, offset, async)
subroutine daccel_write_buffer_0(this, n1, data, offset, async)
subroutine iaccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine zaccel_read_buffer_0_int32(this, n1, data, offset, async)
subroutine iaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
subroutine, public accel_finish()
subroutine laccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine accel_kernel_global_init()
subroutine zaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine daccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine zaccel_write_buffer_1(this, n1, data, offset, async)
subroutine iaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine laccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine laccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine, public accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
subroutine laccel_set_kernel_arg_data(kernel, narg, data)
subroutine iaccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine daccel_write_buffer_1(this, n1, data, offset, async)
subroutine daccel_read_buffer_0_int32(this, n1, data, offset, async)
subroutine zaccel_read_buffer_1(this, n1, data, offset, async)
subroutine iaccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine daccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine laccel_write_buffer_single(this, data, async)
subroutine laccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine daccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
subroutine iaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine zaccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine zaccel_write_buffer_1_int32(this, n1, data, offset, async)
subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
subroutine zaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
logical pure function, public accel_buffer_is_allocated(this)
integer, parameter, public accel_mem_read_write
subroutine daccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine accel_kernel_end(this)
type(accel_kernel_t), target, save, public dkernel_ax_function_py
subroutine laccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine zaccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine zaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
subroutine daccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine laccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
type(c_ptr) function, public daccel_get_pointer_with_offset(buffer, offset)
subroutine iaccel_write_buffer_single(this, data, async)
subroutine iaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
integer pure function, public accel_max_size_per_dim(dim)
subroutine zaccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine iaccel_read_buffer_0(this, n1, data, offset, async)
subroutine daccel_read_buffer_0(this, n1, data, offset, async)
subroutine iaccel_read_buffer_1_int32(this, n1, data, offset, async)
subroutine iaccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine laccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine laccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine zaccel_write_buffer_2(this, n1, n2, data, offset, async)
subroutine laccel_read_buffer_0_int32(this, n1, data, offset, async)
subroutine laccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
subroutine iaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
subroutine zaccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine zaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine daccel_set_kernel_arg_data(kernel, narg, data)
subroutine iaccel_read_buffer_1(this, n1, data, offset, async)
subroutine accel_kernel_run_8(kernel, globalsizes, localsizes)
subroutine, public accel_kernel_build(this, file_name, kernel_name, flags)
subroutine, public accel_init(base_grp, namespace)
subroutine, public accel_end(namespace)
subroutine laccel_write_buffer_0(this, n1, data, offset, async)
subroutine daccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine zaccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine, public accel_synchronize_all_streams()
subroutine, public accel_set_stream(stream_number)
subroutine, public accel_release_buffer(this, async)
subroutine laccel_read_buffer_2(this, n1, n2, data, offset, async)
subroutine daccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine iaccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine iaccel_read_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine laccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
subroutine iaccel_read_buffer_0_int32(this, n1, data, offset, async)
subroutine iaccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
integer(int32) function accel_padded_size_i4(nn)
subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
subroutine daccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine laccel_write_buffer_1_int32(this, n1, data, offset, async)
subroutine iaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
subroutine iaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
pure logical function, public accel_is_enabled()
subroutine zaccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine daccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine iaccel_write_buffer_4(this, n1, n2, n3, n4, data, offset, async)
subroutine daccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine iaccel_write_buffer_1_int32(this, n1, data, offset, async)
integer, parameter, public accel_mem_write_only
subroutine zaccel_write_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine daccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine laccel_write_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine laccel_read_buffer_1_int32(this, n1, data, offset, async)
subroutine daccel_read_buffer_1_int32(this, n1, data, offset, async)
subroutine laccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine daccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine daccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
subroutine accel_kernel_run_4(kernel, globalsizes, localsizes)
subroutine laccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine iaccel_read_buffer_2(this, n1, n2, data, offset, async)
subroutine laccel_write_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine laccel_write_buffer_2(this, n1, n2, data, offset, async)
subroutine daccel_write_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine laccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine zaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
type(c_ptr) function, public zaccel_get_pointer_with_offset(buffer, offset)
subroutine iaccel_read_buffer_6_int32(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine laccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine daccel_write_buffer_single(this, data, async)
subroutine daccel_write_buffer_1_int32(this, n1, data, offset, async)
integer function, public accel_kernel_workgroup_size(kernel)
subroutine zaccel_write_buffer_0(this, n1, data, offset, async)
type(accel_t), public accel
subroutine laccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
subroutine iaccel_read_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine iaccel_set_kernel_arg_data(kernel, narg, data)
subroutine daccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine iaccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine iaccel_write_buffer_0(this, n1, data, offset, async)
subroutine zaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
subroutine laccel_read_buffer_1(this, n1, data, offset, async)
subroutine daccel_read_buffer_5_int32(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine daccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
integer(int64) pure function, public accel_local_memory_size()
subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
integer pure function, public accel_max_workgroup_size()
subroutine laccel_read_buffer_5(this, n1, n2, n3, n4, n5, data, offset, async)
subroutine zaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine zaccel_write_buffer_6(this, n1, n2, n3, n4, n5, n6, data, offset, async)
subroutine daccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
type(accel_kernel_t), pointer head
subroutine zaccel_read_buffer_4_int32(this, n1, n2, n3, n4, data, offset, async)
subroutine, public alloc_cache_put(alloc_cache, size, loc, put)
subroutine, public alloc_cache_get(alloc_cache, size, found, loc)
integer(int64), parameter, public alloc_cache_any_size
real(real64), parameter, public m_zero
complex(real64), parameter, public m_z0
complex(real64), parameter, public m_z1
real(real64), parameter, public m_one
System information (time, memory, sysname)
subroutine string_c_to_f(c_string, f_string)
convert a C string to a Fortran string
subroutine, public loct_sysname(name)
This module is intended to contain "only mathematical" functions and procedures.
subroutine, public messages_print_with_emphasis(msg, iunit, namespace)
character(len=512), private msg
subroutine, public messages_warning(no_lines, all_nodes, namespace)
subroutine, public messages_obsolete_variable(namespace, name, rep)
subroutine, public messages_new_line()
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
subroutine, public messages_input_error(namespace, var, details, row, column)
subroutine, public messages_info(no_lines, iunit, debug_only, stress, all_nodes, namespace)
type(type_t), public type_float
type(type_t), public type_cmplx
integer pure function, public types_get_size(this)
This module defines the unit system, used for input and output.
type(unit_t), public unit_gigabytes
For larger amounts of data (natural code units are bytes)
type(unit_t), public unit_megabytes
For large amounts of data (natural code units are bytes)
type(unit_t), public unit_kilobytes
For small amounts of data (natural code units are bytes)