30 use iso_c_binding,
only: c_size_t
31 use,
intrinsic :: iso_fortran_env
88 integer,
public,
parameter :: &
89 ACCEL_MEM_READ_ONLY = 0, &
96 type(c_ptr) :: cuda_context
104#if defined(HAVE_CUDA)
105 type(c_ptr) :: cuda_device
113 type(accel_context_t) :: context
114 type(accel_device_t) :: device
115 type(c_ptr) :: cublas_handle
116 type(c_ptr) :: cuda_stream
117 type(c_ptr) :: module_map
118 integer :: max_workgroup_size
119 integer(int64) :: local_memory_size
120 integer(int64) :: global_memory_size
122 logical :: allow_CPU_only
123 logical :: shared_mem
126 integer(int64) :: initialize_buffers
127 character(len=32) :: debug_flag
128 integer(int64) :: max_block_dim(3)
129 integer(int64) :: max_grid_dim(3)
135 integer(c_size_t) :: size = 0
138 logical :: allocated = .false.
144 type(c_ptr) :: cuda_kernel
145 type(c_ptr) :: cuda_module
146 type(c_ptr) :: arguments
148 integer(int64) :: cuda_shared_mem
149 logical :: initialized = .false.
150 type(accel_kernel_t),
pointer :: next
152 character(len=128) :: kernel_name
155 type(accel_t),
public :: accel
158 type(accel_mem_t),
public,
save :: zM_0_buffer, zM_1_buffer
159 type(accel_mem_t),
public,
save :: dM_0_buffer, dM_1_buffer
162 type(accel_kernel_t),
public,
target,
save :: kernel_vpsi
163 type(accel_kernel_t),
public,
target,
save :: kernel_vpsi_complex
164 type(accel_kernel_t),
public,
target,
save :: kernel_vpsi_spinors
165 type(accel_kernel_t),
public,
target,
save :: kernel_vpsi_spinors_complex
166 type(accel_kernel_t),
public,
target,
save :: kernel_daxpy
167 type(accel_kernel_t),
public,
target,
save :: kernel_zaxpy
168 type(accel_kernel_t),
public,
target,
save :: kernel_copy
169 type(accel_kernel_t),
public,
target,
save :: kernel_copy_complex_to_real
170 type(accel_kernel_t),
public,
target,
save :: kernel_copy_real_to_complex
171 type(accel_kernel_t),
public,
target,
save :: dpack
172 type(accel_kernel_t),
public,
target,
save :: zpack
173 type(accel_kernel_t),
public,
target,
save :: dunpack
174 type(accel_kernel_t),
public,
target,
save :: zunpack
175 type(accel_kernel_t),
public,
target,
save :: kernel_ghost_reorder
176 type(accel_kernel_t),
public,
target,
save :: kernel_density_real
177 type(accel_kernel_t),
public,
target,
save :: kernel_density_complex
178 type(accel_kernel_t),
public,
target,
save :: kernel_density_spinors
179 type(accel_kernel_t),
public,
target,
save :: kernel_phase
180 type(accel_kernel_t),
public,
target,
save :: kernel_phase_spiral
181 type(accel_kernel_t),
public,
target,
save :: dkernel_dot_matrix
182 type(accel_kernel_t),
public,
target,
save :: zkernel_dot_matrix
188 type(accel_kernel_t),
public,
target,
save :: dkernel_batch_dotp
189 type(accel_kernel_t),
public,
target,
save :: zkernel_batch_dotp
190 type(accel_kernel_t),
public,
target,
save :: dzmul
191 type(accel_kernel_t),
public,
target,
save :: zzmul
286 enabled = accel%enabled
296 allow = accel%allow_CPU_only
305 type(mpi_grp_t),
intent(inout) :: base_grp
306 type(namespace_t),
intent(in) :: namespace
308 logical :: disable, default, run_benchmark
313 character(len=256) :: sys_name
337 accel%enabled = .not. disable
340 if (accel%enabled)
then
341 message(1) =
'Octopus was compiled without Cuda support.'
369 if (idevice < 0)
then
377 if (idevice<0) idevice = 0
378 call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
379 idevice, base_grp%rank)
382 write(
message(1),
'(A,I5,A,I5,2A)')
"Rank ", base_grp%rank,
" uses device number ", idevice, &
383 " on ", trim(sys_name)
387 accel%shared_mem = .
true.
389 call cublas_init(accel%cublas_handle, accel%cuda_stream)
400 accel%max_block_dim(1) = int(dim, int64)
402 accel%max_block_dim(2) = int(dim, int64)
404 accel%max_block_dim(3) = int(dim, int64)
406 accel%max_grid_dim(1) = int(dim, int64)
408 accel%max_grid_dim(2) = int(dim, int64)
410 accel%max_grid_dim(3) = int(dim, int64)
422 accel%debug_flag =
"-g"
423#elif defined(HAVE_CUDA)
424 accel%debug_flag =
"-lineinfo"
434 flags =
' -DRTYPE_DOUBLE')
436 flags =
'-DRTYPE_COMPLEX')
438 flags =
'-DRTYPE_DOUBLE')
440 flags =
'-DRTYPE_COMPLEX')
490 call parse_variable(namespace,
'AccelBenchmark', .false., run_benchmark)
494 if (run_benchmark)
then
515 call parse_variable(namespace,
'GPUAwareMPI', default, accel%cuda_mpi)
516 if (accel%cuda_mpi)
then
518 call messages_write(
"Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")
534#if defined (HAVE_ACCEL)
539 call parse_variable(namespace,
'AllowCPUonly', default, accel%allow_CPU_only)
555 call parse_variable(namespace,
'InitializeGPUBuffers', option__initializegpubuffers__no, accel%initialize_buffers)
570 character(kind=c_char) :: cval_str(257)
572 integer :: major, minor
573 character(len=256) :: val_str
582#ifdef __HIP_PLATFORM_AMD__
592#ifdef __HIP_PLATFORM_AMD__
600 cval_str = c_null_char
648 integer(int64) :: hits, misses
649 real(real64) :: volume_hits, volume_misses
665 if (.not. found)
exit
683 if (hits + misses > 0)
then
690 if (volume_hits + volume_misses > 0)
then
691 call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt=
'(f6.1)', align_left = .
true.)
706 call cublas_end(accel%cublas_handle)
707 if (.not. accel%cuda_mpi)
then
708 call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
729 integer(int64),
intent(in) :: nn
731 integer(int64) :: modnn, bsize
740 modnn = mod(nn, bsize)
741 if (modnn /= 0) psize = psize + bsize - modnn
750 integer(int32),
intent(in) :: nn
760 integer,
intent(in) :: flags
761 type(
type_t),
intent(in) :: type
762 integer,
intent(in) :: size
763 logical,
optional,
intent(in) :: set_zero
764 logical,
optional,
intent(in) :: async
773 integer,
intent(in) :: flags
774 type(
type_t),
intent(in) :: type
775 integer(int64),
intent(in) :: size
776 logical,
optional,
intent(in) :: set_zero
777 logical,
optional,
intent(in) :: async
779 integer(int64) :: fsize
781 integer(int64) :: initialize_buffers
789 this%allocated = .
true.
795 if (.not. found)
then
798 call cuda_mem_alloc_async(this%mem, fsize)
810 if (
present(set_zero))
then
811 initialize_buffers = merge(option__initializegpubuffers__yes, option__initializegpubuffers__no, set_zero)
813 initialize_buffers = accel%initialize_buffers
815 select case (initialize_buffers)
816 case (option__initializegpubuffers__yes)
818 case (option__initializegpubuffers__nan)
829 logical,
optional,
intent(in) :: async
832 integer(int64) :: fsize
836 if (this%size > 0)
then
845 call cuda_mem_free_async(this%mem)
860 this%allocated = .false.
870 integer,
intent(in) :: flags
871 type(
type_t),
intent(in) :: type
872 integer,
intent(in) :: required_size
873 logical,
intent(in) :: set_zero
874 logical,
optional,
intent(in) :: async
895 allocated = this%allocated
914 integer,
intent(in) :: narg
930 integer,
intent(in) :: narg
932 integer,
intent(in) :: size
934 integer(int64) :: size_in_bytes
941 if (size_in_bytes > accel%local_memory_size)
then
942 write(
message(1),
'(a,f12.6,a)')
"CL Error: requested local memory: ", real(size_in_bytes, real64) /1024.0,
" Kb"
943 write(
message(2),
'(a,f12.6,a)')
" available local memory: ", real(accel%local_memory_size, real64) /1024.0,
" Kb"
945 else if (size_in_bytes <= 0)
then
946 write(
message(1),
'(a,i10)')
"CL Error: invalid local memory size: ", size_in_bytes
951 kernel%cuda_shared_mem = size_in_bytes
961 integer(int64),
intent(in) :: globalsizes(:)
962 integer(int64),
intent(in) :: localsizes(:)
965 integer(int64) :: gsizes(1:3)
966 integer(int64) :: lsizes(1:3)
974 dim = ubound(globalsizes, dim=1)
976 assert(dim == ubound(localsizes, dim=1))
979 if (any(globalsizes == 0))
return
981 assert(all(localsizes > 0))
983 assert(all(mod(globalsizes, localsizes) == 0))
985 gsizes(1:dim) = globalsizes(1:dim)
986 lsizes(1:dim) = localsizes(1:dim)
990 if (any(lsizes(1:3) > accel%max_block_dim(1:3)))
then
991 message(1) =
"Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
992 message(2) =
"The following conditions should be fulfilled:"
993 write(
message(3),
"(A, I8, A, I8)")
"Dim 1: ", lsizes(1),
" <= ", accel%max_block_dim(1)
994 write(
message(4),
"(A, I8, A, I8)")
"Dim 2: ", lsizes(2),
" <= ", accel%max_block_dim(2)
995 write(
message(5),
"(A, I8, A, I8)")
"Dim 3: ", lsizes(3),
" <= ", accel%max_block_dim(3)
996 message(6) =
"This is an internal error, please contact the developers."
1003 message(1) =
"Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
1004 message(2) =
"The following condition should be fulfilled:"
1006 message(4) =
"This is an internal error, please contact the developers."
1010 gsizes(1:3) = gsizes(1:3)/lsizes(1:3)
1013 if (any(gsizes(1:3) > accel%max_grid_dim(1:3)))
then
1014 message(1) =
"Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
1015 message(2) =
"The following conditions should be fulfilled:"
1016 write(
message(3),
"(A, I8, A, I10)")
"Dim 1: ", gsizes(1),
" <= ", accel%max_grid_dim(1)
1017 write(
message(4),
"(A, I8, A, I10)")
"Dim 2: ", gsizes(2),
" <= ", accel%max_grid_dim(2)
1018 write(
message(5),
"(A, I8, A, I10)")
"Dim 3: ", gsizes(3),
" <= ", accel%max_grid_dim(3)
1019 message(6) =
"This is an internal error, please contact the developers."
1023 call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), lsizes(1), kernel%cuda_shared_mem, kernel%arguments)
1025 kernel%cuda_shared_mem = 0
1034 integer,
intent(in) :: globalsizes(:)
1035 integer,
intent(in) :: localsizes(:)
1044 max_workgroup_size = accel%max_workgroup_size
1053 integer :: max_workgroup_size
1059 call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_workgroup_size)
1060 if (debug%info .and. max_workgroup_size /=
accel%max_workgroup_size)
then
1061 write(message(1),
"(A, I5, A)")
"A kernel can use only less threads per block (", workgroup_size,
")", &
1062 "than available on the device (",
accel%max_workgroup_size,
")"
1063 call messages_info(1)
1067 workgroup_size = 256
1069 workgroup_size = min(workgroup_size, max_workgroup_size)
1078 type(type_t),
intent(in) :: type
1079 integer(int8),
intent(in) :: val
1080 integer(int64),
intent(in) :: nval
1081 integer(int64),
optional,
intent(in) :: offset
1082 logical,
optional,
intent(in) :: async
1084 integer(int64) :: nval_, offset_, type_size
1094 if (
present(offset))
then
1096 if(offset > buffer%size)
then
1102 type_size = types_get_size(type)
1104 nval_ = nval*type_size
1107 if (
present(offset)) offset_ = offset*type_size
1109 call cuda_mem_set_async(buffer%mem, val, nval_, offset_)
1110 if(.not. optional_default(async, .false.))
call accel_finish()
1119 type(type_t),
intent(in) :: type
1120 integer(int64),
intent(in) :: nval
1121 integer(int64),
optional,
intent(in) :: offset
1122 logical,
optional,
intent(in) :: async
1135 type(type_t),
intent(in) :: type
1136 integer(int32),
intent(in) :: nval
1137 integer(int32),
optional,
intent(in) :: offset
1138 logical,
optional,
intent(in) :: async
1142 if (
present(offset))
then
1155 integer,
parameter :: times = 10
1157 real(real64) :: time, stime
1158 real(real64) :: read_bw, write_bw
1160 real(real64),
allocatable :: data(:)
1162 call messages_new_line()
1163 call messages_write(
'Info: Benchmarking the bandwidth between main memory and device memory')
1164 call messages_new_line()
1165 call messages_info()
1167 call messages_write(
' Buffer size Read bw Write bw')
1168 call messages_new_line()
1169 call messages_write(
' [MiB] [MiB/s] [MiB/s]')
1170 call messages_info()
1174 safe_allocate(
data(1:size))
1177 stime = loct_clock()
1182 time = (loct_clock() - stime)/real(times, real64)
1184 write_bw = real(
size, real64) *8.0_real64/time
1186 stime = loct_clock()
1192 time = (loct_clock() - stime)/real(times, real64)
1193 read_bw = real(
size, real64) *8.0_real64/time
1195 call messages_write(size*8.0_real64/1024.0_real64**2)
1196 call messages_write(write_bw/1024.0_real64**2, fmt =
'(f10.1)')
1197 call messages_write(read_bw/1024.0_real64**2, fmt =
'(f10.1)')
1198 call messages_info()
1202 safe_deallocate_a(data)
1204 size = int(size*2.0)
1206 if (
size > 50000000)
exit
1212 logical pure function accel_use_shared_mem() result(use_shared_mem)
1214 use_shared_mem =
accel%shared_mem
1226 call cuda_module_map_init(
accel%module_map)
1240 next_head =>
head%next
1246 call cuda_module_map_end(
accel%module_map)
1256 character(len=*),
intent(in) :: file_name
1257 character(len=*),
intent(in) :: kernel_name
1258 character(len=*),
optional,
intent(in) :: flags
1261 character(len=1000) :: all_flags
1266 call profiling_in(
"ACCEL_COMPILE", exclude = .
true.)
1269 all_flags =
'-I'//trim(conf%share)//
'/kernels/'//
" "//trim(
accel%debug_flag)
1272 all_flags = trim(all_flags)//
' -DSHARED_MEM'
1275 if (
present(flags))
then
1276 all_flags = trim(all_flags)//
' '//trim(flags)
1279 call cuda_build_program(
accel%module_map, this%cuda_module,
accel%device%cuda_device, &
1280 string_f_to_c(trim(file_name)), string_f_to_c(trim(all_flags)))
1282 call cuda_create_kernel(this%cuda_kernel, this%cuda_module, string_f_to_c(trim(kernel_name)))
1283 call cuda_alloc_arg_array(this%arguments)
1285 this%cuda_shared_mem = 0
1288 this%initialized = .
true.
1289 this%kernel_name = trim(kernel_name)
1291 call profiling_out(
"ACCEL_COMPILE")
1304 call cuda_free_arg_array(this%arguments)
1305 call cuda_release_kernel(this%cuda_kernel)
1309 this%initialized = .false.
1318 character(len=*),
intent(in) :: file_name
1319 character(len=*),
intent(in) :: kernel_name
1320 character(len=*),
optional,
intent(in) :: flags
1324 if (.not. this%initialized)
then
1337 size =
accel%global_memory_size
1352 integer,
intent(in) :: dim
1357 if (dim == 1)
size = 2**30
1364 integer,
intent(in) :: stream_number
1370 call cuda_set_stream(
accel%cuda_stream, stream_number)
1371 call cublas_set_stream(
accel%cublas_handle,
accel%cuda_stream)
1381 integer,
intent(inout) :: stream_number
1387 call cuda_get_stream(stream_number)
1401 call cuda_synchronize_all_streams()
1409 type(c_ptr),
intent(in) :: buffer
1410 integer(int64),
intent(in) :: offset
1411 type(c_ptr) :: buffer_offset
1415 call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)
1418 buffer_offset = buffer
1424 type(c_ptr),
intent(in) :: buffer
1425 integer(int64),
intent(in) :: offset
1426 type(c_ptr) :: buffer_offset
1430 call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)
1433 buffer_offset = buffer
1439 type(c_ptr),
intent(in) :: buffer
1443 call cuda_clean_pointer(buffer)
1452 integer(int64),
intent(in) :: size
1453 integer(int64),
intent(out) :: grid_size
1454 integer(int64),
intent(out) :: thread_block_size
1457#ifdef __HIP_PLATFORM_AMD__
1460 thread_block_size =
size
1462 grid_size =
size *
accel%warp_size
1463 thread_block_size =
accel%warp_size
1470#include "accel_inc.F90"
1473#include "complex.F90"
1474#include "accel_inc.F90"
1477#include "integer.F90"
1478#include "accel_inc.F90"
1481#include "integer8.F90"
1482#include "accel_inc.F90"
subroutine laccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine zaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
type(accel_kernel_t), target, save, public dkernel_batch_axpy
subroutine, public accel_clean_pointer(buffer)
subroutine accel_kernel_global_end()
subroutine zaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine, public accel_get_unfolded_size(size, grid_size, thread_block_size)
Get unfolded size: some kernels (e.g. projectors) unfold the array across warps as an optimization....
subroutine laccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine iaccel_write_buffer_2(this, n1, n2, data, offset, async)
pure logical function, public accel_allow_cpu_only()
subroutine daccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
logical pure function, public accel_use_shared_mem()
subroutine zaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine zaccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine laccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine daccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine laccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine zaccel_read_buffer_0(this, n1, data, offset, async)
subroutine zaccel_write_buffer_single(this, data, async)
subroutine daccel_read_buffer_2(this, n1, n2, data, offset, async)
integer buffer_alloc_count
subroutine iaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine zaccel_read_buffer_2(this, n1, n2, data, offset, async)
subroutine laccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
subroutine iaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
type(accel_kernel_t), target, save, public zkernel_batch_axpy
subroutine zaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine iaccel_write_buffer_1(this, n1, data, offset, async)
subroutine zaccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine iaccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine, public accel_kernel_start_call(this, file_name, kernel_name, flags)
subroutine iaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine iaccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine iaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine zaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine, public accel_get_stream(stream_number)
subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
subroutine zaccel_read_buffer_1_int32(this, n1, data, offset, async)
integer(int64) pure function, public accel_global_memory_size()
subroutine laccel_write_buffer_1(this, n1, data, offset, async)
type(accel_kernel_t), target, save, public zkernel_dot_matrix_spinors
type(accel_kernel_t), target, save, public zkernel_ax_function_py
subroutine daccel_read_buffer_1(this, n1, data, offset, async)
subroutine daccel_write_buffer_2(this, n1, n2, data, offset, async)
subroutine zaccel_set_kernel_arg_data(kernel, narg, data)
subroutine daccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine iaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine accel_set_kernel_arg_local(kernel, narg, type, size)
subroutine daccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
integer(int64) function accel_padded_size_i8(nn)
subroutine accel_check_bandwidth()
subroutine laccel_read_buffer_0(this, n1, data, offset, async)
subroutine daccel_write_buffer_0(this, n1, data, offset, async)
subroutine iaccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine zaccel_read_buffer_0_int32(this, n1, data, offset, async)
subroutine iaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
subroutine, public accel_finish()
subroutine accel_kernel_global_init()
subroutine zaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
integer(int64) allocated_mem
subroutine zaccel_write_buffer_1(this, n1, data, offset, async)
subroutine iaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine laccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine, public accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
subroutine laccel_set_kernel_arg_data(kernel, narg, data)
subroutine daccel_write_buffer_1(this, n1, data, offset, async)
subroutine daccel_read_buffer_0_int32(this, n1, data, offset, async)
subroutine zaccel_read_buffer_1(this, n1, data, offset, async)
subroutine daccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine laccel_write_buffer_single(this, data, async)
subroutine daccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
subroutine iaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine zaccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine zaccel_write_buffer_1_int32(this, n1, data, offset, async)
subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
subroutine zaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
logical pure function, public accel_buffer_is_allocated(this)
integer, parameter, public accel_mem_read_write
subroutine daccel_create_blas_alpha_beta_buffer(this, data, async)
subroutine accel_kernel_end(this)
type(accel_kernel_t), target, save, public dkernel_ax_function_py
subroutine laccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine zaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
subroutine daccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
subroutine laccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
type(c_ptr) function, public daccel_get_pointer_with_offset(buffer, offset)
subroutine iaccel_write_buffer_single(this, data, async)
subroutine iaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
integer pure function, public accel_max_size_per_dim(dim)
subroutine iaccel_read_buffer_0(this, n1, data, offset, async)
subroutine daccel_read_buffer_0(this, n1, data, offset, async)
subroutine iaccel_read_buffer_1_int32(this, n1, data, offset, async)
subroutine laccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine zaccel_write_buffer_2(this, n1, n2, data, offset, async)
subroutine laccel_read_buffer_0_int32(this, n1, data, offset, async)
subroutine laccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
subroutine iaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
subroutine zaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine daccel_set_kernel_arg_data(kernel, narg, data)
subroutine iaccel_read_buffer_1(this, n1, data, offset, async)
subroutine accel_kernel_run_8(kernel, globalsizes, localsizes)
subroutine, public accel_kernel_build(this, file_name, kernel_name, flags)
subroutine, public accel_init(base_grp, namespace)
subroutine, public accel_end(namespace)
subroutine laccel_write_buffer_0(this, n1, data, offset, async)
subroutine, public accel_synchronize_all_streams()
subroutine, public accel_set_stream(stream_number)
subroutine, public accel_release_buffer(this, async)
subroutine laccel_read_buffer_2(this, n1, n2, data, offset, async)
subroutine daccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine laccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
subroutine iaccel_read_buffer_0_int32(this, n1, data, offset, async)
integer(int32) function accel_padded_size_i4(nn)
subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
subroutine laccel_write_buffer_1_int32(this, n1, data, offset, async)
subroutine iaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
subroutine iaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
pure logical function, public accel_is_enabled()
type(alloc_cache_t) memcache
subroutine daccel_write_buffer_0_int32(this, n1, data, offset, async)
subroutine daccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine iaccel_write_buffer_1_int32(this, n1, data, offset, async)
integer, parameter, public accel_mem_write_only
subroutine daccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine laccel_read_buffer_1_int32(this, n1, data, offset, async)
subroutine daccel_read_buffer_1_int32(this, n1, data, offset, async)
subroutine daccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
subroutine accel_kernel_run_4(kernel, globalsizes, localsizes)
subroutine laccel_release_blas_alpha_beta_buffer(this, data, async)
subroutine iaccel_read_buffer_2(this, n1, n2, data, offset, async)
subroutine laccel_write_buffer_2(this, n1, n2, data, offset, async)
subroutine laccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine zaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
type(c_ptr) function, public zaccel_get_pointer_with_offset(buffer, offset)
subroutine laccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
subroutine daccel_write_buffer_single(this, data, async)
subroutine daccel_write_buffer_1_int32(this, n1, data, offset, async)
integer function, public accel_kernel_workgroup_size(kernel)
subroutine zaccel_write_buffer_0(this, n1, data, offset, async)
type(accel_t), public accel
subroutine laccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
subroutine iaccel_set_kernel_arg_data(kernel, narg, data)
subroutine daccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine iaccel_write_buffer_0(this, n1, data, offset, async)
subroutine zaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
subroutine laccel_read_buffer_1(this, n1, data, offset, async)
integer(int64) pure function, public accel_local_memory_size()
subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
integer pure function, public accel_max_workgroup_size()
subroutine zaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
subroutine daccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
type(accel_kernel_t), pointer head
subroutine, public alloc_cache_put(alloc_cache, size, loc, put)
subroutine, public alloc_cache_get(alloc_cache, size, found, loc)
integer(int64), parameter, public alloc_cache_any_size
real(real64), parameter, public m_zero
complex(real64), parameter, public m_z0
complex(real64), parameter, public m_z1
real(real64), parameter, public m_one
System information (time, memory, sysname)
subroutine string_c_to_f(c_string, f_string)
convert a C string to a Fortran string
subroutine, public loct_sysname(name)
This module is intended to contain "only mathematical" functions and procedures.
subroutine, public messages_print_with_emphasis(msg, iunit, namespace)
character(len=512), private msg
subroutine, public messages_warning(no_lines, all_nodes, namespace)
subroutine, public messages_obsolete_variable(namespace, name, rep)
subroutine, public messages_new_line()
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
subroutine, public messages_input_error(namespace, var, details, row, column)
subroutine, public messages_info(no_lines, iunit, debug_only, stress, all_nodes, namespace)
type(type_t), public type_float
type(type_t), public type_cmplx
integer pure function, public types_get_size(this)
This module defines the unit system, used for input and output.
type(unit_t), public unit_gigabytes
For larger amounts of data (natural code units are bytes)
type(unit_t), public unit_megabytes
For large amounts of data (natural code units are bytes)
type(unit_t), public unit_kilobytes
For small amounts of data (natural code units are bytes)