21#if defined(HAVE_OPENCL) && defined(HAVE_CUDA)
22#error "Cannot compile with OpenCL and Cuda support at the same time"
25#if defined(HAVE_OPENCL) || defined(HAVE_CUDA)
34#if defined(HAVE_CLBLAS) || defined(HAVE_CLBLAST)
43 use iso_c_binding,
only: c_size_t
44 use,
intrinsic :: iso_fortran_env
98 integer,
public,
parameter :: &
99 ACCEL_MEM_READ_ONLY = cl_mem_read_only, &
103 integer,
public,
parameter :: &
104 ACCEL_MEM_READ_ONLY = 0, &
112 type(cl_context) :: cl_context
113#elif defined(HAVE_CUDA)
114 type(c_ptr) :: cuda_context
123 type(cl_device_id) :: cl_device
124#elif defined(HAVE_CUDA)
125 type(c_ptr) :: cuda_device
133 type(accel_context_t) :: context
134 type(accel_device_t) :: device
136 type(cl_command_queue) :: command_queue
138 type(c_ptr) :: cublas_handle
139 type(c_ptr) :: cuda_stream
140 type(c_ptr) :: module_map
141 integer :: max_workgroup_size
142 integer(int64) :: local_memory_size
143 integer(int64) :: global_memory_size
145 logical :: allow_CPU_only
146 logical :: shared_mem
149 logical :: initialize_buffers
150 character(len=32) :: debug_flag
151 integer(int64) :: max_block_dim(3)
152 integer(int64) :: max_grid_dim(3)
162 integer(c_size_t) :: size = 0
165 logical :: allocated = .false.
171 type(cl_kernel) :: kernel
174 type(c_ptr) :: cuda_kernel
175 type(c_ptr) :: cuda_module
176 type(c_ptr) :: arguments
178 integer(int64) :: cuda_shared_mem
179 logical :: initialized = .false.
180 type(accel_kernel_t),
pointer :: next
182 character(len=128) :: kernel_name
194 type(accel_kernel_t),
public,
target,
save :: kernel_copy
195 type(accel_kernel_t),
public,
target,
save :: dpack
196 type(accel_kernel_t),
public,
target,
save :: zpack
205 type(accel_kernel_t),
public,
target,
save :: dkernel_dot_matrix
206 type(accel_kernel_t),
public,
target,
save :: zkernel_dot_matrix
207 type(accel_kernel_t),
public,
target,
save :: zkernel_dot_matrix_spinors
216 type(accel_kernel_t),
public,
target,
save :: set_one
290 integer,
parameter :: &
297 integer,
parameter :: &
307 integer :: buffer_alloc_count
308 integer(int64) :: allocated_mem
309 type(accel_kernel_t),
pointer :: head
310 type(alloc_cache_t) :: memcache
316 enabled =
accel%enabled
326 allow =
accel%allow_CPU_only
335 type(mpi_grp_t),
intent(inout) :: base_grp
336 type(namespace_t),
intent(in) :: namespace
338 logical :: disable, default, run_benchmark
339 integer :: idevice, iplatform
341 integer :: device_type
342 integer :: cl_status, idev
343 integer :: ndevices, ret_devices, nplatforms, iplat
344 character(len=256) :: device_name
345 type(cl_platform_id) :: platform_id
346 type(cl_program) :: prog
347 type(cl_platform_id),
allocatable :: allplatforms(:)
348 type(cl_device_id),
allocatable :: alldevices(:)
353 character(len=256) :: sys_name
359 buffer_alloc_count = 0
377 accel%enabled = .not. disable
380 if (
accel%enabled)
then
381 message(1) =
'Octopus was compiled without OpenCL or Cuda support.'
448 if (idevice<0) idevice = 0
450 idevice, base_grp%rank)
453 write(
message(1),
'(A,I5,A,I5,2A)')
"Rank ", base_grp%rank,
" uses device number ", idevice, &
454 " on ", trim(sys_name)
461 call cublas_init(
accel%cublas_handle,
accel%cuda_stream)
467 call clgetplatformids(nplatforms, cl_status)
470 safe_allocate(allplatforms(1:nplatforms))
472 call clgetplatformids(allplatforms, iplat, cl_status)
479 do iplat = 1, nplatforms
481 call clgetplatforminfo(allplatforms(iplat), cl_platform_name, device_name, cl_status)
483 if (iplatform < 0)
then
487 if (iplatform == iplat - 1)
then
495 call clgetplatforminfo(allplatforms(iplat), cl_platform_version, device_name, cl_status)
502 if (iplatform >= nplatforms .or. iplatform < 0)
then
504 if (iplatform > 0)
then
512 platform_id = allplatforms(iplatform + 1)
514 safe_deallocate_a(allplatforms)
516 call clgetdeviceids(platform_id, cl_device_type_all, ndevices, cl_status)
522 safe_allocate(alldevices(1:ndevices))
526 call clgetdeviceids(platform_id, cl_device_type_all, alldevices, ret_devices, cl_status)
528 do idev = 1, ndevices
531 call clgetdeviceinfo(alldevices(idev), cl_device_name, device_name, cl_status)
536 select case (idevice)
538 device_type = cl_device_type_gpu
540 device_type = cl_device_type_cpu
542 device_type = cl_device_type_accelerator
544 device_type = cl_device_type_default
546 device_type = cl_device_type_all
550 call clgetdeviceids(platform_id, device_type, alldevices, ret_devices, cl_status)
552 if (ret_devices < 1)
then
554 call clgetdeviceids(platform_id, cl_device_type_default, alldevices, ret_devices, cl_status)
556 if (ret_devices < 1)
then
558 call clgetdeviceids(platform_id, cl_device_type_all, alldevices, ret_devices, cl_status)
561 if (ret_devices < 1)
then
568 ndevices = ret_devices
570 if (idevice < 0)
then
571 if (base_grp%size > 1)
then
580 if (idevice >= ndevices)
then
581 call messages_write(
'Requested CL device does not exist (device = ')
589 accel%device%cl_device = alldevices(idevice + 1)
592 accel%context%cl_context = clcreatecontext(platform_id,
accel%device%cl_device, cl_status)
595 safe_deallocate_a(alldevices)
597 accel%command_queue = clcreatecommandqueue(
accel%context%cl_context,
accel%device%cl_device, &
598 cl_queue_profiling_enable, cl_status)
601 call clgetdeviceinfo(
accel%device%cl_device, cl_device_type, device_type, cl_status)
603 select case (device_type)
604 case (cl_device_type_gpu)
606 case (cl_device_type_cpu, cl_device_type_accelerator)
607 accel%shared_mem = .false.
609 accel%shared_mem = .false.
618 call clfftsetup(cl_status)
629 call clgetdeviceinfo(
accel%device%cl_device, cl_device_global_mem_size,
accel%global_memory_size, cl_status)
630 call clgetdeviceinfo(
accel%device%cl_device, cl_device_local_mem_size,
accel%local_memory_size, cl_status)
631 call clgetdeviceinfo(
accel%device%cl_device, cl_device_max_work_group_size,
accel%max_workgroup_size, cl_status)
640 accel%max_block_dim(1) = int(dim, int64)
642 accel%max_block_dim(2) = int(dim, int64)
644 accel%max_block_dim(3) = int(dim, int64)
646 accel%max_grid_dim(1) = int(dim, int64)
648 accel%max_grid_dim(2) = int(dim, int64)
650 accel%max_grid_dim(3) = int(dim, int64)
662 accel%debug_flag =
"-g"
663#elif defined(HAVE_CUDA)
664 accel%debug_flag =
"-lineinfo"
665#elif defined(HAVE_OPENCL)
666 accel%debug_flag =
"-g"
678 flags =
' -DRTYPE_DOUBLE')
680 flags =
'-DRTYPE_COMPLEX')
682 flags =
'-DRTYPE_DOUBLE')
684 flags =
'-DRTYPE_COMPLEX')
713 call parse_variable(namespace,
'AccelBenchmark', .false., run_benchmark)
717 if (run_benchmark)
then
739 if (
accel%cuda_mpi)
then
741 call messages_write(
"Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")
757#if defined (HAVE_ACCEL)
780#if defined(HAVE_OPENCL)
782 integer,
intent(inout) :: idevice
784 character(len=256) :: device_name
788 idevice = mod(base_grp%rank, ndevices)
790 call base_grp%barrier()
793 do irank = 0, base_grp%size - 1
794 if (irank == base_grp%rank)
then
795 call clgetdeviceinfo(alldevices(idevice + 1), cl_device_name, device_name, cl_status)
803 call base_grp%barrier()
812 integer(int64) :: val
817 integer :: major, minor
818 character(len=256) :: val_str
830#ifdef __HIP_PLATFORM_AMD__
840#ifdef __HIP_PLATFORM_AMD__
848 call clgetdeviceinfo(
accel%device%cl_device, cl_device_type, val, cl_status)
850 select case (int(val, int32))
851 case (cl_device_type_gpu)
853 case (cl_device_type_cpu)
855 case (cl_device_type_accelerator)
860 call clgetdeviceinfo(
accel%device%cl_device, cl_device_vendor, val_str, cl_status)
866 call clgetdeviceinfo(
accel%device%cl_device, cl_device_name, val_str, cl_status)
885 call clgetdeviceinfo(
accel%device%cl_device, cl_driver_version, val_str, cl_status)
897 call clgetdeviceinfo(
accel%device%cl_device, cl_device_max_compute_units, val, cl_status)
902 call clgetdeviceinfo(
accel%device%cl_device, cl_device_max_clock_frequency, val, cl_status)
919 call clgetdeviceinfo(
accel%device%cl_device, cl_device_max_mem_alloc_size, val, cl_status)
924 call clgetdeviceinfo(
accel%device%cl_device, cl_device_global_mem_cache_size, val, cl_status)
929 call clgetdeviceinfo(
accel%device%cl_device, cl_device_max_constant_buffer_size, val, cl_status)
966 character(len=*),
intent(in) :: platform_name
969 if (index(platform_name,
'AMD') > 0) platform_id =
cl_plat_amd
970 if (index(platform_name,
'ATI') > 0) platform_id =
cl_plat_ati
971 if (index(platform_name,
'NVIDIA') > 0) platform_id =
cl_plat_nvidia
972 if (index(platform_name,
'Intel') > 0) platform_id =
cl_plat_intel
983 integer(int64) :: hits, misses
984 real(real64) :: volume_hits, volume_misses
994 if (.not. found)
exit
997 call clreleasememobject(tmp%mem, ierr)
1005 call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)
1016 if (hits + misses > 0)
then
1017 call messages_write(hits/real(hits + misses, real64)*100, fmt=
'(f6.1)', align_left = .
true.)
1023 if (volume_hits + volume_misses > 0)
then
1024 call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt=
'(f6.1)', align_left = .
true.)
1042 call clfftteardown()
1047 call cublas_end(
accel%cublas_handle)
1048 if (.not.
accel%cuda_mpi)
then
1054 call clreleasecommandqueue(
accel%command_queue, ierr)
1060 if (buffer_alloc_count /= 0)
then
1077 integer(int64),
intent(in) :: nn
1079 integer(int64) :: modnn, bsize
1088 modnn = mod(nn, bsize)
1089 if (modnn /= 0) psize = psize + bsize - modnn
1098 integer(int32),
intent(in) :: nn
1108 integer,
intent(in) :: flags
1109 type(
type_t),
intent(in) :: type
1110 integer,
intent(in) :: size
1111 logical,
optional,
intent(in) ::
set_zero
1120 integer,
intent(in) :: flags
1121 type(
type_t),
intent(in) :: type
1122 integer(int64),
intent(in) :: size
1123 logical,
optional,
intent(in) ::
set_zero
1125 integer(int64) :: fsize
1137 this%allocated = .
true.
1143 if (.not. found)
then
1145 this%mem = clcreatebuffer(
accel%context%cl_context, flags, fsize, ierr)
1153 buffer_alloc_count = buffer_alloc_count + 1
1154 allocated_mem = allocated_mem + fsize
1174 integer(int64) :: fsize
1178 if (this%size > 0)
then
1186 call clreleasememobject(this%mem, ierr)
1194 buffer_alloc_count = buffer_alloc_count - 1
1195 allocated_mem = allocated_mem + fsize
1202 this%allocated = .false.
1209 logical pure function accel_buffer_is_allocated(this) result(allocated)
1212 allocated = this%allocated
1226 call clfinish(
accel%command_queue, ierr)
1239 integer,
intent(in) :: narg
1250 call clsetkernelarg(kernel%kernel, narg, buffer%mem, ierr)
1264 integer,
intent(in) :: narg
1265 type(
type_t),
intent(in) :: type
1266 integer,
intent(in) :: size
1271 integer(int64) :: size_in_bytes
1278 if (size_in_bytes >
accel%local_memory_size)
then
1279 write(
message(1),
'(a,f12.6,a)')
"CL Error: requested local memory: ", real(size_in_bytes, real64) /1024.0,
" Kb"
1280 write(
message(2),
'(a,f12.6,a)')
" available local memory: ", real(
accel%local_memory_size, real64) /1024.0,
" Kb"
1282 else if (size_in_bytes <= 0)
then
1283 write(
message(1),
'(a,i10)')
"CL Error: invalid local memory size: ", size_in_bytes
1288 kernel%cuda_shared_mem = size_in_bytes
1292 call clsetkernelarglocal(kernel%kernel, narg, size_in_bytes, ierr)
1303 integer(int64),
intent(in) :: globalsizes(:)
1304 integer(int64),
intent(in) :: localsizes(:)
1310 integer(int64) :: gsizes(1:3)
1311 integer(int64) :: lsizes(1:3)
1319 dim = ubound(globalsizes, dim=1)
1321 assert(dim == ubound(localsizes, dim=1))
1324 if (any(globalsizes == 0))
return
1326 assert(all(localsizes > 0))
1328 assert(all(mod(globalsizes, localsizes) == 0))
1330 gsizes(1:dim) = globalsizes(1:dim)
1331 lsizes(1:dim) = localsizes(1:dim)
1334 call clenqueuendrangekernel(
accel%command_queue, kernel%kernel, gsizes(1:dim), lsizes(1:dim), ierr)
1340 if (any(lsizes(1:3) >
accel%max_block_dim(1:3)))
then
1341 message(1) =
"Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
1342 message(2) =
"The following conditions should be fulfilled:"
1343 write(
message(3),
"(A, I8, A, I8)")
"Dim 1: ", lsizes(1),
" <= ",
accel%max_block_dim(1)
1344 write(
message(4),
"(A, I8, A, I8)")
"Dim 2: ", lsizes(2),
" <= ",
accel%max_block_dim(2)
1345 write(
message(5),
"(A, I8, A, I8)")
"Dim 3: ", lsizes(3),
" <= ",
accel%max_block_dim(3)
1346 message(6) =
"This is an internal error, please contact the developers."
1353 message(1) =
"Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
1354 message(2) =
"The following condition should be fulfilled:"
1356 message(4) =
"This is an internal error, please contact the developers."
1360 gsizes(1:3) = gsizes(1:3)/lsizes(1:3)
1363 if (any(gsizes(1:3) >
accel%max_grid_dim(1:3)))
then
1364 message(1) =
"Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
1365 message(2) =
"The following conditions should be fulfilled:"
1366 write(
message(3),
"(A, I8, A, I10)")
"Dim 1: ", gsizes(1),
" <= ",
accel%max_grid_dim(1)
1367 write(
message(4),
"(A, I8, A, I10)")
"Dim 2: ", gsizes(2),
" <= ",
accel%max_grid_dim(2)
1368 write(
message(5),
"(A, I8, A, I10)")
"Dim 3: ", gsizes(3),
" <= ",
accel%max_grid_dim(3)
1369 message(6) =
"This is an internal error, please contact the developers."
1373 call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), lsizes(1), kernel%cuda_shared_mem, kernel%arguments)
1375 kernel%cuda_shared_mem = 0
1384 integer,
intent(in) :: globalsizes(:)
1385 integer,
intent(in) :: localsizes(:)
1393 integer pure function accel_max_workgroup_size() result(max_workgroup_size)
1394 max_workgroup_size =
accel%max_workgroup_size
1403 integer(int64) :: workgroup_size8
1407 integer :: max_workgroup_size
1413 call clgetkernelworkgroupinfo(kernel%kernel,
accel%device%cl_device, cl_kernel_work_group_size, workgroup_size8, ierr)
1415 workgroup_size = workgroup_size8
1419 call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_workgroup_size)
1420 if (debug%info .and. max_workgroup_size /=
accel%max_workgroup_size)
then
1421 write(message(1),
"(A, I5, A)")
"A kernel can use only less threads per block (", workgroup_size,
")", &
1422 "than available on the device (",
accel%max_workgroup_size,
")"
1423 call messages_info(1)
1427 workgroup_size = 256
1429 workgroup_size = min(workgroup_size, max_workgroup_size)
1438 type(cl_program),
intent(inout) :: prog
1439 character(len=*),
intent(in) :: filename
1440 character(len=*),
optional,
intent(in) :: flags
1442 character(len = 1000) :: string
1443 character(len = 256) :: share_string
1444 integer :: ierr, ierrlog, iunit, irec, newlen
1448 string =
'#include "'//trim(filename)//
'"'
1450 call messages_write(
"Building CL program '"//trim(filename)//
"'.")
1451 call messages_info(debug_only=.
true.)
1453 prog = clcreateprogramwithsource(
accel%context%cl_context, trim(string), ierr)
1459 string=trim(string)//
' -cl-denorms-are-zero'
1462 string=trim(string)//
' -cl-mad-enable'
1463 string=trim(string)//
' -cl-unsafe-math-optimizations'
1464 string=trim(string)//
' -cl-finite-math-only'
1465 string=trim(string)//
' -cl-fast-relaxed-math'
1467 share_string=
'-I'//trim(conf%share)//
'/opencl/'
1470 string = trim(string)//
' -DEXT_KHR_FP64'
1472 string = trim(string)//
' -DEXT_AMD_FP64'
1474 call messages_write(
'Octopus requires an OpenCL device with double-precision support.')
1475 call messages_fatal()
1479 string = trim(string)//
' -DSHARED_MEM'
1482 if (
present(flags))
then
1483 string = trim(string)//
' '//trim(flags)
1486 call messages_write(
"Debug info: compilation flags '"//trim(string), new_line = .
true.)
1487 call messages_write(
' '//trim(share_string)//
"'.")
1488 call messages_info(debug_only=.
true.)
1490 string = trim(string)//
' '//trim(share_string)
1492 call clbuildprogram(prog, trim(string), ierr)
1494 if(ierr /= cl_success)
then
1495 call clgetprogrambuildinfo(prog,
accel%device%cl_device, cl_program_build_log, string, ierrlog)
1499 newlen = scan(string, achar(010), back = .
true.) - 1
1500 if (newlen >= 0) string = string(1:newlen)
1502 if (len(trim(string)) > 0)
write(stderr,
'(a)') trim(string)
1514 type(cl_program),
intent(inout) :: prog
1520 call clreleaseprogram(prog, ierr)
1531 type(cl_kernel),
intent(inout) :: prog
1538 call clreleasekernel(prog, ierr)
1549 type(cl_kernel),
intent(inout) :: kernel
1550 type(cl_program),
intent(inout) :: prog
1551 character(len=*),
intent(in) :: name
1556 call profiling_in(
"CL_BUILD_KERNEL", exclude = .
true.)
1559 kernel = clcreatekernel(prog, name, ierr)
1563 call profiling_out(
"CL_BUILD_KERNEL")
1571 integer,
intent(in) :: ierr
1572 character(len=*),
intent(in) :: name
1574 character(len=40) :: errcode
1579 case (cl_success); errcode =
'CL_SUCCESS '
1580 case (cl_device_not_found); errcode =
'CL_DEVICE_NOT_FOUND '
1581 case (cl_device_not_available); errcode =
'CL_DEVICE_NOT_AVAILABLE '
1582 case (cl_compiler_not_available); errcode =
'CL_COMPILER_NOT_AVAILABLE '
1583 case (cl_mem_object_allocation_failure); errcode =
'CL_MEM_OBJECT_ALLOCATION_FAILURE '
1584 case (cl_out_of_resources); errcode =
'CL_OUT_OF_RESOURCES '
1585 case (cl_out_of_host_memory); errcode =
'CL_OUT_OF_HOST_MEMORY '
1586 case (cl_profiling_info_not_available); errcode =
'CL_PROFILING_INFO_NOT_AVAILABLE '
1587 case (cl_mem_copy_overlap); errcode =
'CL_MEM_COPY_OVERLAP '
1588 case (cl_image_format_mismatch); errcode =
'CL_IMAGE_FORMAT_MISMATCH '
1589 case (cl_image_format_not_supported); errcode =
'CL_IMAGE_FORMAT_NOT_SUPPORTED '
1590 case (cl_build_program_failure); errcode =
'CL_BUILD_PROGRAM_FAILURE '
1591 case (cl_map_failure); errcode =
'CL_MAP_FAILURE '
1592 case (cl_invalid_value); errcode =
'CL_INVALID_VALUE '
1593 case (cl_invalid_device_type); errcode =
'CL_INVALID_DEVICE_TYPE '
1594 case (cl_invalid_platform); errcode =
'CL_INVALID_PLATFORM '
1595 case (cl_invalid_device); errcode =
'CL_INVALID_DEVICE '
1596 case (cl_invalid_context); errcode =
'CL_INVALID_CONTEXT '
1597 case (cl_invalid_queue_properties); errcode =
'CL_INVALID_QUEUE_PROPERTIES '
1598 case (cl_invalid_command_queue); errcode =
'CL_INVALID_COMMAND_QUEUE '
1599 case (cl_invalid_host_ptr); errcode =
'CL_INVALID_HOST_PTR '
1600 case (cl_invalid_mem_object); errcode =
'CL_INVALID_MEM_OBJECT '
1601 case (cl_invalid_image_format_descriptor); errcode =
'CL_INVALID_IMAGE_FORMAT_DESCRIPTOR '
1602 case (cl_invalid_image_size); errcode =
'CL_INVALID_IMAGE_SIZE '
1603 case (cl_invalid_sampler); errcode =
'CL_INVALID_SAMPLER '
1604 case (cl_invalid_binary); errcode =
'CL_INVALID_BINARY '
1605 case (cl_invalid_build_options); errcode =
'CL_INVALID_BUILD_OPTIONS '
1606 case (cl_invalid_program); errcode =
'CL_INVALID_PROGRAM '
1607 case (cl_invalid_program_executable); errcode =
'CL_INVALID_PROGRAM_EXECUTABLE '
1608 case (cl_invalid_kernel_name); errcode =
'CL_INVALID_KERNEL_NAME '
1609 case (cl_invalid_kernel_definition); errcode =
'CL_INVALID_KERNEL_DEFINITION '
1610 case (cl_invalid_kernel); errcode =
'CL_INVALID_KERNEL '
1611 case (cl_invalid_arg_index); errcode =
'CL_INVALID_ARG_INDEX '
1612 case (cl_invalid_arg_value); errcode =
'CL_INVALID_ARG_VALUE '
1613 case (cl_invalid_arg_size); errcode =
'CL_INVALID_ARG_SIZE '
1614 case (cl_invalid_kernel_args); errcode =
'CL_INVALID_KERNEL_ARGS '
1615 case (cl_invalid_work_dimension); errcode =
'CL_INVALID_WORK_DIMENSION '
1616 case (cl_invalid_work_group_size); errcode =
'CL_INVALID_WORK_GROUP_SIZE '
1617 case (cl_invalid_work_item_size); errcode =
'CL_INVALID_WORK_ITEM_SIZE '
1618 case (cl_invalid_global_offset); errcode =
'CL_INVALID_GLOBAL_OFFSET '
1619 case (cl_invalid_event_wait_list); errcode =
'CL_INVALID_EVENT_WAIT_LIST '
1620 case (cl_invalid_event); errcode =
'CL_INVALID_EVENT '
1621 case (cl_invalid_operation); errcode =
'CL_INVALID_OPERATION '
1622 case (cl_invalid_gl_object); errcode =
'CL_INVALID_GL_OBJECT '
1623 case (cl_invalid_buffer_size); errcode =
'CL_INVALID_BUFFER_SIZE '
1624 case (cl_invalid_mip_level); errcode =
'CL_INVALID_MIP_LEVEL '
1625 case (cl_invalid_global_work_size); errcode =
'CL_INVALID_GLOBAL_WORK_SIZE '
1626 case (cl_platform_not_found_khr); errcode =
'CL_PLATFORM_NOT_FOUND_KHR'
1628 write(errcode,
'(i10)') ierr
1629 errcode =
'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//
')'
1632 message(1) =
'OpenCL '//trim(name)//
' '//trim(errcode)
1633 call messages_fatal(1)
1642 integer,
intent(in) :: ierr
1643 character(len=*),
intent(in) :: name
1645 character(len=40) :: errcode
1648#if defined(HAVE_CLBLAS) || defined(HAVE_CLBLAST)
1650 case (clblassuccess); errcode =
'clblasSuccess'
1651 case (clblasinvalidvalue); errcode =
'clblasInvalidValue'
1652 case (clblasinvalidcommandqueue); errcode =
'clblasInvalidCommandQueue'
1653 case (clblasinvalidcontext); errcode =
'clblasInvalidContext'
1654 case (clblasinvalidmemobject); errcode =
'clblasInvalidMemObject'
1655 case (clblasinvaliddevice); errcode =
'clblasInvalidDevice'
1656 case (clblasinvalideventwaitlist); errcode =
'clblasInvalidEventWaitList'
1657 case (clblasoutofresources); errcode =
'clblasOutOfResources'
1658 case (clblasoutofhostmemory); errcode =
'clblasOutOfHostMemory'
1659 case (clblasinvalidoperation); errcode =
'clblasInvalidOperation'
1660 case (clblascompilernotavailable); errcode =
'clblasCompilerNotAvailable'
1661 case (clblasbuildprogramfailure); errcode =
'clblasBuildProgramFailure'
1662 case (clblasnotimplemented); errcode =
'clblasNotImplemented'
1663 case (clblasnotinitialized); errcode =
'clblasNotInitialized'
1664 case (clblasinvalidmata); errcode =
'clblasInvalidMatA'
1665 case (clblasinvalidmatb); errcode =
'clblasInvalidMatB'
1666 case (clblasinvalidmatc); errcode =
'clblasInvalidMatC'
1667 case (clblasinvalidvecx); errcode =
'clblasInvalidVecX'
1668 case (clblasinvalidvecy); errcode =
'clblasInvalidVecY'
1669 case (clblasinvaliddim); errcode =
'clblasInvalidDim'
1670 case (clblasinvalidleaddima); errcode =
'clblasInvalidLeadDimA'
1671 case (clblasinvalidleaddimb); errcode =
'clblasInvalidLeadDimB'
1672 case (clblasinvalidleaddimc); errcode =
'clblasInvalidLeadDimC'
1673 case (clblasinvalidincx); errcode =
'clblasInvalidIncX'
1674 case (clblasinvalidincy); errcode =
'clblasInvalidIncY'
1675 case (clblasinsufficientmemmata); errcode =
'clblasInsufficientMemMatA'
1676 case (clblasinsufficientmemmatb); errcode =
'clblasInsufficientMemMatB'
1677 case (clblasinsufficientmemmatc); errcode =
'clblasInsufficientMemMatC'
1678 case (clblasinsufficientmemvecx); errcode =
'clblasInsufficientMemVecX'
1679 case (clblasinsufficientmemvecy); errcode =
'clblasInsufficientMemVecY'
1681 case (clblastinsufficientmemorytemp); errcode =
'clblastInsufficientMemoryTemp'
1682 case (clblastinvalidbatchcount); errcode =
'clblastInvalidBatchCount'
1683 case (clblastinvalidoverridekernel); errcode =
'clblastInvalidOverrideKernel'
1684 case (clblastmissingoverrideparameter); errcode =
'clblastMissingOverrideParameter'
1685 case (clblastinvalidlocalmemusage); errcode =
'clblastInvalidLocalMemUsage'
1686 case (clblastnohalfprecision); errcode =
'clblastNoHalfPrecision'
1687 case (clblastnodoubleprecision); errcode =
'clblastNoDoublePrecision'
1688 case (clblastinvalidvectorscalar); errcode =
'clblastInvalidVectorScalar'
1689 case (clblastinsufficientmemoryscalar); errcode =
'clblastInsufficientMemoryScalar'
1690 case (clblastdatabaseerror); errcode =
'clblastDatabaseError'
1691 case (clblastunknownerror); errcode =
'clblastUnknownError'
1692 case (clblastunexpectederror); errcode =
'clblastUnexpectedError'
1696 write(errcode,
'(i10)') ierr
1697 errcode =
'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//
')'
1701 message(1) =
'Error in calling clblas routine '//trim(name)//
' : '//trim(errcode)
1702 call messages_fatal(1)
1709 integer,
intent(in) :: ierr
1710 character(len=*),
intent(in) :: name
1712 character(len=40) :: errcode
1717 case (clfft_invalid_global_work_size); errcode =
'CLFFT_INVALID_GLOBAL_WORK_SIZE'
1718 case (clfft_invalid_mip_level); errcode =
'CLFFT_INVALID_MIP_LEVEL'
1719 case (clfft_invalid_buffer_size); errcode =
'CLFFT_INVALID_BUFFER_SIZE'
1720 case (clfft_invalid_gl_object); errcode =
'CLFFT_INVALID_GL_OBJECT'
1721 case (clfft_invalid_operation); errcode =
'CLFFT_INVALID_OPERATION'
1722 case (clfft_invalid_event); errcode =
'CLFFT_INVALID_EVENT'
1723 case (clfft_invalid_event_wait_list); errcode =
'CLFFT_INVALID_EVENT_WAIT_LIST'
1724 case (clfft_invalid_global_offset); errcode =
'CLFFT_INVALID_GLOBAL_OFFSET'
1725 case (clfft_invalid_work_item_size); errcode =
'CLFFT_INVALID_WORK_ITEM_SIZE'
1726 case (clfft_invalid_work_group_size); errcode =
'CLFFT_INVALID_WORK_GROUP_SIZE'
1727 case (clfft_invalid_work_dimension); errcode =
'CLFFT_INVALID_WORK_DIMENSION'
1728 case (clfft_invalid_kernel_args); errcode =
'CLFFT_INVALID_KERNEL_ARGS'
1729 case (clfft_invalid_arg_size); errcode =
'CLFFT_INVALID_ARG_SIZE'
1730 case (clfft_invalid_arg_value); errcode =
'CLFFT_INVALID_ARG_VALUE'
1731 case (clfft_invalid_arg_index); errcode =
'CLFFT_INVALID_ARG_INDEX'
1732 case (clfft_invalid_kernel); errcode =
'CLFFT_INVALID_KERNEL'
1733 case (clfft_invalid_kernel_definition); errcode =
'CLFFT_INVALID_KERNEL_DEFINITION'
1734 case (clfft_invalid_kernel_name); errcode =
'CLFFT_INVALID_KERNEL_NAME'
1735 case (clfft_invalid_program_executable); errcode =
'CLFFT_INVALID_PROGRAM_EXECUTABLE'
1736 case (clfft_invalid_program); errcode =
'CLFFT_INVALID_PROGRAM'
1737 case (clfft_invalid_build_options); errcode =
'CLFFT_INVALID_BUILD_OPTIONS'
1738 case (clfft_invalid_binary); errcode =
'CLFFT_INVALID_BINARY'
1739 case (clfft_invalid_sampler); errcode =
'CLFFT_INVALID_SAMPLER'
1740 case (clfft_invalid_image_size); errcode =
'CLFFT_INVALID_IMAGE_SIZE'
1741 case (clfft_invalid_image_format_descriptor); errcode =
'CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR'
1742 case (clfft_invalid_mem_object); errcode =
'CLFFT_INVALID_MEM_OBJECT'
1743 case (clfft_invalid_host_ptr); errcode =
'CLFFT_INVALID_HOST_PTR'
1744 case (clfft_invalid_command_queue); errcode =
'CLFFT_INVALID_COMMAND_QUEUE'
1745 case (clfft_invalid_queue_properties); errcode =
'CLFFT_INVALID_QUEUE_PROPERTIES'
1746 case (clfft_invalid_context); errcode =
'CLFFT_INVALID_CONTEXT'
1747 case (clfft_invalid_device); errcode =
'CLFFT_INVALID_DEVICE'
1748 case (clfft_invalid_platform); errcode =
'CLFFT_INVALID_PLATFORM'
1749 case (clfft_invalid_device_type); errcode =
'CLFFT_INVALID_DEVICE_TYPE'
1750 case (clfft_invalid_value); errcode =
'CLFFT_INVALID_VALUE'
1751 case (clfft_map_failure); errcode =
'CLFFT_MAP_FAILURE'
1752 case (clfft_build_program_failure); errcode =
'CLFFT_BUILD_PROGRAM_FAILURE'
1753 case (clfft_image_format_not_supported); errcode =
'CLFFT_IMAGE_FORMAT_NOT_SUPPORTED'
1754 case (clfft_image_format_mismatch); errcode =
'CLFFT_IMAGE_FORMAT_MISMATCH'
1755 case (clfft_mem_copy_overlap); errcode =
'CLFFT_MEM_COPY_OVERLAP'
1756 case (clfft_profiling_info_not_available); errcode =
'CLFFT_PROFILING_INFO_NOT_AVAILABLE'
1757 case (clfft_out_of_host_memory); errcode =
'CLFFT_OUT_OF_HOST_MEMORY'
1758 case (clfft_out_of_resources); errcode =
'CLFFT_OUT_OF_RESOURCES'
1759 case (clfft_mem_object_allocation_failure); errcode =
'CLFFT_MEM_OBJECT_ALLOCATION_FAILURE'
1760 case (clfft_compiler_not_available); errcode =
'CLFFT_COMPILER_NOT_AVAILABLE'
1761 case (clfft_device_not_available); errcode =
'CLFFT_DEVICE_NOT_AVAILABLE'
1762 case (clfft_device_not_found); errcode =
'CLFFT_DEVICE_NOT_FOUND'
1763 case (clfft_success); errcode =
'CLFFT_SUCCESS'
1764 case (clfft_bugcheck); errcode =
'CLFFT_BUGCHECK'
1765 case (clfft_notimplemented); errcode =
'CLFFT_NOTIMPLEMENTED'
1766 case (clfft_file_not_found); errcode =
'CLFFT_FILE_NOT_FOUND'
1767 case (clfft_file_create_failure); errcode =
'CLFFT_FILE_CREATE_FAILURE'
1768 case (clfft_version_mismatch); errcode =
'CLFFT_VERSION_MISMATCH'
1769 case (clfft_invalid_plan); errcode =
'CLFFT_INVALID_PLAN'
1770 case (clfft_device_no_double); errcode =
'CLFFT_DEVICE_NO_DOUBLE'
1771 case (clfft_endstatus); errcode =
'CLFFT_ENDSTATUS'
1773 write(errcode,
'(i10)') ierr
1774 errcode =
'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//
')'
1778 message(1) =
'clfft '//trim(name)//
' '//trim(errcode)
1779 call messages_fatal(1)
1788 type(cl_device_id),
intent(inout) :: device
1789 character(len=*),
intent(in) :: extension
1792 character(len=2048) :: all_extensions
1795 call clgetdeviceinfo(device, cl_device_extensions, all_extensions,
cl_status)
1798 has = index(all_extensions, extension) /= 0
1805 integer(int64) pure function
opencl_pad(size, blk) result(pad)
1806 integer(int64),
intent(in) :: size
1807 integer,
intent(in) :: blk
1809 integer(int64) :: mm
1815 pad =
size + blk - mm
1823 type(type_t),
intent(in) :: type
1824 integer(int64),
intent(in) :: nval
1825 integer(int64),
optional,
intent(in) :: offset
1826 logical,
optional,
intent(in) :: async
1829 integer(int64) :: nval_real, offset_real
1836 if (
type == type_cmplx) nval_real = nval_real * 2
1837 if (
present(offset))
then
1838 offset_real = offset
1839 if (
type == type_cmplx) offset_real = offset_real * 2
1841 offset_real = 0_int64
1844 assert(nval_real > 0)
1854 if(.not. optional_default(async, .false.))
call accel_finish()
1865 type(type_t),
intent(in) :: type
1866 integer(int32),
intent(in) :: nval
1867 integer(int32),
optional,
intent(in) :: offset
1868 logical,
optional,
intent(in) :: async
1872 if (
present(offset))
then
1885 integer,
parameter :: times = 10
1887 real(real64) :: time, stime
1888 real(real64) :: read_bw, write_bw
1890 real(real64),
allocatable :: data(:)
1892 call messages_new_line()
1893 call messages_write(
'Info: Benchmarking the bandwidth between main memory and device memory')
1894 call messages_new_line()
1895 call messages_info()
1897 call messages_write(
' Buffer size Read bw Write bw')
1898 call messages_new_line()
1899 call messages_write(
' [MiB] [MiB/s] [MiB/s]')
1900 call messages_info()
1904 safe_allocate(
data(1:size))
1907 stime = loct_clock()
1912 time = (loct_clock() - stime)/real(times, real64)
1914 write_bw = real(
size, real64) *8.0_real64/time
1916 stime = loct_clock()
1922 time = (loct_clock() - stime)/real(times, real64)
1923 read_bw = real(
size, real64) *8.0_real64/time
1925 call messages_write(size*8.0_real64/1024.0_real64**2)
1926 call messages_write(write_bw/1024.0_real64**2, fmt =
'(f10.1)')
1927 call messages_write(read_bw/1024.0_real64**2, fmt =
'(f10.1)')
1928 call messages_info()
1932 safe_deallocate_a(data)
1934 size = int(size*2.0)
1936 if (
size > 50000000)
exit
1942 logical pure function accel_use_shared_mem() result(use_shared_mem)
1944 use_shared_mem =
accel%shared_mem
1956 call cuda_module_map_init(
accel%module_map)
1969 if (.not.
associated(
head))
exit
1970 next_head =>
head%next
1976 call cuda_module_map_end(
accel%module_map)
1986 character(len=*),
intent(in) :: file_name
1987 character(len=*),
intent(in) :: kernel_name
1988 character(len=*),
optional,
intent(in) :: flags
1991 type(cl_program) :: prog
1994 character(len=1000) :: all_flags
1999 call profiling_in(
"ACCEL_COMPILE", exclude = .
true.)
2002 all_flags =
'-I'//trim(conf%share)//
'/opencl/'//
" "//trim(
accel%debug_flag)
2005 all_flags = trim(all_flags)//
' -DSHARED_MEM'
2008 if (
present(flags))
then
2009 all_flags = trim(all_flags)//
' '//trim(flags)
2012 call cuda_build_program(
accel%module_map, this%cuda_module,
accel%device%cuda_device, trim(file_name), trim(all_flags))
2014 call cuda_create_kernel(this%cuda_kernel, this%cuda_module, trim(kernel_name))
2015 call cuda_alloc_arg_array(this%arguments)
2017 this%cuda_shared_mem = 0
2026 this%initialized = .
true.
2027 this%kernel_name = trim(kernel_name)
2029 call profiling_out(
"ACCEL_COMPILE")
2045 call cuda_free_arg_array(this%arguments)
2046 call cuda_release_kernel(this%cuda_kernel)
2051 call clreleasekernel(this%kernel, ierr)
2054 this%initialized = .false.
2063 character(len=*),
intent(in) :: file_name
2064 character(len=*),
intent(in) :: kernel_name
2065 character(len=*),
optional,
intent(in) :: flags
2069 if (.not. this%initialized)
then
2082 size =
accel%global_memory_size
2090 size =
accel%local_memory_size
2097 integer,
intent(in) :: dim
2103 if (dim == 1)
size = 2**30
2107 if (dim == 1)
size = 2**30
2114 integer,
intent(in) :: stream_number
2120 call cuda_set_stream(
accel%cuda_stream, stream_number)
2121 call cublas_set_stream(
accel%cublas_handle,
accel%cuda_stream)
2131 integer,
intent(inout) :: stream_number
2137 call cuda_get_stream(stream_number)
2151 call cuda_synchronize_all_streams()
2159 type(c_ptr),
intent(in) :: buffer
2160 integer(int64),
intent(in) :: offset
2161 type(c_ptr) :: buffer_offset
2165 call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)
2168 buffer_offset = buffer
2174 type(c_ptr),
intent(in) :: buffer
2175 integer(int64),
intent(in) :: offset
2176 type(c_ptr) :: buffer_offset
2180 call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)
2183 buffer_offset = buffer
2189 type(c_ptr),
intent(in) :: buffer
2193 call cuda_clean_pointer(buffer)
2202 integer(int64),
intent(in) :: size
2203 integer(int64),
intent(out) :: grid_size
2204 integer(int64),
intent(out) :: thread_block_size
2207#ifdef __HIP_PLATFORM_AMD__
2210 thread_block_size =
size
2212 grid_size =
size *
accel%warp_size
2213 thread_block_size =
accel%warp_size
2220#include "accel_inc.F90"
2223#include "complex.F90"
2224#include "accel_inc.F90"
2227#include "integer.F90"
2228#include "accel_inc.F90"
2231#include "integer8.F90"
2232#include "accel_inc.F90"
subroutine select_device(idevice)
subroutine laccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
integer, parameter opencl_accelerator
type(accel_kernel_t), target, save, public kernel_density_real
subroutine zaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
integer, parameter opencl_default
type(accel_kernel_t), target, save, public kernel_vpsi_complex
type(accel_kernel_t), target, save, public dkernel_batch_axpy
subroutine, public accel_clean_pointer(buffer)
subroutine accel_kernel_global_end()
subroutine, public accel_get_unfolded_size(size, grid_size, thread_block_size)
Get unfolded size: some kernels (e.g. projectors) unfold the array across warps as an optimization....
pure logical function, public accel_allow_cpu_only()
subroutine iaccel_read_buffer_3(this, size, data, offset, async)
subroutine daccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine accel_create_buffer_8(this, flags, type, size, set_zero)
subroutine laccel_read_buffer_2(this, size, data, offset, async)
logical pure function, public accel_use_shared_mem()
subroutine laccel_read_buffer_3_int32(this, size, data, offset, async)
subroutine zaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine daccel_write_buffer_0_int32(this, size, data, offset, async)
type(accel_kernel_t), target, save, public dkernel_batch_dotp
subroutine laccel_write_buffer_2(this, size, data, offset, async)
subroutine zaccel_write_buffer_3_int32(this, size, data, offset, async)
type(accel_kernel_t), target, save, public kernel_vpsi_spinors
subroutine laccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine daccel_write_buffer_0(this, size, data, offset, async)
subroutine zaccel_write_buffer_single(this, data, async)
type(accel_kernel_t), target, save, public kernel_ghost_reorder
subroutine iaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine laccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
type(accel_kernel_t), target, save, public zkernel_batch_axpy
integer, parameter cl_plat_nvidia
subroutine, public accel_kernel_start_call(this, file_name, kernel_name, flags)
subroutine iaccel_write_buffer_2_int32(this, size, data, offset, async)
subroutine daccel_read_buffer_3(this, size, data, offset, async)
integer, parameter cl_plat_ati
subroutine, public accel_get_stream(stream_number)
integer(int64) pure function, public accel_global_memory_size()
subroutine daccel_read_buffer_1_int32(this, size, data, offset, async)
subroutine iaccel_read_buffer_1(this, size, data, offset, async)
subroutine daccel_write_buffer_3_int32(this, size, data, offset, async)
subroutine zaccel_write_buffer_2(this, size, data, offset, async)
subroutine zaccel_write_buffer_1_int32(this, size, data, offset, async)
type(accel_kernel_t), target, save, public zkernel_ax_function_py
subroutine zaccel_set_kernel_arg_data(kernel, narg, data)
subroutine daccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine laccel_read_buffer_0_int32(this, size, data, offset, async)
subroutine accel_set_kernel_arg_local(kernel, narg, type, size)
subroutine daccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
subroutine zaccel_write_buffer_0_int32(this, size, data, offset, async)
integer(int64) function accel_padded_size_i8(nn)
subroutine iaccel_read_buffer_2_int32(this, size, data, offset, async)
subroutine iaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
subroutine, public accel_finish()
subroutine opencl_check_bandwidth()
subroutine accel_kernel_global_init()
subroutine daccel_write_buffer_1(this, size, data, offset, async)
type(accel_kernel_t), target, save, public kernel_daxpy
subroutine laccel_read_buffer_0(this, size, data, offset, async)
subroutine opencl_release_program(prog)
type(accel_kernel_t), save set_zero
subroutine zaccel_read_buffer_3(this, size, data, offset, async)
subroutine iaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
subroutine laccel_write_buffer_3(this, size, data, offset, async)
type(accel_kernel_t), target, save, public zzmul
integer, parameter cl_plat_invalid
type(accel_kernel_t), target, save, public kernel_density_spinors
subroutine laccel_set_kernel_arg_data(kernel, narg, data)
subroutine daccel_read_buffer_2_int32(this, size, data, offset, async)
subroutine laccel_write_buffer_single(this, data, async)
subroutine daccel_write_buffer_2(this, size, data, offset, async)
subroutine daccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
integer(int64) pure function opencl_pad(size, blk)
subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
subroutine zaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
logical pure function, public accel_buffer_is_allocated(this)
integer, parameter, public accel_mem_read_write
subroutine zaccel_read_buffer_0_int32(this, size, data, offset, async)
subroutine, public clfft_print_error(ierr, name)
subroutine accel_kernel_end(this)
subroutine iaccel_write_buffer_0_int32(this, size, data, offset, async)
integer, parameter opencl_gpu
type(accel_kernel_t), target, save, public dkernel_ax_function_py
subroutine iaccel_read_buffer_2(this, size, data, offset, async)
subroutine opencl_release_kernel(prog)
subroutine zaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
subroutine zaccel_write_buffer_0(this, size, data, offset, async)
subroutine zaccel_read_buffer_1_int32(this, size, data, offset, async)
type(c_ptr) function, public daccel_get_pointer_with_offset(buffer, offset)
subroutine iaccel_write_buffer_single(this, data, async)
subroutine iaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
integer pure function, public accel_max_size_per_dim(dim)
type(accel_kernel_t), target, save, public dzmul
subroutine iaccel_write_buffer_3_int32(this, size, data, offset, async)
subroutine laccel_write_buffer_1(this, size, data, offset, async)
subroutine daccel_read_buffer_1(this, size, data, offset, async)
subroutine, public accel_release_buffer(this)
type(accel_kernel_t), target, save, public kernel_phase_spiral
subroutine laccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
subroutine iaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
subroutine zaccel_write_buffer_1(this, size, data, offset, async)
subroutine daccel_set_kernel_arg_data(kernel, narg, data)
subroutine zaccel_read_buffer_2_int32(this, size, data, offset, async)
subroutine accel_kernel_run_8(kernel, globalsizes, localsizes)
subroutine laccel_write_buffer_1_int32(this, size, data, offset, async)
subroutine laccel_read_buffer_3(this, size, data, offset, async)
type(accel_kernel_t), target, save, public kernel_vpsi_spinors_complex
subroutine opencl_build_program(prog, filename, flags)
subroutine laccel_write_buffer_2_int32(this, size, data, offset, async)
subroutine, public accel_kernel_build(this, file_name, kernel_name, flags)
subroutine iaccel_write_buffer_2(this, size, data, offset, async)
subroutine, public accel_init(base_grp, namespace)
subroutine, public accel_end(namespace)
subroutine iaccel_read_buffer_0(this, size, data, offset, async)
subroutine opencl_create_kernel(kernel, prog, name)
subroutine, public accel_synchronize_all_streams()
subroutine iaccel_read_buffer_1_int32(this, size, data, offset, async)
subroutine, public accel_set_stream(stream_number)
type(accel_kernel_t), target, save, public zunpack
type(accel_kernel_t), target, save, public kernel_phase
subroutine laccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
subroutine zaccel_read_buffer_3_int32(this, size, data, offset, async)
subroutine laccel_read_buffer_2_int32(this, size, data, offset, async)
type(accel_kernel_t), target, save, public kernel_zaxpy
subroutine laccel_read_buffer_1_int32(this, size, data, offset, async)
integer, parameter cl_plat_amd
subroutine zaccel_read_buffer_2(this, size, data, offset, async)
integer(int32) function accel_padded_size_i4(nn)
subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
subroutine daccel_write_buffer_2_int32(this, size, data, offset, async)
subroutine iaccel_write_buffer_1_int32(this, size, data, offset, async)
type(accel_kernel_t), target, save, public zkernel_batch_dotp
subroutine iaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
subroutine daccel_write_buffer_1_int32(this, size, data, offset, async)
subroutine iaccel_write_buffer_0(this, size, data, offset, async)
pure logical function, public accel_is_enabled()
subroutine zaccel_read_buffer_0(this, size, data, offset, async)
integer, parameter cl_plat_intel
integer, parameter, public accel_mem_write_only
subroutine daccel_read_buffer_3_int32(this, size, data, offset, async)
subroutine daccel_read_buffer_0_int32(this, size, data, offset, async)
subroutine laccel_write_buffer_0_int32(this, size, data, offset, async)
subroutine zaccel_read_buffer_1(this, size, data, offset, async)
type(accel_kernel_t), target, save, public kernel_vpsi
logical function f90_cl_device_has_extension(device, extension)
subroutine opencl_print_error(ierr, name)
subroutine daccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
subroutine accel_kernel_run_4(kernel, globalsizes, localsizes)
subroutine zaccel_write_buffer_2_int32(this, size, data, offset, async)
subroutine iaccel_write_buffer_1(this, size, data, offset, async)
subroutine laccel_write_buffer_0(this, size, data, offset, async)
type(accel_kernel_t), target, save, public kernel_density_complex
subroutine zaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
type(c_ptr) function, public zaccel_get_pointer_with_offset(buffer, offset)
subroutine daccel_write_buffer_single(this, data, async)
subroutine daccel_read_buffer_0(this, size, data, offset, async)
integer function, public accel_kernel_workgroup_size(kernel)
subroutine laccel_read_buffer_1(this, size, data, offset, async)
integer, parameter opencl_cpu
subroutine zaccel_write_buffer_3(this, size, data, offset, async)
integer function get_platform_id(platform_name)
subroutine, public clblas_print_error(ierr, name)
type(accel_t), public accel
subroutine laccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
subroutine accel_create_buffer_4(this, flags, type, size, set_zero)
subroutine laccel_write_buffer_3_int32(this, size, data, offset, async)
subroutine daccel_read_buffer_2(this, size, data, offset, async)
subroutine iaccel_set_kernel_arg_data(kernel, narg, data)
subroutine daccel_write_buffer_3(this, size, data, offset, async)
integer, public cl_status
subroutine zaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
subroutine iaccel_read_buffer_0_int32(this, size, data, offset, async)
type(accel_kernel_t), target, save, public dunpack
subroutine iaccel_write_buffer_3(this, size, data, offset, async)
integer(int64) pure function, public accel_local_memory_size()
subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
integer pure function, public accel_max_workgroup_size()
subroutine daccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
type(accel_kernel_t), pointer head
subroutine iaccel_read_buffer_3_int32(this, size, data, offset, async)
subroutine, public alloc_cache_put(alloc_cache, size, loc, put)
subroutine, public alloc_cache_get(alloc_cache, size, found, loc)
integer(int64), parameter, public alloc_cache_any_size
integer, parameter, public clblassuccess
real(real64), parameter, public m_zero
subroutine, public messages_print_with_emphasis(msg, iunit, namespace)
character(len=512), private msg
subroutine, public messages_warning(no_lines, all_nodes, namespace)
subroutine, public messages_obsolete_variable(namespace, name, rep)
subroutine, public messages_new_line()
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
subroutine, public messages_info(no_lines, iunit, debug_only, stress, all_nodes, namespace)
logical function mpi_grp_is_root(grp)
Is the current MPI process of grpcomm, root.
subroutine, public profiling_out(label)
Increment out counter and sum up difference between entry and exit time.
subroutine, public profiling_in(label, exclude)
Increment in counter and save entry time.
integer pure function, public types_get_size(this)
This module defines the unit system, used for input and output.
type(unit_t), public unit_gigabytes
For larger amounts of data (natural code units are bytes)
type(unit_t), public unit_megabytes
For large amounts of data (natural code units are bytes)
type(unit_t), public unit_kilobytes
For small amounts of data (natural code units are bytes)