Octopus — accel.F90 (generated documentation listing of this source file; the numbers embedded at the start of the lines below are listing artifacts, not part of the code).
1!! Copyright (C) 2010-2016 X. Andrade
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17!!
18
19#include "global.h"
20
21#if defined(HAVE_OPENCL) && defined(HAVE_CUDA)
22#error "Cannot compile with OpenCL and Cuda support at the same time"
23#endif
24
25#if defined(HAVE_OPENCL) || defined(HAVE_CUDA)
26#define HAVE_ACCEL 1
27#endif
28
29module accel_oct_m
31#ifdef HAVE_OPENCL
32 use cl
33#endif
34#if defined(HAVE_CLBLAS) || defined(HAVE_CLBLAST)
35 use clblas_oct_m
36#endif
37 use cuda_oct_m
38#ifdef HAVE_CLFFT
39 use clfft
40#endif
41 use debug_oct_m
42 use global_oct_m
43 use iso_c_binding, only: c_size_t
44 use, intrinsic :: iso_fortran_env
45 use loct_oct_m
46 use math_oct_m
48 use mpi_oct_m
50 use types_oct_m
51 use parser_oct_m
55
56 implicit none
57
58 private
59
60 public :: &
65 accel_t, &
68 accel_init, &
69 accel_end, &
101
102#ifdef HAVE_OPENCL
103 integer, public, parameter :: &
104 ACCEL_MEM_READ_ONLY = cl_mem_read_only, &
105 accel_mem_read_write = cl_mem_read_write, &
106 accel_mem_write_only = cl_mem_write_only
107#else
108 integer, public, parameter :: &
109 ACCEL_MEM_READ_ONLY = 0, &
112#endif
113
!> Opaque handle to the device context: a cl_context in OpenCL builds,
!! a CUDA context pointer in CUDA builds, or a dummy integer placeholder
!! when compiled without accelerator support.
114 type accel_context_t
115 ! Components are public by default
116#ifdef HAVE_OPENCL
117 type(cl_context) :: cl_context
118#elif defined(HAVE_CUDA)
119 type(c_ptr) :: cuda_context
120#else
121 integer :: dummy
122#endif
123 end type accel_context_t
124
!> Opaque handle to the compute device: a cl_device_id in OpenCL builds,
!! a CUDA device pointer in CUDA builds, or a dummy integer placeholder
!! when compiled without accelerator support.
125 type accel_device_t
126 ! Components are public by default
127#ifdef HAVE_OPENCL
128 type(cl_device_id) :: cl_device
129#elif defined(HAVE_CUDA)
130 type(c_ptr) :: cuda_device
131#else
132 integer :: dummy
133#endif
134 end type accel_device_t
135
!> Global state of the accelerator subsystem (one instance: the public
!! module variable `accel`). Filled in by accel_init.
136 type accel_t
137 ! Components are public by default
138 type(accel_context_t) :: context
139 type(accel_device_t) :: device
140#ifdef HAVE_OPENCL
141 type(cl_command_queue) :: command_queue
142#endif
! The three c_ptr handles below are used by the CUDA path (cuBLAS handle,
! CUDA stream, and a map of loaded modules).
143 type(c_ptr) :: cublas_handle
144 type(c_ptr) :: cuda_stream
145 type(c_ptr) :: module_map
! Device limits, queried in accel_init (CL_DEVICE_MAX_WORK_GROUP_SIZE /
! cuda_device_max_threads_per_block and friends).
146 integer :: max_workgroup_size
147 integer(int64) :: local_memory_size
148 integer(int64) :: global_memory_size
! enabled:        result of the DisableAccel input variable.
! allow_CPU_only: result of the AllowCPUonly input variable.
! shared_mem:     whether the device exposes usable shared/local memory.
! cuda_mpi:       result of the GPUAwareMPI input variable.
149 logical :: enabled
150 logical :: allow_CPU_only
151 logical :: shared_mem
152 logical :: cuda_mpi
! warp_size is queried from CUDA; fixed to 1 in the OpenCL path.
153 integer :: warp_size
! initialize_buffers: InitializeGPUBuffers option (no/yes/nan).
154 integer(int64) :: initialize_buffers
! Compiler debug flag: "-lineinfo" for CUDA, "-g" for HIP/OpenCL.
155 character(len=32) :: debug_flag
! Maximum block (work-group) and grid (global-work) dimensions.
156 integer(int64) :: max_block_dim(3)
157 integer(int64) :: max_grid_dim(3)
158 end type accel_t
159
!> A buffer allocated in device memory (OpenCL cl_mem object or a raw
!! device pointer for CUDA), together with its bookkeeping data.
160 type accel_mem_t
161 ! Components are public by default
162#ifdef HAVE_OPENCL
163 type(cl_mem) :: mem
164#else
165 type(c_ptr) :: mem
166#endif
! Buffer size -- assumed to be counted in elements of `type`; confirm
! against accel_create_buffer (not visible in this listing).
167 integer(c_size_t) :: size = 0
! Element type of the buffer contents (e.g. type_float, type_cmplx).
168 type(type_t) :: type
! Access flags (ACCEL_MEM_READ_ONLY etc.) used at creation time.
169 integer :: flags = 0
170 logical :: allocated = .false.
171 end type accel_mem_t
172
!> A compiled device kernel. Instances form a singly linked list (via
!! `next`; the module variable `head` points to the first element) so all
!! kernels can be released together.
173 type accel_kernel_t
174 ! Components are public by default
175#ifdef HAVE_OPENCL
176 type(cl_kernel) :: kernel
177#endif
178#ifdef HAVE_CUDA
179 type(c_ptr) :: cuda_kernel
180 type(c_ptr) :: cuda_module
! Opaque argument pack passed to the CUDA launch interface.
181 type(c_ptr) :: arguments
182#endif
! Dynamic shared-memory request for the kernel launch -- presumably in
! bytes; confirm against the launch wrapper (not visible here).
183 integer(int64) :: cuda_shared_mem
184 logical :: initialized = .false.
185 type(accel_kernel_t), pointer :: next
! Number of arguments set so far / kernel source name used for lookup.
186 integer :: arg_count
187 character(len=128) :: kernel_name
188 end type accel_kernel_t
189
190 type(accel_t), public :: accel
191
192 ! Global variables defined on device
193 type(accel_mem_t), public, save :: zM_0_buffer, zM_1_buffer
194 type(accel_mem_t), public, save :: dM_0_buffer, dM_1_buffer
196 ! the kernels
197 type(accel_kernel_t), public, target, save :: kernel_vpsi
198 type(accel_kernel_t), public, target, save :: kernel_vpsi_complex
199 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
200 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors_complex
201 type(accel_kernel_t), public, target, save :: kernel_daxpy
202 type(accel_kernel_t), public, target, save :: kernel_zaxpy
203 type(accel_kernel_t), public, target, save :: kernel_copy
204 type(accel_kernel_t), public, target, save :: kernel_copy_complex_to_real
205 type(accel_kernel_t), public, target, save :: kernel_copy_real_to_complex
206 type(accel_kernel_t), public, target, save :: dpack
207 type(accel_kernel_t), public, target, save :: zpack
208 type(accel_kernel_t), public, target, save :: dunpack
209 type(accel_kernel_t), public, target, save :: zunpack
210 type(accel_kernel_t), public, target, save :: kernel_ghost_reorder
211 type(accel_kernel_t), public, target, save :: kernel_density_real
212 type(accel_kernel_t), public, target, save :: kernel_density_complex
213 type(accel_kernel_t), public, target, save :: kernel_density_spinors
214 type(accel_kernel_t), public, target, save :: kernel_phase
215 type(accel_kernel_t), public, target, save :: kernel_phase_spiral
216 type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
217 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
218 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors
219 type(accel_kernel_t), public, target, save :: dkernel_batch_axpy
220 type(accel_kernel_t), public, target, save :: zkernel_batch_axpy
221 type(accel_kernel_t), public, target, save :: dkernel_ax_function_py
222 type(accel_kernel_t), public, target, save :: zkernel_ax_function_py
223 type(accel_kernel_t), public, target, save :: dkernel_batch_dotp
224 type(accel_kernel_t), public, target, save :: zkernel_batch_dotp
225 type(accel_kernel_t), public, target, save :: dzmul
226 type(accel_kernel_t), public, target, save :: zzmul
227
228 interface accel_padded_size
230 end interface accel_padded_size
238 end interface accel_kernel_run
258 end interface accel_write_buffer
259
269 end interface accel_read_buffer
271 interface accel_set_kernel_arg
272 module procedure &
297 module procedure &
305 module procedure &
311
313 integer, parameter :: &
314 OPENCL_GPU = -1, &
315 opencl_cpu = -2, &
317 opencl_default = -4
318
319
320 integer, parameter :: &
321 CL_PLAT_INVALID = -1, &
322 cl_plat_amd = -2, &
323 cl_plat_nvidia = -3, &
325 cl_plat_intel = -5
326
327 ! a "convenience" public variable
328 integer, public :: cl_status
329
330 integer :: buffer_alloc_count
331 integer(int64) :: allocated_mem
332 type(accel_kernel_t), pointer :: head
333 type(alloc_cache_t) :: memcache
334
335contains
336
!> Whether an accelerator device is available and active for this run.
!! Always .false. in builds without OpenCL/CUDA support; otherwise it
!! reflects the runtime flag set by accel_init from DisableAccel.
pure logical function accel_is_enabled() result(enabled)
#ifndef HAVE_ACCEL
    enabled = .false.
#else
    enabled = accel%enabled
#endif
end function accel_is_enabled
344
345 ! ------------------------------------------
346
!> Whether execution may proceed on the CPU alone. Trivially .true. in
!! builds without accelerator support; otherwise it reflects the
!! AllowCPUonly input variable parsed in accel_init.
pure logical function accel_allow_cpu_only() result(allow)
#ifndef HAVE_ACCEL
    allow = .true.
#else
    allow = accel%allow_CPU_only
#endif
end function accel_allow_cpu_only
354
355 ! ------------------------------------------
356
!> Initialize accelerator (GPU) support. Parses the DisableAccel,
!! AccelPlatform, AccelDevice, AccelBenchmark, GPUAwareMPI, AllowCPUonly and
!! InitializeGPUBuffers input variables; creates the CUDA context/stream and
!! cuBLAS handle, or enumerates OpenCL platforms/devices and creates a
!! context and command queue; queries device limits; sets up the allocation
!! cache; compiles the built-in kernels; and creates constant device buffers
!! holding 0 and 1 in real and complex versions.
!! \param base_grp  MPI group used to distribute devices over ranks.
!! \param namespace input-file namespace for variable parsing and messages.
357 subroutine accel_init(base_grp, namespace)
358 type(mpi_grp_t), intent(inout) :: base_grp
359 type(namespace_t), intent(in) :: namespace
360
361 logical :: disable, default, run_benchmark
362 integer :: idevice, iplatform
363#ifdef HAVE_OPENCL
364 integer :: device_type
365 integer :: cl_status, idev
366 integer :: ndevices, ret_devices, nplatforms, iplat
367 character(len=256) :: device_name
368 type(cl_platform_id) :: platform_id
! NOTE(review): prog is declared but never referenced in this routine.
369 type(cl_program) :: prog
370 type(cl_platform_id), allocatable :: allplatforms(:)
371 type(cl_device_id), allocatable :: alldevices(:)
372 integer :: max_work_item_dimensions
373 integer(int64), allocatable :: max_work_item_sizes(:)
374#endif
375#ifdef HAVE_CUDA
376 integer :: dim
377#ifdef HAVE_MPI
378 character(len=256) :: sys_name
379#endif
380#endif
381
382 push_sub(accel_init)
383
384 buffer_alloc_count = 0
385
386 !%Variable DisableAccel
387 !%Type logical
388 !%Default yes
389 !%Section Execution::Accel
390 !%Description
391 !% If Octopus was compiled with OpenCL or CUDA support, it will
392 !% try to initialize and use an accelerator device. By setting this
393 !% variable to <tt>yes</tt> you force Octopus not to use an accelerator even it is available.
394 !%End
395 call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel')
! Acceleration defaults to enabled only when compiled with OpenCL or CUDA.
396#ifdef HAVE_ACCEL
397 default = .false.
398#else
399 default = .true.
400#endif
401 call parse_variable(namespace, 'DisableAccel', default, disable)
402 accel%enabled = .not. disable
403
! Refuse an explicit request for acceleration in a build without support.
404#ifndef HAVE_ACCEL
405 if (accel%enabled) then
406 message(1) = 'Octopus was compiled without OpenCL or Cuda support.'
407 call messages_fatal(1)
408 end if
409#endif
! Nothing else to initialize when acceleration is disabled.
411 if (.not. accel_is_enabled()) then
412 pop_sub(accel_init)
413 return
414 end if
415
416 !%Variable AccelPlatform
417 !%Type integer
418 !%Default 0
419 !%Section Execution::Accel
420 !%Description
421 !% This variable selects the OpenCL platform that Octopus will
422 !% use. You can give an explicit platform number or use one of
423 !% the options that select a particular vendor
424 !% implementation. Platform 0 is used by default.
425 !%
426 !% This variable has no effect for CUDA.
427 !%Option amd -2
428 !% Use the AMD OpenCL platform.
429 !%Option nvidia -3
430 !% Use the Nvidia OpenCL platform.
431 !%Option ati -4
432 !% Use the ATI (old AMD) OpenCL platform.
433 !%Option intel -5
434 !% Use the Intel OpenCL platform.
435 !%End
436 call parse_variable(namespace, 'AccelPlatform', 0, iplatform)
438 call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform')
439
440 !%Variable AccelDevice
441 !%Type integer
442 !%Default gpu
443 !%Section Execution::Accel
444 !%Description
445 !% This variable selects the OpenCL or CUDA accelerator device
446 !% that Octopus will use. You can specify one of the options below
447 !% or a numerical id to select a specific device.
448 !%
449 !% Values >= 0 select the device to be used. In case of MPI enabled runs
450 !% devices are distributed in a round robin fashion, starting at this value.
451 !%Option gpu -1
452 !% If available, Octopus will use a GPU.
453 !%Option cpu -2
454 !% If available, Octopus will use a CPU (only for OpenCL).
455 !%Option accelerator -3
456 !% If available, Octopus will use an accelerator (only for OpenCL).
457 !%Option accel_default -4
458 !% Octopus will use the default device specified by the implementation.
459 !% implementation.
460 !%End
461 call parse_variable(namespace, 'AccelDevice', opencl_gpu, idevice)
462
463 call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice')
464
! Symbolic device choices are the negative values down to opencl_default;
! anything smaller is not a valid option or device id.
465 if (idevice < opencl_default) then
466 call messages_write('Invalid AccelDevice')
467 call messages_fatal()
468 end if
469
470 call messages_print_with_emphasis(msg="GPU acceleration", namespace=namespace)
471
! --- CUDA path: symbolic (negative) ids map to device 0; the round-robin
! distribution over MPI ranks is handled inside cuda_init.
472#ifdef HAVE_CUDA
473 if (idevice<0) idevice = 0
474 call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
475 idevice, base_grp%rank)
476#ifdef HAVE_MPI
477 call loct_sysname(sys_name)
478 write(message(1), '(A,I5,A,I5,2A)') "Rank ", base_grp%rank, " uses device number ", idevice, &
479 " on ", trim(sys_name)
480 call messages_info(1, all_nodes = .true.)
481#endif
482
! NOTE(review): the comment below says shared memory is unsupported, yet
! shared_mem is set .true. -- confirm which statement is current.
483 ! no shared mem support in our cuda interface (for the moment)
484 accel%shared_mem = .true.
485
486 call cublas_init(accel%cublas_handle, accel%cuda_stream)
487#endif
488
! --- OpenCL path: enumerate platforms and devices, select one, then create
! the context and a profiling-enabled command queue.
489#ifdef HAVE_OPENCL
490 call profiling_in('CL_INIT')
491
492 call clgetplatformids(nplatforms, cl_status)
493 if (cl_status /= cl_success) call opencl_print_error(cl_status, "GetPlatformIDs")
494
495 safe_allocate(allplatforms(1:nplatforms))
496
497 call clgetplatformids(allplatforms, iplat, cl_status)
498 if (cl_status /= cl_success) call opencl_print_error(cl_status, "GetPlatformIDs")
499
500 call messages_write('Info: Available CL platforms: ')
501 call messages_write(nplatforms)
502 call messages_info()
503
! List every platform; a vendor keyword given in AccelPlatform (negative
! value) is resolved to a platform index here via get_platform_id.
504 do iplat = 1, nplatforms
505
506 call clgetplatforminfo(allplatforms(iplat), cl_platform_name, device_name, cl_status)
507
508 if (iplatform < 0) then
509 if (iplatform == get_platform_id(device_name)) iplatform = iplat - 1
510 end if
511
512 if (iplatform == iplat - 1) then
513 call messages_write(' * Platform ')
514 else
515 call messages_write(' Platform ')
516 end if
517
518 call messages_write(iplat - 1)
519 call messages_write(' : '//device_name)
520 call clgetplatforminfo(allplatforms(iplat), cl_platform_version, device_name, cl_status)
521 call messages_write(' ('//trim(device_name)//')')
522 call messages_info()
523 end do
524
525 call messages_info()
526
527 if (iplatform >= nplatforms .or. iplatform < 0) then
528 call messages_write('Requested CL platform does not exist')
529 if (iplatform > 0) then
530 call messages_write('(platform = ')
531 call messages_write(iplatform)
532 call messages_write(').')
533 end if
534 call messages_fatal()
535 end if
536
537 platform_id = allplatforms(iplatform + 1)
538
539 safe_deallocate_a(allplatforms)
540
541 call clgetdeviceids(platform_id, cl_device_type_all, ndevices, cl_status)
542
543 call messages_write('Info: Available CL devices: ')
544 call messages_write(ndevices)
545 call messages_info()
546
547 safe_allocate(alldevices(1:ndevices))
548
549 ! list all devices
550
551 call clgetdeviceids(platform_id, cl_device_type_all, alldevices, ret_devices, cl_status)
552
553 do idev = 1, ndevices
554 call messages_write(' Device ')
555 call messages_write(idev - 1)
556 call clgetdeviceinfo(alldevices(idev), cl_device_name, device_name, cl_status)
557 call messages_write(' : '//device_name)
558 call messages_info()
559 end do
560
! Map the AccelDevice symbolic option onto the CL device-type mask.
561 select case (idevice)
562 case (opencl_gpu)
563 device_type = cl_device_type_gpu
564 case (opencl_cpu)
565 device_type = cl_device_type_cpu
566 case (opencl_accelerator)
567 device_type = cl_device_type_accelerator
568 case (opencl_default)
569 device_type = cl_device_type_default
570 case default
571 device_type = cl_device_type_all
572 end select
573
574 ! now get a list of the selected type
575 call clgetdeviceids(platform_id, device_type, alldevices, ret_devices, cl_status)
576
! Fall back: selected type -> default device -> all devices -> fatal error.
577 if (ret_devices < 1) then
578 ! we didnt find a device of the selected type, we ask for the default device
579 call clgetdeviceids(platform_id, cl_device_type_default, alldevices, ret_devices, cl_status)
580
581 if (ret_devices < 1) then
582 ! if this does not work, we ask for all devices
583 call clgetdeviceids(platform_id, cl_device_type_all, alldevices, ret_devices, cl_status)
584 end if
585
586 if (ret_devices < 1) then
587 call messages_write('Cannot find an OpenCL device')
588 call messages_fatal()
589 end if
590 end if
591
592 ! the number of devices can be smaller
593 ndevices = ret_devices
594
595 if (idevice < 0) then
596 if (base_grp%size > 1) then
597 ! with MPI we have to select the device so multiple GPUs in one
598 ! node are correctly distributed
599 call select_device(idevice)
600 else
601 idevice = 0
602 end if
603 end if
604
605 if (idevice >= ndevices) then
606 call messages_write('Requested CL device does not exist (device = ')
607 call messages_write(idevice)
608 call messages_write(', platform = ')
609 call messages_write(iplatform)
610 call messages_write(').')
611 call messages_fatal()
612 end if
613
614 accel%device%cl_device = alldevices(idevice + 1)
615
616 ! create the context
617 accel%context%cl_context = clcreatecontext(platform_id, accel%device%cl_device, cl_status)
618 if (cl_status /= cl_success) call opencl_print_error(cl_status, "CreateContext")
619
620 safe_deallocate_a(alldevices)
621
622 accel%command_queue = clcreatecommandqueue(accel%context%cl_context, accel%device%cl_device, &
623 cl_queue_profiling_enable, cl_status)
624 if (cl_status /= cl_success) call opencl_print_error(cl_status, "CreateCommandQueue")
625
626 call clgetdeviceinfo(accel%device%cl_device, cl_device_type, device_type, cl_status)
627
! Only GPUs are assumed to have usable shared/local memory in OpenCL.
628 select case (device_type)
629 case (cl_device_type_gpu)
630 accel%shared_mem = .true.
631 case (cl_device_type_cpu, cl_device_type_accelerator)
632 accel%shared_mem = .false.
633 case default
634 accel%shared_mem = .false.
635 end select
636
637#ifdef HAVE_CLBLAS
638 call clblassetup(cl_status)
639 if (cl_status /= clblassuccess) call clblas_print_error(cl_status, 'clblasSetup')
640#endif
641
642#ifdef HAVE_CLFFT
643 call clfftsetup(cl_status)
644 if (cl_status /= clfft_success) call clfft_print_error(cl_status, 'clfftSetup')
645#endif
646
647 call profiling_out('CL_INIT')
648#endif
649
650 ! Get some device information that we will need later
651
652 ! total memory
653#ifdef HAVE_OPENCL
654 call clgetdeviceinfo(accel%device%cl_device, cl_device_global_mem_size, accel%global_memory_size, cl_status)
655 call clgetdeviceinfo(accel%device%cl_device, cl_device_local_mem_size, accel%local_memory_size, cl_status)
656 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_work_group_size, accel%max_workgroup_size, cl_status)
657 accel%warp_size = 1
658 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_work_item_dimensions, max_work_item_dimensions, cl_status)
659 if (max_work_item_dimensions < 3) then
660 message(1) = "Octopus requires a device where CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS is at least 3."
661 call messages_fatal(1, only_root_writes = .true., namespace=namespace)
662 end if
663 safe_allocate(max_work_item_sizes(1:max_work_item_dimensions))
664 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_work_item_sizes, max_work_item_sizes(1), cl_status)
665 accel%max_block_dim(:) = max_work_item_sizes(1:3)
666 safe_deallocate_a(max_work_item_sizes)
667 ! In principle OpenCL does not set any limits on the global_work_size. It is
668 ! only limited by the available resources. Therefore we use the default
669 ! values for NVIDIA GPUs starting at CC 5.0. No idea whether these will work
670 ! generically.
671 ! https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features
672 ! -and-technical-specifications-technical-specifications-per-compute-capability
673 accel%max_grid_dim(1) = (2_int64)**31 - 1_int64
674 accel%max_grid_dim(2) = 65536_int64
675 accel%max_grid_dim(3) = 65536_int64
676#endif
677#ifdef HAVE_CUDA
678 call cuda_device_total_memory(accel%device%cuda_device, accel%global_memory_size)
679 call cuda_device_shared_memory(accel%device%cuda_device, accel%local_memory_size)
680 call cuda_device_max_threads_per_block(accel%device%cuda_device, accel%max_workgroup_size)
681 call cuda_device_get_warpsize(accel%device%cuda_device, accel%warp_size)
682 call cuda_device_max_block_dim_x(accel%device%cuda_device, dim)
683 accel%max_block_dim(1) = int(dim, int64)
684 call cuda_device_max_block_dim_y(accel%device%cuda_device, dim)
685 accel%max_block_dim(2) = int(dim, int64)
686 call cuda_device_max_block_dim_z(accel%device%cuda_device, dim)
687 accel%max_block_dim(3) = int(dim, int64)
688 call cuda_device_max_grid_dim_x(accel%device%cuda_device, dim)
689 accel%max_grid_dim(1) = int(dim, int64)
690 call cuda_device_max_grid_dim_y(accel%device%cuda_device, dim)
691 accel%max_grid_dim(2) = int(dim, int64)
692 call cuda_device_max_grid_dim_z(accel%device%cuda_device, dim)
693 accel%max_grid_dim(3) = int(dim, int64)
694#endif
695
696 if (base_grp%is_root()) call device_info()
697
698 ! initialize the cache used to speed up allocations
699 call alloc_cache_init(memcache, nint(0.25_real64*accel%global_memory_size, int64))
700
701 ! now initialize the kernels
703
704#if defined(HAVE_HIP)
705 accel%debug_flag = "-g"
706#elif defined(HAVE_CUDA)
707 accel%debug_flag = "-lineinfo"
708#elif defined(HAVE_OPENCL)
709 accel%debug_flag = "-g"
710#endif
711
712 call accel_kernel_start_call(kernel_vpsi, 'vpsi.cl', "vpsi")
713 call accel_kernel_start_call(kernel_vpsi_complex, 'vpsi.cl', "vpsi_complex")
714 call accel_kernel_start_call(kernel_vpsi_spinors, 'vpsi.cl', "vpsi_spinors")
715 call accel_kernel_start_call(kernel_vpsi_spinors_complex, 'vpsi.cl', "vpsi_spinors_complex")
716 call accel_kernel_start_call(kernel_daxpy, 'axpy.cl', "daxpy", flags = '-DRTYPE_DOUBLE')
717 call accel_kernel_start_call(kernel_zaxpy, 'axpy.cl', "zaxpy", flags = '-DRTYPE_COMPLEX')
718 call accel_kernel_start_call(dkernel_batch_axpy, 'axpy.cl', "dbatch_axpy_function", &
719 flags = ' -DRTYPE_DOUBLE')
720 call accel_kernel_start_call(zkernel_batch_axpy, 'axpy.cl', "zbatch_axpy_function", &
721 flags = '-DRTYPE_COMPLEX')
722 call accel_kernel_start_call(dkernel_ax_function_py, 'axpy.cl', "dbatch_ax_function_py", &
723 flags = '-DRTYPE_DOUBLE')
724 call accel_kernel_start_call(zkernel_ax_function_py, 'axpy.cl', "zbatch_ax_function_py", &
725 flags = '-DRTYPE_COMPLEX')
726 call accel_kernel_start_call(dkernel_batch_dotp, 'mesh_batch_single.cl', "dbatch_mf_dotp")
727 call accel_kernel_start_call(zkernel_batch_dotp, 'mesh_batch_single.cl', "zbatch_mf_dotp")
728 call accel_kernel_start_call(dpack, 'pack.cl', "dpack")
729 call accel_kernel_start_call(zpack, 'pack.cl', "zpack")
730 call accel_kernel_start_call(dunpack, 'pack.cl', "dunpack")
731 call accel_kernel_start_call(zunpack, 'pack.cl', "zunpack")
732 call accel_kernel_start_call(kernel_copy, 'copy.cl', "copy")
733 call accel_kernel_start_call(kernel_copy_complex_to_real, 'copy.cl', "copy_complex_to_real")
734 call accel_kernel_start_call(kernel_copy_real_to_complex, 'copy.cl', "copy_real_to_complex")
735 call accel_kernel_start_call(kernel_ghost_reorder, 'ghost.cl', "ghost_reorder")
736 call accel_kernel_start_call(kernel_density_real, 'density.cl', "density_real")
737 call accel_kernel_start_call(kernel_density_complex, 'density.cl', "density_complex")
738 call accel_kernel_start_call(kernel_density_spinors, 'density.cl', "density_spinors")
739 call accel_kernel_start_call(kernel_phase, 'phase.cl', "phase")
740 call accel_kernel_start_call(dkernel_dot_matrix, 'mesh_batch.cl', "ddot_matrix")
741 call accel_kernel_start_call(zkernel_dot_matrix, 'mesh_batch.cl', "zdot_matrix")
742 call accel_kernel_start_call(zkernel_dot_matrix_spinors, 'mesh_batch.cl', "zdot_matrix_spinors")
743
745 call accel_kernel_start_call(dzmul, 'mul.cl', "dzmul", flags = '-DRTYPE_DOUBLE')
746 call accel_kernel_start_call(zzmul, 'mul.cl', "zzmul", flags = '-DRTYPE_COMPLEX')
747
748 ! Define global buffers
! Constant device buffers holding 0 and 1 (complex and real versions),
! created once and released in accel_end.
749 if(.not. accel_buffer_is_allocated(zm_0_buffer)) then
750 call accel_create_buffer(zm_0_buffer, accel_mem_read_only, type_cmplx, 1)
751 call accel_write_buffer(zm_0_buffer, m_z0)
752 end if
753 if(.not. accel_buffer_is_allocated(zm_1_buffer)) then
754 call accel_create_buffer(zm_1_buffer, accel_mem_read_only, type_cmplx, 1)
755 call accel_write_buffer(zm_1_buffer, m_z1)
756 end if
757 if(.not. accel_buffer_is_allocated(dm_0_buffer)) then
758 call accel_create_buffer(dm_0_buffer, accel_mem_read_only, type_float, 1)
759 call accel_write_buffer(dm_0_buffer, m_zero)
760 end if
761 if(.not. accel_buffer_is_allocated(dm_1_buffer)) then
762 call accel_create_buffer(dm_1_buffer, accel_mem_read_only, type_float, 1)
763 call accel_write_buffer(dm_1_buffer, m_one)
764 end if
765
766
767 !%Variable AccelBenchmark
768 !%Type logical
769 !%Default no
770 !%Section Execution::Accel
771 !%Description
772 !% If this variable is set to yes, Octopus will run some
773 !% routines to benchmark the performance of the accelerator device.
774 !%End
775 call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark)
776
777 call messages_obsolete_variable(namespace, 'OpenCLBenchmark', 'AccelBenchmark')
778
779 if (run_benchmark) then
781 end if
782
783 !%Variable GPUAwareMPI
784 !%Type logical
785 !%Section Execution::Accel
786 !%Description
787 !% If Octopus was compiled with GPU support and MPI support and if the MPI
788 !% implementation is GPU-aware (i.e., it supports communication using device pointers),
789 !% this switch can be set to true to use the GPU-aware MPI features. The advantage
790 !% of this approach is that it can do, e.g., peer-to-peer copies between devices without
791 !% going through the host memory.
792 !% The default is false, except when the configure switch --enable-cudampi is set, in which
793 !% case this variable is set to true.
794 !%End
795#ifdef HAVE_CUDA_MPI
796 default = .true.
797#else
798 default = .false.
799#endif
800 call parse_variable(namespace, 'GPUAwareMPI', default, accel%cuda_mpi)
801 if (accel%cuda_mpi) then
802#ifndef HAVE_CUDA_MPI
803 call messages_write("Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")
804 call messages_warning()
805#endif
806 call messages_write("Using GPU-aware MPI.")
807 call messages_info()
808 end if
809
810
811 !%Variable AllowCPUonly
812 !%Type logical
813 !%Section Execution::Accel
814 !%Description
815 !% In order to prevent waste of resources, the code will normally stop when the GPU is disabled due to
816 !% incomplete implementations or incompatibilities. AllowCPUonly = yes overrides this and allows the
817 !% code execution also in these cases.
818 !%End
819#if defined (HAVE_ACCEL)
820 default = .false.
821#else
822 default = .true.
823#endif
824 call parse_variable(namespace, 'AllowCPUonly', default, accel%allow_CPU_only)
825
826
827 !%Variable InitializeGPUBuffers
828 !%Type integer
829 !%Default no
830 !%Section Execution::Accel
831 !%Description
832 !% Initialize new GPU buffers to zero on creation (use only for debugging, as it has a performance impact!).
833 !%Option no 0
834 !% Do not initialize GPU buffers.
835 !%Option yes 1
836 !% Initialize GPU buffers to zero.
837 !%Option nan 2
838 !% Initialize GPU buffers to nan.
839 !%End
840 call parse_variable(namespace, 'InitializeGPUBuffers', option__initializegpubuffers__no, accel%initialize_buffers)
841 if (.not. varinfo_valid_option('InitializeGPUBuffers', accel%initialize_buffers)) then
842 call messages_input_error(namespace, 'InitializeGPUBuffers')
843 end if
844
845
846 call messages_print_with_emphasis(namespace=namespace)
847
848 pop_sub(accel_init)
849
850 contains
852#if defined(HAVE_OPENCL)
!> Assign OpenCL devices to MPI ranks round robin (rank mod ndevices) and
!! print the resulting rank-to-device mapping, one rank at a time.
853 subroutine select_device(idevice)
854 integer, intent(inout) :: idevice
855 integer :: irank
856 character(len=256) :: device_name
857
858 push_sub(accel_init.select_device)
859
860 idevice = mod(base_grp%rank, ndevices)
861
862 call base_grp%barrier()
863 call messages_write('Info: CL device distribution:')
! Serialize the per-rank output with barriers so the mapping prints in order.
865 do irank = 0, base_grp%size - 1
866 if (irank == base_grp%rank) then
867 call clgetdeviceinfo(alldevices(idevice + 1), cl_device_name, device_name, cl_status)
868 call messages_write(' MPI node ')
869 call messages_write(base_grp%rank)
870 call messages_write(' -> CL device ')
871 call messages_write(idevice)
872 call messages_write(' : '//device_name)
873 call messages_info(all_nodes = .true.)
874 end if
875 call base_grp%barrier()
876 end do
877
878 pop_sub(accel_init.select_device)
879 end subroutine select_device
880#endif
881
!> Print a summary of the selected device: framework, type, vendor, name,
!! driver version, memory sizes and work-group/block limits.
882 subroutine device_info()
883#ifdef HAVE_OPENCL
884 integer(int64) :: val
885#endif
886#ifdef HAVE_CUDA
887 integer :: version
888#endif
889 integer :: major, minor
890 character(len=256) :: val_str
891
892 push_sub(accel_init.device_info)
893
894 call messages_new_line()
895 call messages_write('Selected device:')
896 call messages_new_line()
897
898#ifdef HAVE_OPENCL
899 call messages_write(' Framework : OpenCL')
900#endif
901#ifdef HAVE_CUDA
902#ifdef __HIP_PLATFORM_AMD__
903 call messages_write(' Framework : ROCm')
904#else
905 call messages_write(' Framework : CUDA')
906#endif
907#endif
908 call messages_info()
909
910#ifdef HAVE_CUDA
911 call messages_write(' Device type : GPU', new_line = .true.)
912#ifdef __HIP_PLATFORM_AMD__
913 call messages_write(' Device vendor : AMD Corporation', new_line = .true.)
914#else
915 call messages_write(' Device vendor : NVIDIA Corporation', new_line = .true.)
916#endif
917#endif
918
919#ifdef HAVE_OPENCL
920 call clgetdeviceinfo(accel%device%cl_device, cl_device_type, val, cl_status)
921 call messages_write(' Device type :')
922 select case (int(val, int32))
923 case (cl_device_type_gpu)
924 call messages_write(' GPU')
925 case (cl_device_type_cpu)
926 call messages_write(' CPU')
927 case (cl_device_type_accelerator)
928 call messages_write(' accelerator')
929 end select
930 call messages_new_line()
931
932 call clgetdeviceinfo(accel%device%cl_device, cl_device_vendor, val_str, cl_status)
933 call messages_write(' Device vendor : '//trim(val_str))
934 call messages_new_line()
935#endif
936
937#ifdef HAVE_OPENCL
938 call clgetdeviceinfo(accel%device%cl_device, cl_device_name, val_str, cl_status)
939#endif
940#ifdef HAVE_CUDA
941 call cuda_device_name(accel%device%cuda_device, val_str)
942#endif
943 call messages_write(' Device name : '//trim(val_str))
944 call messages_new_line()
945
946#ifdef HAVE_CUDA
947 call cuda_device_capability(accel%device%cuda_device, major, minor)
948#endif
! NOTE(review): major/minor are only assigned in the CUDA branch above;
! in OpenCL-only builds they are printed uninitialized -- confirm upstream.
949 call messages_write(' Cuda capabilities :')
950 call messages_write(major, fmt = '(i2)')
951 call messages_write('.')
952 call messages_write(minor, fmt = '(i1)')
953 call messages_new_line()
954
955 ! VERSION
956#ifdef HAVE_OPENCL
957 call clgetdeviceinfo(accel%device%cl_device, cl_driver_version, val_str, cl_status)
958 call messages_write(' Driver version : '//trim(val_str))
959#endif
960#ifdef HAVE_CUDA
961 call cuda_driver_version(version)
962 call messages_write(' Driver version : ')
963 call messages_write(version)
964#endif
965 call messages_new_line()
966
967
968#ifdef HAVE_OPENCL
969 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_compute_units, val, cl_status)
970 call messages_write(' Compute units :')
971 call messages_write(val)
972 call messages_new_line()
973
974 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_clock_frequency, val, cl_status)
975 call messages_write(' Clock frequency :')
976 call messages_write(val)
977 call messages_write(' GHz')
978 call messages_new_line()
979#endif
980
981 call messages_write(' Device memory :')
982 call messages_write(accel%global_memory_size, units=unit_megabytes)
984
985 call messages_write(' Local/shared memory :')
986 call messages_write(accel%local_memory_size, units=unit_kilobytes)
987 call messages_new_line()
988
989
990#ifdef HAVE_OPENCL
991 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_mem_alloc_size, val, cl_status)
992 call messages_write(' Max alloc size :')
993 call messages_write(val, units = unit_megabytes)
994 call messages_new_line()
995
996 call clgetdeviceinfo(accel%device%cl_device, cl_device_global_mem_cache_size, val, cl_status)
997 call messages_write(' Device cache :')
998 call messages_write(val, units = unit_kilobytes)
999 call messages_new_line()
1000
1001 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_constant_buffer_size, val, cl_status)
1002 call messages_write(' Constant memory :')
1004 call messages_new_line()
1005#endif
1006
1007 call messages_write(' Max. group/block size :')
1008 call messages_write(accel%max_workgroup_size)
1009 call messages_new_line()
1010
1011
1012#ifdef HAVE_OPENCL
1013 call messages_write(' Extension cl_khr_fp64 :')
1014 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64"))
1015 call messages_new_line()
1016
1017 call messages_write(' Extension cl_amd_fp64 :')
1018 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64"))
1019 call messages_new_line()
1021 call messages_write(' Extension cl_khr_int64_base_atomics :')
1022 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_int64_base_atomics"))
1023 call messages_new_line()
1024
1025#endif
1026
1027 call messages_info()
1028
1029
1030 pop_sub(accel_init.device_info)
1031 end subroutine device_info
1032
1033 end subroutine accel_init
1034
1035 ! ------------------------------------------
1036#ifdef HAVE_OPENCL
!> Map a vendor substring in an OpenCL platform name string to the internal
!! platform identifier. When several vendor substrings occur, the original
!! code let the later assignments win (priority Intel > NVIDIA > ATI > AMD),
!! which this if/else chain reproduces; unknown vendors yield CL_PLAT_INVALID.
integer function get_platform_id(platform_name) result(platform_id)
  character(len=*), intent(in) :: platform_name

  if (index(platform_name, 'Intel') > 0) then
    platform_id = cl_plat_intel
  else if (index(platform_name, 'NVIDIA') > 0) then
    platform_id = cl_plat_nvidia
  else if (index(platform_name, 'ATI') > 0) then
    platform_id = cl_plat_ati
  else if (index(platform_name, 'AMD') > 0) then
    platform_id = cl_plat_amd
  else
    platform_id = cl_plat_invalid
  end if
end function get_platform_id
1046#endif
1047 ! ------------------------------------------
1048
!> Shut down the acceleration framework: release the globally held device
!! buffers, drain and report the allocation cache, tear down the optional
!! clBLAS/clFFT libraries, and finally destroy the CUDA/OpenCL context.
!! Aborts if any device buffers are still outstanding at this point.
1049 subroutine accel_end(namespace)
1050 type(namespace_t), intent(in) :: namespace
1051
1052#ifdef HAVE_OPENCL
1053 integer :: ierr
1054#endif
1055 integer(int64) :: hits, misses
1056 real(real64) :: volume_hits, volume_misses
1057 logical :: found
1058 type(accel_mem_t) :: tmp
1059
1060 push_sub(accel_end)
1061
1062 if (accel_is_enabled()) then
1063
1064 ! Release global buffers
1065 call accel_release_buffer(zm_0_buffer)
1066 call accel_release_buffer(zm_1_buffer)
1067 call accel_release_buffer(dm_0_buffer)
1068 call accel_release_buffer(dm_1_buffer)
1069
! Drain the allocation cache: free every device allocation it still holds.
1070 do
1071 call alloc_cache_get(memcache, alloc_cache_any_size, found, tmp%mem)
1072 if (.not. found) exit
1073
1074#ifdef HAVE_OPENCL
1075 call clreleasememobject(tmp%mem, ierr)
1076 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseMemObject")
1077#endif
1078#ifdef HAVE_CUDA
1079 call cuda_mem_free(tmp%mem)
1080#endif
1081 end do
1082
! Collect cache statistics and print the hit/miss summary below.
1083 call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)
1084
1085 call messages_print_with_emphasis(msg="Acceleration-device allocation cache", namespace=namespace)
1087 call messages_new_line()
1088 call messages_write(' Number of allocations =')
1089 call messages_write(hits + misses, new_line = .true.)
1090 call messages_write(' Volume of allocations =')
1091 call messages_write(volume_hits + volume_misses, fmt = 'f18.1', units = unit_gigabytes, align_left = .true., &
1092 new_line = .true.)
1093 call messages_write(' Hit ratio =')
! Guard against division by zero when no allocations were made at all.
1094 if (hits + misses > 0) then
1095 call messages_write(hits/real(hits + misses, real64)*100, fmt='(f6.1)', align_left = .true.)
1096 else
1097 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
1098 end if
1099 call messages_write('%', new_line = .true.)
1100 call messages_write(' Volume hit ratio =')
1101 if (volume_hits + volume_misses > 0) then
1102 call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt='(f6.1)', align_left = .true.)
1103 else
1104 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
1105 end if
1106 call messages_write('%')
1107 call messages_new_line()
1108 call messages_info()
1109
1110 call messages_print_with_emphasis(namespace=namespace)
1111 end if
1112
1114
1115#ifdef HAVE_CLBLAS
1116 call clblasteardown()
1117#endif
1118
1119#ifdef HAVE_CLFFT
1120 call clfftteardown()
1121#endif
1122
! Destroy the device context last, after all buffers have been released.
1123 if (accel_is_enabled()) then
1124#ifdef HAVE_CUDA
1125 call cublas_end(accel%cublas_handle)
1126 if (.not. accel%cuda_mpi) then ! CUDA aware MPI finalize will do the cleanup
1127 call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
1128 end if
1129#endif
1130
1131#ifdef HAVE_OPENCL
1132 call clreleasecommandqueue(accel%command_queue, ierr)
1133
1134 if (ierr /= cl_success) call opencl_print_error(ierr, "ReleaseCommandQueue")
! NOTE(review): cl_status is not declared in this subroutine; presumably a
! module-level variable -- confirm against the full module, and note that the
! status of clReleaseContext is not checked.
1135 call clreleasecontext(accel%context%cl_context, cl_status)
1136#endif
1137
! Leak check: every accel_create_buffer must have been matched by a release.
1138 if (buffer_alloc_count /= 0) then
1139 call messages_write('Accel:')
1140 call messages_write(real(allocated_mem, real64) , fmt = 'f12.1', units = unit_megabytes, align_left = .true.)
1141 call messages_write(' in ')
1142 call messages_write(buffer_alloc_count)
1143 call messages_write(' buffers were not deallocated.')
1144 call messages_fatal()
1145 end if
1146
1147 end if
1148
1149 pop_sub(accel_end)
1150 end subroutine accel_end
1151
1152 ! ------------------------------------------
!> Round nn up to the next multiple of the device workgroup/block size,
!! so kernels can always be launched with full work groups.
!! When acceleration is disabled, nn is returned unchanged.
integer(int64) function accel_padded_size_i8(nn) result(psize)
  integer(int64), intent(in) :: nn

  integer(int64) :: block_size, remainder

  psize = nn

  if (accel_is_enabled()) then
    block_size = accel_max_workgroup_size()
    remainder = mod(nn, block_size)
    ! Already a multiple of the block size? Then no padding is needed.
    if (remainder /= 0) psize = nn + (block_size - remainder)
  end if

end function accel_padded_size_i8
1172
1173 ! ------------------------------------------
1174
!> 32-bit convenience wrapper: pad nn via the 64-bit implementation and
!! narrow the result back to default integer.
integer(int32) function accel_padded_size_i4(nn) result(psize)
  integer(int32), intent(in) :: nn

  integer(int64) :: padded

  padded = accel_padded_size_i8(int(nn, int64))
  psize = int(padded, int32)

end function accel_padded_size_i4
1181
1182 ! ------------------------------------------
1183
!> Create a device buffer from a 32-bit element count.
!! Thin wrapper: widens size to int64 and delegates to accel_create_buffer_8;
!! the optional arguments are forwarded by keyword, so an absent optional
!! stays absent in the callee.
subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
  type(accel_mem_t),           intent(inout) :: this
  integer,                     intent(in)    :: flags
  type(type_t),                intent(in)    :: type
  integer,                     intent(in)    :: size
  logical,           optional, intent(in)    :: set_zero
  logical,           optional, intent(in)    :: async

  call accel_create_buffer_8(this, flags, type, int(size, int64), set_zero = set_zero, async = async)
end subroutine accel_create_buffer_4
1194
1195 ! ------------------------------------------
1196
!> Allocate a device buffer of `size` elements of the given type.
!! The allocation is first looked up in the allocation cache; only on a
!! cache miss is a fresh device allocation made (OpenCL or CUDA path).
!! Depending on set_zero / the InitializeGPUBuffers option, the buffer is
!! filled with 0x00 or 0xFF bytes after creation.
1197 subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
1198 type(accel_mem_t), intent(inout) :: this
1199 integer, intent(in) :: flags
1200 type(type_t), intent(in) :: type
1201 integer(int64), intent(in) :: size
1202 logical, optional, intent(in) :: set_zero
1203 logical, optional, intent(in) :: async
1204
1205 integer(int64) :: fsize
1206 logical :: found
1207 integer(int64) :: initialize_buffers
1208#ifdef HAVE_OPENCL
1209 integer :: ierr
1210#endif
1211
1212 push_sub(accel_create_buffer_8)
1213
1214 this%type = type
1215 this%size = size
1216 this%flags = flags
! Total size in bytes = element count times the element size of `type`.
1217 fsize = int(size, int64)*types_get_size(type)
1218 this%allocated = .true.
1219
1220 if (fsize > 0) then
1221
! Reuse a cached allocation of exactly this byte size when available.
1222 call alloc_cache_get(memcache, fsize, found, this%mem)
1223
1224 if (.not. found) then
1225#ifdef HAVE_OPENCL
1226 this%mem = clcreatebuffer(accel%context%cl_context, flags, fsize, ierr)
1227 if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateBuffer")
1228#endif
1229#ifdef HAVE_CUDA
1230 if(optional_default(async, .false.)) then
1231 call cuda_mem_alloc_async(this%mem, fsize)
1232 else
1233 call cuda_mem_alloc(this%mem, fsize)
1234 end if
1235#endif
1236 end if
1237
! Bookkeeping used by the leak check in accel_end.
1238 buffer_alloc_count = buffer_alloc_count + 1
1239 allocated_mem = allocated_mem + fsize
1240
1241 end if
1242
! An explicit set_zero overrides the global InitializeGPUBuffers option.
1243 if (present(set_zero)) then
1244 initialize_buffers = merge(option__initializegpubuffers__yes, option__initializegpubuffers__no, set_zero)
1245 else
1246 initialize_buffers = accel%initialize_buffers
1247 end if
1248 select case (initialize_buffers)
1249 case (option__initializegpubuffers__yes)
1250 call accel_set_buffer_to(this, type, int(z'00', int8), size)
1251 case (option__initializegpubuffers__nan)
! 0xFF byte pattern: makes use of uninitialized data easy to spot (NaN-like).
1252 call accel_set_buffer_to(this, type, int(z'FF', int8), size)
1253 end select
1254
1255 pop_sub(accel_create_buffer_8)
1256 end subroutine accel_create_buffer_8
1257
1258 ! ------------------------------------------
1259
!> Release a device buffer previously created with accel_create_buffer.
!! The underlying device allocation is handed back to the allocation cache
!! when possible; otherwise it is freed on the device (OpenCL or CUDA path).
!! The descriptor is reset to an empty, unallocated state in all cases.
!!
!! \param this   buffer to release
!! \param async  if .true., use the asynchronous CUDA free path
subroutine accel_release_buffer(this, async)
  type(accel_mem_t),           intent(inout) :: this
  logical,           optional, intent(in)    :: async

#ifdef HAVE_OPENCL
  integer :: ierr
#endif
  logical :: put
  integer(int64) :: fsize

  push_sub(accel_release_buffer)

  if (this%size > 0) then

    fsize = int(this%size, int64)*types_get_size(this%type)

    ! Try to keep the device allocation in the cache for later reuse.
    call alloc_cache_put(memcache, fsize, this%mem, put)

    if (.not. put) then
#ifdef HAVE_OPENCL
      call clreleasememobject(this%mem, ierr)
      if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseMemObject")
#endif
#ifdef HAVE_CUDA
      if (optional_default(async, .false.)) then
        call cuda_mem_free_async(this%mem)
      else
        call cuda_mem_free(this%mem)
      end if
#endif
    end if

    buffer_alloc_count = buffer_alloc_count - 1
    ! Bug fix: the byte accounting must shrink on release. It previously
    ! read "allocated_mem + fsize", which only ever grew and made the leak
    ! report in accel_end overstate the outstanding volume.
    allocated_mem = allocated_mem - fsize

  end if

  this%size = 0
  this%flags = 0

  this%allocated = .false.

  pop_sub(accel_release_buffer)
end subroutine accel_release_buffer
1304
1305 ! ------------------------------------------------------
1306
1307 ! Check if the temporary buffers are the right size, if not reallocate them
!> Make sure `buffer` can hold at least required_size elements of `type`,
!! reallocating it with the given flags when it is too small.
!! Note: the previous contents are NOT preserved across a reallocation.
subroutine accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
  type(accel_mem_t),           intent(inout) :: buffer
  integer,                     intent(in)    :: flags
  type(type_t),                intent(in)    :: type
  integer,                     intent(in)    :: required_size
  logical,                     intent(in)    :: set_zero
  logical,           optional, intent(in)    :: async

  logical :: use_async

  push_sub(accel_ensure_buffer_size)

  use_async = optional_default(async, .false.)

  if (buffer%size < required_size) then
    call accel_release_buffer(buffer, async = use_async)
    call accel_create_buffer(buffer, flags, type, required_size, set_zero = set_zero, async = use_async)
  end if

  pop_sub(accel_ensure_buffer_size)
end subroutine accel_ensure_buffer_size
1325
1326 ! ------------------------------------------
1327
!> Return whether `this` currently owns a device allocation
!! (set by accel_create_buffer, cleared by accel_release_buffer).
1328 logical pure function accel_buffer_is_allocated(this) result(allocated)
1329 type(accel_mem_t), intent(in) :: this
1330
1331 allocated = this%allocated
1332 end function accel_buffer_is_allocated
1333
1334 ! -----------------------------------------
1335
!> Block until all previously queued device work has completed
!! (clFinish on the OpenCL command queue; CUDA branch below).
1336 subroutine accel_finish()
1337#ifdef HAVE_OPENCL
1338 integer :: ierr
1339#endif
1340
1341 ! no push_sub, called too frequently
1342
1343 if (accel_is_enabled()) then
1344#ifdef HAVE_OPENCL
1345 call clfinish(accel%command_queue, ierr)
1346 if (ierr /= cl_success) call opencl_print_error(ierr, 'clFinish')
1347#endif
1348#ifdef HAVE_CUDA
! NOTE(review): original line 1349 is missing from this listing; the CUDA
! branch presumably synchronizes the context here -- confirm against the
! repository source before editing this region.
1350#endif
1351 end if
1352 end subroutine accel_finish
1353
1354 ! ------------------------------------------
1355
!> Bind a device buffer as argument number `narg` of `kernel`
!! (0-based argument index, as used by the OpenCL/CUDA argument lists).
!! The buffer must have been allocated beforehand.
1356 subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
1357 type(accel_kernel_t), intent(inout) :: kernel
1358 integer, intent(in) :: narg
1359 type(accel_mem_t), intent(in) :: buffer
1360
1361#ifdef HAVE_OPENCL
1362 integer :: ierr
1363#endif
1364
1365 assert(accel_buffer_is_allocated(buffer))
1366
1367 ! no push_sub, called too frequently
1368#ifdef HAVE_OPENCL
1369 call clsetkernelarg(kernel%kernel, narg, buffer%mem, ierr)
1370 if (ierr /= cl_success) call opencl_print_error(ierr, "clSetKernelArg_buf")
1371#endif
1372
1373#ifdef HAVE_CUDA
! CUDA has no per-kernel argument API; the pointer is stored in the
! kernel's argument list and passed at launch time by accel_kernel_run.
1374 call cuda_kernel_set_arg_buffer(kernel%arguments, buffer%mem, narg)
1375#endif
1376
1377 end subroutine accel_set_kernel_arg_buffer
1378
1379 ! ------------------------------------------
1380
!> Reserve `size` elements of the given type in local (OpenCL) / shared
!! (CUDA) memory as argument `narg` of the kernel. Aborts when the request
!! exceeds the device's local-memory capacity or is not positive.
1381 subroutine accel_set_kernel_arg_local(kernel, narg, type, size)
1382 type(accel_kernel_t), intent(inout) :: kernel
1383 integer, intent(in) :: narg
1384 type(type_t), intent(in) :: type
1385 integer, intent(in) :: size
1386
1387#ifdef HAVE_OPENCL
1388 integer :: ierr
1389#endif
1390 integer(int64) :: size_in_bytes
1391
! NOTE(review): original lines 1392-1393 are missing from this listing
! (presumably push_sub / a blank line); confirm against the repository source.
1394
1395 size_in_bytes = int(size, int64)*types_get_size(type)
1396
! Validate the request before touching the device APIs.
1397 if (size_in_bytes > accel%local_memory_size) then
1398 write(message(1), '(a,f12.6,a)') "CL Error: requested local memory: ", real(size_in_bytes, real64) /1024.0, " Kb"
1399 write(message(2), '(a,f12.6,a)') " available local memory: ", real(accel%local_memory_size, real64) /1024.0, " Kb"
1400 call messages_fatal(2)
1401 else if (size_in_bytes <= 0) then
1402 write(message(1), '(a,i10)') "CL Error: invalid local memory size: ", size_in_bytes
1403 call messages_fatal(1)
1404 end if
1405
1406#ifdef HAVE_CUDA
! CUDA: shared memory is requested at launch time, so just remember the size.
1407 kernel%cuda_shared_mem = size_in_bytes
1408#endif
1410#ifdef HAVE_OPENCL
1411 call clsetkernelarglocal(kernel%kernel, narg, size_in_bytes, ierr)
1412 if (ierr /= cl_success) call opencl_print_error(ierr, "set_kernel_arg_local")
1413#endif
1414
1416 end subroutine accel_set_kernel_arg_local
1417
1418 ! ------------------------------------------
1419
!> Launch `kernel` with the given global and local sizes (OpenCL semantics:
!! globalsizes is the total number of work items per dimension and must be a
!! multiple of localsizes). Up to 3 dimensions; for CUDA the grid size is
!! derived as globalsizes/localsizes. A zero global size is a no-op.
1420 subroutine accel_kernel_run_8(kernel, globalsizes, localsizes)
1421 type(accel_kernel_t), intent(inout) :: kernel
1422 integer(int64), intent(in) :: globalsizes(:)
1423 integer(int64), intent(in) :: localsizes(:)
1424
1425 integer :: dim
1426#ifdef HAVE_OPENCL
1427 integer :: ierr
1428#endif
1429 integer(int64) :: gsizes(1:3)
1430 integer(int64) :: lsizes(1:3)
1431
1432 ! no push_sub, called too frequently
1433
1434 ! cuda needs all dimensions
1435 gsizes = 1
1436 lsizes = 1
1437
1438 dim = ubound(globalsizes, dim=1)
1439
1440 assert(dim == ubound(localsizes, dim=1))
1441
1442 ! if one size is zero, there is nothing to do
1443 if (any(globalsizes == 0)) return
1444
! Sanity checks; mod == 0 also guarantees the integer division below is exact.
1445 assert(all(localsizes > 0))
1446 assert(all(localsizes <= accel_max_workgroup_size()))
1447 assert(all(mod(globalsizes, localsizes) == 0))
1448
1449 gsizes(1:dim) = globalsizes(1:dim)
1450 lsizes(1:dim) = localsizes(1:dim)
1452#ifdef HAVE_OPENCL
1453 call clenqueuendrangekernel(accel%command_queue, kernel%kernel, gsizes(1:dim), lsizes(1:dim), ierr)
1454 if (ierr /= cl_success) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
1455#endif
1456
1457#ifdef HAVE_CUDA
1458 ! Maximum dimension of a block
1459 if (any(lsizes(1:3) > accel%max_block_dim(1:3))) then
1460 message(1) = "Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
1461 message(2) = "The following conditions should be fulfilled:"
1462 write(message(3), "(A, I8, A, I8)") "Dim 1: ", lsizes(1), " <= ", accel%max_block_dim(1)
1463 write(message(4), "(A, I8, A, I8)") "Dim 2: ", lsizes(2), " <= ", accel%max_block_dim(2)
1464 write(message(5), "(A, I8, A, I8)") "Dim 3: ", lsizes(3), " <= ", accel%max_block_dim(3)
1465 message(6) = "This is an internal error, please contact the developers."
1466 call messages_fatal(6)
1467 end if
1468
1469
1470 ! Maximum number of threads per block
1471 if (product(lsizes) > accel_max_workgroup_size()) then
1472 message(1) = "Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
1473 message(2) = "The following condition should be fulfilled:"
1474 write(message(3), "(I8, A, I8)") product(lsizes), " <= ", accel_max_workgroup_size()
1475 message(4) = "This is an internal error, please contact the developers."
1476 call messages_fatal(4)
1477 end if
! Convert OpenCL-style global work sizes into a CUDA grid (blocks per dim).
1479 gsizes(1:3) = gsizes(1:3)/lsizes(1:3)
1480
1481 ! Maximum dimensions of the grid of thread block
1482 if (any(gsizes(1:3) > accel%max_grid_dim(1:3))) then
1483 message(1) = "Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
1484 message(2) = "The following conditions should be fulfilled:"
1485 write(message(3), "(A, I8, A, I10)") "Dim 1: ", gsizes(1), " <= ", accel%max_grid_dim(1)
1486 write(message(4), "(A, I8, A, I10)") "Dim 2: ", gsizes(2), " <= ", accel%max_grid_dim(2)
1487 write(message(5), "(A, I8, A, I10)") "Dim 3: ", gsizes(3), " <= ", accel%max_grid_dim(3)
1488 message(6) = "This is an internal error, please contact the developers."
1489 call messages_fatal(6)
1490 end if
1491
1492 call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), lsizes(1), kernel%cuda_shared_mem, kernel%arguments)
! The shared-memory request is per launch; reset it for the next call.
1494 kernel%cuda_shared_mem = 0
1495#endif
1496
1497 end subroutine accel_kernel_run_8
1498
1499 ! -----------------------------------------------
1500
!> 32-bit convenience wrapper: promote the sizes to int64 and delegate to
!! accel_kernel_run_8.
1501 subroutine accel_kernel_run_4(kernel, globalsizes, localsizes)
1502 type(accel_kernel_t), intent(inout) :: kernel
1503 integer, intent(in) :: globalsizes(:)
1504 integer, intent(in) :: localsizes(:)
1505
1506 call accel_kernel_run_8(kernel, int(globalsizes, int64), int(localsizes, int64))
1507
1508 end subroutine accel_kernel_run_4
1509
1510 ! -----------------------------------------------
1511
!> Return the device-wide maximum number of work items (threads) per
!! work group (block), as stored at initialization.
1512 integer pure function accel_max_workgroup_size() result(max_workgroup_size)
1513 max_workgroup_size = accel%max_workgroup_size
1514 end function accel_max_workgroup_size
1515
1516 ! -----------------------------------------------
1517
!> Return the work-group/block size to use when launching `kernel`.
!! OpenCL: query CL_KERNEL_WORK_GROUP_SIZE for this kernel and device.
!! CUDA: use 256 threads per block (CUDA best-practices recommendation),
!! clamped to the kernel's own maximum.
integer function accel_kernel_workgroup_size(kernel) result(workgroup_size)
  type(accel_kernel_t), intent(inout) :: kernel

#ifdef HAVE_OPENCL
  integer(int64) :: workgroup_size8
  integer :: ierr
#endif
#ifdef HAVE_CUDA
  integer :: max_workgroup_size
#endif

  workgroup_size = 0

#ifdef HAVE_OPENCL
  call clgetkernelworkgroupinfo(kernel%kernel, accel%device%cl_device, cl_kernel_work_group_size, workgroup_size8, ierr)
  ! Bug fix: report the call that actually failed (was mislabeled
  ! "EnqueueNDRangeKernel", copied from another routine).
  if (ierr /= cl_success) call opencl_print_error(ierr, "clGetKernelWorkGroupInfo")
  workgroup_size = workgroup_size8
#endif

#ifdef HAVE_CUDA
  call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_workgroup_size)
  if (debug%info .and. max_workgroup_size /= accel%max_workgroup_size) then
    ! Bug fix: the old message printed workgroup_size (still 0 at this point)
    ! instead of the kernel's limit, and its format "(A, I5, A)" had fewer
    ! descriptors than output items, forcing format reversion to a new record,
    ! which is invalid for a scalar internal file.
    write(message(1), "(A, I5, A, I5, A)") &
      "A kernel can only use fewer threads per block (", max_workgroup_size, &
      ") than available on the device (", accel%max_workgroup_size, ")"
    call messages_info(1)
  end if
  ! recommended number of threads per block is 256 according to the CUDA best practice guide
  ! see https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#thread-and-block-heuristics
  workgroup_size = 256
  ! make sure we do not use more threads per block than available for this kernel
  workgroup_size = min(workgroup_size, max_workgroup_size)
#endif

end function accel_kernel_workgroup_size
1552
1553 ! -----------------------------------------------
1554
1555#ifdef HAVE_OPENCL
!> Compile the OpenCL source file `filename` into `prog`.
!! The file is pulled in via an #include so the OpenCL compiler resolves
!! further includes from the installed share/opencl directory. Requires a
!! device with double-precision support; on build failure the build log is
!! printed and the run is aborted.
1556 subroutine opencl_build_program(prog, filename, flags)
1557 type(cl_program), intent(inout) :: prog
1558 character(len=*), intent(in) :: filename
1559 character(len=*), optional, intent(in) :: flags
1560
1561 character(len = 1000) :: string
1562 character(len = 256) :: share_string
! NOTE(review): iunit and irec appear unused in this routine.
1563 integer :: ierr, ierrlog, iunit, irec, newlen
1564
1565 push_sub(opencl_build_program)
1566
! The program source is just an #include of the requested file.
1567 string = '#include "'//trim(filename)//'"'
1568
1569 call messages_write("Building CL program '"//trim(filename)//"'.")
1570 call messages_info(debug_only=.true.)
1571
1572 prog = clcreateprogramwithsource(accel%context%cl_context, trim(string), ierr)
1573 if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateProgramWithSource")
1574
1575 ! build the compilation flags
1576 string='-w'
1577 ! full optimization
1578 string=trim(string)//' -cl-denorms-are-zero'
1579 ! The following flag gives an error with the Xeon Phi
1580 ! string=trim(string)//' -cl-strict-aliasing'
1581 string=trim(string)//' -cl-mad-enable'
1582 string=trim(string)//' -cl-unsafe-math-optimizations'
1583 string=trim(string)//' -cl-finite-math-only'
1584 string=trim(string)//' -cl-fast-relaxed-math'
1585
! Include path for the OpenCL headers shipped with the installation.
1586 share_string='-I'//trim(conf%share)//'/opencl/'
1587
! Double precision is mandatory; pick whichever fp64 extension the device has.
1588 if (f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64")) then
1589 string = trim(string)//' -DEXT_KHR_FP64'
1590 else if (f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64")) then
1591 string = trim(string)//' -DEXT_AMD_FP64'
1592 else
1593 call messages_write('Octopus requires an OpenCL device with double-precision support.')
1594 call messages_fatal()
1595 end if
1596
1597 if (accel_use_shared_mem()) then
1598 string = trim(string)//' -DSHARED_MEM'
1599 end if
1600
! Caller-supplied flags are appended last so they can override the defaults.
1601 if (present(flags)) then
1602 string = trim(string)//' '//trim(flags)
1603 end if
1605 call messages_write("Debug info: compilation flags '"//trim(string), new_line = .true.)
1606 call messages_write(' '//trim(share_string)//"'.")
1607 call messages_info(debug_only=.true.)
1608
1609 string = trim(string)//' '//trim(share_string)
1610
1611 call clbuildprogram(prog, trim(string), ierr)
1612
! On failure, fetch and print the compiler's build log before aborting.
1613 if(ierr /= cl_success) then
1614 call clgetprogrambuildinfo(prog, accel%device%cl_device, cl_program_build_log, string, ierrlog)
1615 if (ierrlog /= cl_success) call opencl_print_error(ierrlog, "clGetProgramBuildInfo")
1616
1617 ! CL_PROGRAM_BUILD_LOG seems to have a useless '\n' in it
1618 newlen = scan(string, achar(010), back = .true.) - 1
1619 if (newlen >= 0) string = string(1:newlen)
1620
1621 if (len(trim(string)) > 0) write(stderr, '(a)') trim(string)
1622
1623 call opencl_print_error(ierr, "clBuildProgram")
1624 end if
1625
1626 pop_sub(opencl_build_program)
1627 end subroutine opencl_build_program
1628#endif
1629
1630 ! -----------------------------------------------
1631#ifdef HAVE_OPENCL
!> Release an OpenCL program object, aborting with a descriptive
!! message when the underlying call fails.
subroutine opencl_release_program(prog)
  type(cl_program), intent(inout) :: prog

  integer :: status

  push_sub(opencl_release_program)

  call clreleaseprogram(prog, status)
  if (status /= cl_success) call opencl_print_error(status, "clReleaseProgram")

  pop_sub(opencl_release_program)
end subroutine opencl_release_program
1644#endif
1645
1646 ! -----------------------------------------------
1647
1648#ifdef HAVE_OPENCL
!> Release an OpenCL kernel object, aborting with a descriptive
!! message when the underlying call fails.
subroutine opencl_release_kernel(prog)
  type(cl_kernel), intent(inout) :: prog

  integer :: ierr

  push_sub(opencl_release_kernel)

  ! Cleanup: this routine is only compiled under HAVE_OPENCL, so the former
  ! nested #ifdef HAVE_OPENCL around the call below was redundant and removed.
  call clreleasekernel(prog, ierr)
  if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseKernel")

  pop_sub(opencl_release_kernel)
end subroutine opencl_release_kernel
1663#endif
1664
1665#ifdef HAVE_OPENCL
1666 ! -----------------------------------------------
!> Create the kernel object called `name` from an already-built OpenCL
!! program, timing the creation under the CL_BUILD_KERNEL profile region.
subroutine opencl_create_kernel(kernel, prog, name)
  type(cl_kernel),  intent(inout) :: kernel
  type(cl_program), intent(inout) :: prog
  character(len=*), intent(in)    :: name

  integer :: ierr

  push_sub(opencl_create_kernel)
  call profiling_in("CL_BUILD_KERNEL", exclude = .true.)

  ! Cleanup: this routine is only compiled under HAVE_OPENCL, so the former
  ! nested #ifdef HAVE_OPENCL around the call below was redundant and removed.
  kernel = clcreatekernel(prog, name, ierr)
  if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateKernel")

  call profiling_out("CL_BUILD_KERNEL")
  pop_sub(opencl_create_kernel)
end subroutine opencl_create_kernel
1685#endif
1686
1687 ! ------------------------------------------------
1688#ifdef HAVE_OPENCL
!> Translate an OpenCL status code into its symbolic name and abort the run
!! through messages_fatal, prefixing the name of the failing call.
!! Unknown codes are reported numerically.
1689 subroutine opencl_print_error(ierr, name)
1690 integer, intent(in) :: ierr
1691 character(len=*), intent(in) :: name
1692
1693 character(len=40) :: errcode
1694
1695 push_sub(opencl_print_error)
1696
1697 select case (ierr)
1698 case (cl_success); errcode = 'CL_SUCCESS '
1699 case (cl_device_not_found); errcode = 'CL_DEVICE_NOT_FOUND '
1700 case (cl_device_not_available); errcode = 'CL_DEVICE_NOT_AVAILABLE '
1701 case (cl_compiler_not_available); errcode = 'CL_COMPILER_NOT_AVAILABLE '
1702 case (cl_mem_object_allocation_failure); errcode = 'CL_MEM_OBJECT_ALLOCATION_FAILURE '
1703 case (cl_out_of_resources); errcode = 'CL_OUT_OF_RESOURCES '
1704 case (cl_out_of_host_memory); errcode = 'CL_OUT_OF_HOST_MEMORY '
1705 case (cl_profiling_info_not_available); errcode = 'CL_PROFILING_INFO_NOT_AVAILABLE '
1706 case (cl_mem_copy_overlap); errcode = 'CL_MEM_COPY_OVERLAP '
1707 case (cl_image_format_mismatch); errcode = 'CL_IMAGE_FORMAT_MISMATCH '
1708 case (cl_image_format_not_supported); errcode = 'CL_IMAGE_FORMAT_NOT_SUPPORTED '
1709 case (cl_build_program_failure); errcode = 'CL_BUILD_PROGRAM_FAILURE '
1710 case (cl_map_failure); errcode = 'CL_MAP_FAILURE '
1711 case (cl_invalid_value); errcode = 'CL_INVALID_VALUE '
1712 case (cl_invalid_device_type); errcode = 'CL_INVALID_DEVICE_TYPE '
1713 case (cl_invalid_platform); errcode = 'CL_INVALID_PLATFORM '
1714 case (cl_invalid_device); errcode = 'CL_INVALID_DEVICE '
1715 case (cl_invalid_context); errcode = 'CL_INVALID_CONTEXT '
1716 case (cl_invalid_queue_properties); errcode = 'CL_INVALID_QUEUE_PROPERTIES '
1717 case (cl_invalid_command_queue); errcode = 'CL_INVALID_COMMAND_QUEUE '
1718 case (cl_invalid_host_ptr); errcode = 'CL_INVALID_HOST_PTR '
1719 case (cl_invalid_mem_object); errcode = 'CL_INVALID_MEM_OBJECT '
1720 case (cl_invalid_image_format_descriptor); errcode = 'CL_INVALID_IMAGE_FORMAT_DESCRIPTOR '
1721 case (cl_invalid_image_size); errcode = 'CL_INVALID_IMAGE_SIZE '
1722 case (cl_invalid_sampler); errcode = 'CL_INVALID_SAMPLER '
1723 case (cl_invalid_binary); errcode = 'CL_INVALID_BINARY '
1724 case (cl_invalid_build_options); errcode = 'CL_INVALID_BUILD_OPTIONS '
1725 case (cl_invalid_program); errcode = 'CL_INVALID_PROGRAM '
1726 case (cl_invalid_program_executable); errcode = 'CL_INVALID_PROGRAM_EXECUTABLE '
1727 case (cl_invalid_kernel_name); errcode = 'CL_INVALID_KERNEL_NAME '
1728 case (cl_invalid_kernel_definition); errcode = 'CL_INVALID_KERNEL_DEFINITION '
1729 case (cl_invalid_kernel); errcode = 'CL_INVALID_KERNEL '
1730 case (cl_invalid_arg_index); errcode = 'CL_INVALID_ARG_INDEX '
1731 case (cl_invalid_arg_value); errcode = 'CL_INVALID_ARG_VALUE '
1732 case (cl_invalid_arg_size); errcode = 'CL_INVALID_ARG_SIZE '
1733 case (cl_invalid_kernel_args); errcode = 'CL_INVALID_KERNEL_ARGS '
1734 case (cl_invalid_work_dimension); errcode = 'CL_INVALID_WORK_DIMENSION '
1735 case (cl_invalid_work_group_size); errcode = 'CL_INVALID_WORK_GROUP_SIZE '
1736 case (cl_invalid_work_item_size); errcode = 'CL_INVALID_WORK_ITEM_SIZE '
1737 case (cl_invalid_global_offset); errcode = 'CL_INVALID_GLOBAL_OFFSET '
1738 case (cl_invalid_event_wait_list); errcode = 'CL_INVALID_EVENT_WAIT_LIST '
1739 case (cl_invalid_event); errcode = 'CL_INVALID_EVENT '
1740 case (cl_invalid_operation); errcode = 'CL_INVALID_OPERATION '
1741 case (cl_invalid_gl_object); errcode = 'CL_INVALID_GL_OBJECT '
1742 case (cl_invalid_buffer_size); errcode = 'CL_INVALID_BUFFER_SIZE '
1743 case (cl_invalid_mip_level); errcode = 'CL_INVALID_MIP_LEVEL '
1744 case (cl_invalid_global_work_size); errcode = 'CL_INVALID_GLOBAL_WORK_SIZE '
1745 case (cl_platform_not_found_khr); errcode = 'CL_PLATFORM_NOT_FOUND_KHR'
1746 case default
1747 write(errcode, '(i10)') ierr
1748 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1749 end select
1750
1751 message(1) = 'OpenCL '//trim(name)//' '//trim(errcode)
1752 call messages_fatal(1)
1753
1754 pop_sub(opencl_print_error)
1755 end subroutine opencl_print_error
1756#endif
1757
1758 ! ----------------------------------------------------
1759
!> Translate a clBLAS/CLBlast status code into its symbolic name and abort
!! the run through messages_fatal, naming the failing routine.
!!
!! \param ierr  status code returned by the clBLAS/CLBlast call
!! \param name  name of the routine that produced the error
subroutine clblas_print_error(ierr, name)
  integer, intent(in) :: ierr
  character(len=*), intent(in) :: name

  character(len=40) :: errcode

  push_sub(clblas_print_error)

  ! Robustness fix: initialize errcode unconditionally so the fatal message
  ! below never reads an undefined string when neither HAVE_CLBLAS nor
  ! HAVE_CLBLAST is defined (the select case is then preprocessed away).
  ! When the constants are available, the symbolic name overrides this.
  write(errcode, '(i10)') ierr
  errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'

#if defined(HAVE_CLBLAS) || defined(HAVE_CLBLAST)
  select case (ierr)
  case (clblassuccess); errcode = 'clblasSuccess'
  case (clblasinvalidvalue); errcode = 'clblasInvalidValue'
  case (clblasinvalidcommandqueue); errcode = 'clblasInvalidCommandQueue'
  case (clblasinvalidcontext); errcode = 'clblasInvalidContext'
  case (clblasinvalidmemobject); errcode = 'clblasInvalidMemObject'
  case (clblasinvaliddevice); errcode = 'clblasInvalidDevice'
  case (clblasinvalideventwaitlist); errcode = 'clblasInvalidEventWaitList'
  case (clblasoutofresources); errcode = 'clblasOutOfResources'
  case (clblasoutofhostmemory); errcode = 'clblasOutOfHostMemory'
  case (clblasinvalidoperation); errcode = 'clblasInvalidOperation'
  case (clblascompilernotavailable); errcode = 'clblasCompilerNotAvailable'
  case (clblasbuildprogramfailure); errcode = 'clblasBuildProgramFailure'
  case (clblasnotimplemented); errcode = 'clblasNotImplemented'
  case (clblasnotinitialized); errcode = 'clblasNotInitialized'
  case (clblasinvalidmata); errcode = 'clblasInvalidMatA'
  case (clblasinvalidmatb); errcode = 'clblasInvalidMatB'
  case (clblasinvalidmatc); errcode = 'clblasInvalidMatC'
  case (clblasinvalidvecx); errcode = 'clblasInvalidVecX'
  case (clblasinvalidvecy); errcode = 'clblasInvalidVecY'
  case (clblasinvaliddim); errcode = 'clblasInvalidDim'
  case (clblasinvalidleaddima); errcode = 'clblasInvalidLeadDimA'
  case (clblasinvalidleaddimb); errcode = 'clblasInvalidLeadDimB'
  case (clblasinvalidleaddimc); errcode = 'clblasInvalidLeadDimC'
  case (clblasinvalidincx); errcode = 'clblasInvalidIncX'
  case (clblasinvalidincy); errcode = 'clblasInvalidIncY'
  case (clblasinsufficientmemmata); errcode = 'clblasInsufficientMemMatA'
  case (clblasinsufficientmemmatb); errcode = 'clblasInsufficientMemMatB'
  case (clblasinsufficientmemmatc); errcode = 'clblasInsufficientMemMatC'
  case (clblasinsufficientmemvecx); errcode = 'clblasInsufficientMemVecX'
  case (clblasinsufficientmemvecy); errcode = 'clblasInsufficientMemVecY'
#ifdef HAVE_CLBLAST
  case (clblastinsufficientmemorytemp); errcode = 'clblastInsufficientMemoryTemp'
  case (clblastinvalidbatchcount); errcode = 'clblastInvalidBatchCount'
  case (clblastinvalidoverridekernel); errcode = 'clblastInvalidOverrideKernel'
  case (clblastmissingoverrideparameter); errcode = 'clblastMissingOverrideParameter'
  case (clblastinvalidlocalmemusage); errcode = 'clblastInvalidLocalMemUsage'
  case (clblastnohalfprecision); errcode = 'clblastNoHalfPrecision'
  case (clblastnodoubleprecision); errcode = 'clblastNoDoublePrecision'
  case (clblastinvalidvectorscalar); errcode = 'clblastInvalidVectorScalar'
  case (clblastinsufficientmemoryscalar); errcode = 'clblastInsufficientMemoryScalar'
  case (clblastdatabaseerror); errcode = 'clblastDatabaseError'
  case (clblastunknownerror); errcode = 'clblastUnknownError'
  case (clblastunexpectederror); errcode = 'clblastUnexpectedError'
#endif
  end select
#endif

  message(1) = 'Error in calling clblas routine '//trim(name)//' : '//trim(errcode)
  call messages_fatal(1)

  pop_sub(clblas_print_error)
end subroutine clblas_print_error
1825
1826 ! ----------------------------------------------------
1827 subroutine clfft_print_error(ierr, name)
1828 integer, intent(in) :: ierr
1829 character(len=*), intent(in) :: name
1830
1831 character(len=40) :: errcode
1832
1833 push_sub(clfft_print_error)
1834#ifdef HAVE_CLFFT
1835 select case (ierr)
1836 case (clfft_invalid_global_work_size); errcode = 'CLFFT_INVALID_GLOBAL_WORK_SIZE'
1837 case (clfft_invalid_mip_level); errcode = 'CLFFT_INVALID_MIP_LEVEL'
1838 case (clfft_invalid_buffer_size); errcode = 'CLFFT_INVALID_BUFFER_SIZE'
1839 case (clfft_invalid_gl_object); errcode = 'CLFFT_INVALID_GL_OBJECT'
1840 case (clfft_invalid_operation); errcode = 'CLFFT_INVALID_OPERATION'
1841 case (clfft_invalid_event); errcode = 'CLFFT_INVALID_EVENT'
1842 case (clfft_invalid_event_wait_list); errcode = 'CLFFT_INVALID_EVENT_WAIT_LIST'
1843 case (clfft_invalid_global_offset); errcode = 'CLFFT_INVALID_GLOBAL_OFFSET'
1844 case (clfft_invalid_work_item_size); errcode = 'CLFFT_INVALID_WORK_ITEM_SIZE'
1845 case (clfft_invalid_work_group_size); errcode = 'CLFFT_INVALID_WORK_GROUP_SIZE'
1846 case (clfft_invalid_work_dimension); errcode = 'CLFFT_INVALID_WORK_DIMENSION'
1847 case (clfft_invalid_kernel_args); errcode = 'CLFFT_INVALID_KERNEL_ARGS'
1848 case (clfft_invalid_arg_size); errcode = 'CLFFT_INVALID_ARG_SIZE'
1849 case (clfft_invalid_arg_value); errcode = 'CLFFT_INVALID_ARG_VALUE'
1850 case (clfft_invalid_arg_index); errcode = 'CLFFT_INVALID_ARG_INDEX'
1851 case (clfft_invalid_kernel); errcode = 'CLFFT_INVALID_KERNEL'
1852 case (clfft_invalid_kernel_definition); errcode = 'CLFFT_INVALID_KERNEL_DEFINITION'
1853 case (clfft_invalid_kernel_name); errcode = 'CLFFT_INVALID_KERNEL_NAME'
1854 case (clfft_invalid_program_executable); errcode = 'CLFFT_INVALID_PROGRAM_EXECUTABLE'
1855 case (clfft_invalid_program); errcode = 'CLFFT_INVALID_PROGRAM'
1856 case (clfft_invalid_build_options); errcode = 'CLFFT_INVALID_BUILD_OPTIONS'
1857 case (clfft_invalid_binary); errcode = 'CLFFT_INVALID_BINARY'
1858 case (clfft_invalid_sampler); errcode = 'CLFFT_INVALID_SAMPLER'
1859 case (clfft_invalid_image_size); errcode = 'CLFFT_INVALID_IMAGE_SIZE'
1860 case (clfft_invalid_image_format_descriptor); errcode = 'CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR'
1861 case (clfft_invalid_mem_object); errcode = 'CLFFT_INVALID_MEM_OBJECT'
1862 case (clfft_invalid_host_ptr); errcode = 'CLFFT_INVALID_HOST_PTR'
1863 case (clfft_invalid_command_queue); errcode = 'CLFFT_INVALID_COMMAND_QUEUE'
1864 case (clfft_invalid_queue_properties); errcode = 'CLFFT_INVALID_QUEUE_PROPERTIES'
1865 case (clfft_invalid_context); errcode = 'CLFFT_INVALID_CONTEXT'
1866 case (clfft_invalid_device); errcode = 'CLFFT_INVALID_DEVICE'
1867 case (clfft_invalid_platform); errcode = 'CLFFT_INVALID_PLATFORM'
1868 case (clfft_invalid_device_type); errcode = 'CLFFT_INVALID_DEVICE_TYPE'
1869 case (clfft_invalid_value); errcode = 'CLFFT_INVALID_VALUE'
1870 case (clfft_map_failure); errcode = 'CLFFT_MAP_FAILURE'
1871 case (clfft_build_program_failure); errcode = 'CLFFT_BUILD_PROGRAM_FAILURE'
1872 case (clfft_image_format_not_supported); errcode = 'CLFFT_IMAGE_FORMAT_NOT_SUPPORTED'
1873 case (clfft_image_format_mismatch); errcode = 'CLFFT_IMAGE_FORMAT_MISMATCH'
1874 case (clfft_mem_copy_overlap); errcode = 'CLFFT_MEM_COPY_OVERLAP'
1875 case (clfft_profiling_info_not_available); errcode = 'CLFFT_PROFILING_INFO_NOT_AVAILABLE'
1876 case (clfft_out_of_host_memory); errcode = 'CLFFT_OUT_OF_HOST_MEMORY'
1877 case (clfft_out_of_resources); errcode = 'CLFFT_OUT_OF_RESOURCES'
1878 case (clfft_mem_object_allocation_failure); errcode = 'CLFFT_MEM_OBJECT_ALLOCATION_FAILURE'
1879 case (clfft_compiler_not_available); errcode = 'CLFFT_COMPILER_NOT_AVAILABLE'
1880 case (clfft_device_not_available); errcode = 'CLFFT_DEVICE_NOT_AVAILABLE'
1881 case (clfft_device_not_found); errcode = 'CLFFT_DEVICE_NOT_FOUND'
1882 case (clfft_success); errcode = 'CLFFT_SUCCESS'
1883 case (clfft_bugcheck); errcode = 'CLFFT_BUGCHECK'
1884 case (clfft_notimplemented); errcode = 'CLFFT_NOTIMPLEMENTED'
1885 case (clfft_file_not_found); errcode = 'CLFFT_FILE_NOT_FOUND'
1886 case (clfft_file_create_failure); errcode = 'CLFFT_FILE_CREATE_FAILURE'
1887 case (clfft_version_mismatch); errcode = 'CLFFT_VERSION_MISMATCH'
1888 case (clfft_invalid_plan); errcode = 'CLFFT_INVALID_PLAN'
1889 case (clfft_device_no_double); errcode = 'CLFFT_DEVICE_NO_DOUBLE'
1890 case (clfft_endstatus); errcode = 'CLFFT_ENDSTATUS'
1891 case default
1892 write(errcode, '(i10)') ierr
1893 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1894 end select
1895#endif
1896
1897 message(1) = 'clfft '//trim(name)//' '//trim(errcode)
1898 call messages_fatal(1)
1899
1900 pop_sub(clfft_print_error)
1901 end subroutine clfft_print_error
1902
1903 ! ----------------------------------------------------
1904
#ifdef HAVE_OPENCL
  !> Return .true. if the OpenCL device advertises the given extension.
  !!
  !! The device's CL_DEVICE_EXTENSIONS string is queried and searched for
  !! the requested substring.
  logical function f90_cl_device_has_extension(device, extension) result(has)
    type(cl_device_id), intent(inout) :: device
    character(len=*),   intent(in)    :: extension

    integer :: cl_status
    character(len=2048) :: all_extensions

    ! initialize the buffer, so a failed query cannot leave it holding
    ! stack garbage that `index` would then scan
    all_extensions = ''
    ! the redundant inner #ifdef HAVE_OPENCL was removed: this whole
    ! function is already compiled only under HAVE_OPENCL
    call clgetdeviceinfo(device, cl_device_extensions, all_extensions, cl_status)
    if (cl_status /= cl_success) call opencl_print_error(cl_status, "clGetDeviceInfo")

    has = index(all_extensions, extension) /= 0

  end function f90_cl_device_has_extension
#endif
1921
1922 ! ----------------------------------------------------
1923
!! Fill `nval` elements of a device buffer with the byte value `val`,
!! optionally starting at element `offset`. Unless `async` is .true.,
!! the routine waits for the fill to complete before returning.
!! `nval` and `offset` are given in elements of `type`; they are
!! converted to bytes internally.
1924 subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
1925 type(accel_mem_t), intent(inout) :: buffer
1926 type(type_t), intent(in) :: type
1927 integer(int8), intent(in) :: val
1928 integer(int64), intent(in) :: nval
1929 integer(int64), optional, intent(in) :: offset
1930 logical, optional, intent(in) :: async
1931
1932#ifdef HAVE_OPENCL
1933 integer :: ierr
1934#endif
1935 integer(int64) :: nval_, offset_, type_size
1936
1937 push_sub(accel_set_buffer_to)
1938
!! nothing to do for an empty fill
1939 if (nval == 0) then
1940 pop_sub(accel_set_buffer_to)
1941 return
1942 end if
1943 assert(nval > 0)
1944
!! an offset past the end of the buffer is silently treated as a no-op
1945 if (present(offset)) then
1946 assert(offset >= 0)
1947 if(offset > buffer%size) then
1948 pop_sub(accel_set_buffer_to)
1949 return
1950 end if
1951 end if
1952
!! convert element counts to byte counts for the low-level fill calls
1953 type_size = types_get_size(type)
1955 nval_ = nval*type_size
1956
1957 offset_ = 0_int64
1958 if (present(offset)) offset_ = offset*type_size
1959
1960#ifdef HAVE_OPENCL
1961 call clenqueuefillbuffer(accel%command_queue, buffer%mem, val, offset_, nval_, ierr)
1962 if (ierr /= cl_success) call opencl_print_error(ierr, "clEnqueueFillBuffer")
1963#else
1964 call cuda_mem_set_async(buffer%mem, val, nval_, offset_)
1965#endif
!! both backends enqueue asynchronously; synchronize unless the caller
!! explicitly asked for an async fill
1966 if(.not. optional_default(async, .false.)) call accel_finish()
1967
1968 pop_sub(accel_set_buffer_to)
1969 end subroutine accel_set_buffer_to
1970
1971 ! ----------------------------------------------------
1972
!! Zero `nval` elements of a device buffer (int64 count), optionally
!! starting at element `offset`; thin wrapper over accel_set_buffer_to
!! with a zero fill byte.
!! NOTE(review): original lines 1980/1984 are missing from this listing
!! (presumably push_sub/pop_sub) — confirm against the repository.
1973 subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
1974 type(accel_mem_t), intent(inout) :: buffer
1975 type(type_t), intent(in) :: type
1976 integer(int64), intent(in) :: nval
1977 integer(int64), optional, intent(in) :: offset
1978 logical, optional, intent(in) :: async
1979
1981
1982 call accel_set_buffer_to(buffer, type, int(z'00', int8), nval, offset, async)
1983
1985 end subroutine accel_set_buffer_to_zero_i8
1986
1987 ! ----------------------------------------------------
1988
!! Zero `nval` elements of a device buffer (int32 count); widens the
!! counts to int64 and forwards to accel_set_buffer_to_zero_i8.
1989 subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
1990 type(accel_mem_t), intent(inout) :: buffer
1991 type(type_t), intent(in) :: type
1992 integer(int32), intent(in) :: nval
1993 integer(int32), optional, intent(in) :: offset
1994 logical, optional, intent(in) :: async
1995
1997
!! `offset` can only be forwarded when present, hence the two branches
1998 if (present(offset)) then
1999 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), int(offset, int64), async=async)
2000 else
2001 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), async=async)
2002 end if
2003
2005 end subroutine accel_set_buffer_to_zero_i4
2006
2007 ! ----------------------------------------------------
2008
2009 subroutine opencl_check_bandwidth()
2010 integer :: itime
2011 integer, parameter :: times = 10
2012 integer :: size
2013 real(real64) :: time, stime
2014 real(real64) :: read_bw, write_bw
2015 type(accel_mem_t) :: buff
2016 real(real64), allocatable :: data(:)
2017
2018 call messages_new_line()
2019 call messages_write('Info: Benchmarking the bandwidth between main memory and device memory')
2020 call messages_new_line()
2021 call messages_info()
2022
2023 call messages_write(' Buffer size Read bw Write bw')
2024 call messages_new_line()
2025 call messages_write(' [MiB] [MiB/s] [MiB/s]')
2026 call messages_info()
2027
2028 size = 15000
2029 do
2030 safe_allocate(data(1:size))
2031 call accel_create_buffer(buff, accel_mem_read_write, type_float, size)
2032
2033 stime = loct_clock()
2034 do itime = 1, times
2035 call accel_write_buffer(buff, size, data)
2036 call accel_finish()
2037 end do
2038 time = (loct_clock() - stime)/real(times, real64)
2039
2040 write_bw = real(size, real64) *8.0_real64/time
2042 stime = loct_clock()
2043 do itime = 1, times
2044 call accel_read_buffer(buff, size, data)
2045 end do
2046 call accel_finish()
2047
2048 time = (loct_clock() - stime)/real(times, real64)
2049 read_bw = real(size, real64) *8.0_real64/time
2050
2051 call messages_write(size*8.0_real64/1024.0_real64**2)
2052 call messages_write(write_bw/1024.0_real64**2, fmt = '(f10.1)')
2053 call messages_write(read_bw/1024.0_real64**2, fmt = '(f10.1)')
2054 call messages_info()
2055
2056 call accel_release_buffer(buff)
2057
2058 safe_deallocate_a(data)
2059
2060 size = int(size*2.0)
2061
2062 if (size > 50000000) exit
2063 end do
2064 end subroutine opencl_check_bandwidth
2065
2066 ! ----------------------------------------------------
2067
!! Return whether kernels should be compiled to use on-device shared
!! memory (mirrors the module-level accel%shared_mem setting).
2068 logical pure function accel_use_shared_mem() result(use_shared_mem)
2069
2070 use_shared_mem = accel%shared_mem
2071
2072 end function accel_use_shared_mem
2073
2074 !------------------------------------------------------------
2075
!! Initialize the global kernel registry: reset the linked list of
!! registered kernels and initialize the CUDA module map.
2076 subroutine accel_kernel_global_init()
2077
2078 push_sub(accel_kernel_global_init)
2079
!! `head` is the module-level list of kernels built so far
2080 nullify(head)
2081
!! called unconditionally — presumably a no-op stub exists for
!! non-CUDA builds; confirm in cuda_oct_m
2082 call cuda_module_map_init(accel%module_map)
2083
2085 end subroutine accel_kernel_global_init
2086
2087 !------------------------------------------------------------
2088
!! Tear down the global kernel registry: walk the linked list of
!! registered kernels and release the CUDA module map.
!! NOTE(review): original line 2097 is missing from this listing —
!! presumably the per-kernel cleanup call on `head` before advancing;
!! confirm against the repository.
2089 subroutine accel_kernel_global_end()
2090 type(accel_kernel_t), pointer :: next_head
2091
2092 push_sub(accel_kernel_global_end)
2093
!! save the successor before the current node is processed
2094 do
2095 if (.not. associated(head)) exit
2096 next_head => head%next
2098 head => next_head
2099 end do
2100
2101 if (accel_is_enabled()) then
2102 call cuda_module_map_end(accel%module_map)
2103 end if
2106 end subroutine accel_kernel_global_end
2107
2108 !------------------------------------------------------------
2109
!! Compile (or fetch from the module cache) the kernel `kernel_name`
!! from source file `file_name` and store the handle in `this`.
!! `flags` are extra compiler flags appended to the defaults.
!! Marks `this` as initialized on success.
2110 subroutine accel_kernel_build(this, file_name, kernel_name, flags)
2111 type(accel_kernel_t), intent(inout) :: this
2112 character(len=*), intent(in) :: file_name
2113 character(len=*), intent(in) :: kernel_name
2114 character(len=*), optional, intent(in) :: flags
2115
2116#ifdef HAVE_OPENCL
2117 type(cl_program) :: prog
2118#endif
2119#ifdef HAVE_CUDA
2120 character(len=1000) :: all_flags
2121#endif
2122
2123 push_sub(accel_kernel_build)
2124
!! compilation time is profiled separately and excluded from callers
2125 call profiling_in("ACCEL_COMPILE", exclude = .true.)
!! CUDA path: assemble include path + debug flag + optional extras,
!! then build via the module map (which caches compiled modules)
2127#ifdef HAVE_CUDA
2128 all_flags = '-I'//trim(conf%share)//'/opencl/'//" "//trim(accel%debug_flag)
2129
2130 if (accel_use_shared_mem()) then
2131 all_flags = trim(all_flags)//' -DSHARED_MEM'
2132 end if
2133
2134 if (present(flags)) then
2135 all_flags = trim(all_flags)//' '//trim(flags)
2136 end if
2137
2138 call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, trim(file_name), trim(all_flags))
2139
2140 call cuda_create_kernel(this%cuda_kernel, this%cuda_module, trim(kernel_name))
2141 call cuda_alloc_arg_array(this%arguments)
2142
2143 this%cuda_shared_mem = 0
2144#endif
2145
!! OpenCL path: build program, extract kernel, release the program
!! (the kernel keeps the needed reference)
2146#ifdef HAVE_OPENCL
2147 call opencl_build_program(prog, trim(conf%share)//'/opencl/'//trim(file_name), flags = flags)
2148 call opencl_create_kernel(this%kernel, prog, trim(kernel_name))
2149 call opencl_release_program(prog)
2150#endif
2151
2152 this%initialized = .true.
2153 this%kernel_name = trim(kernel_name)
2154
2155 call profiling_out("ACCEL_COMPILE")
2156
2157 pop_sub(accel_kernel_build)
2158 end subroutine accel_kernel_build
2159
2160 !------------------------------------------------------------
2161
!! Release the device resources held by one kernel handle and mark it
!! uninitialized. Compiled modules are intentionally NOT released here
!! (they are shared via the module map, not owned per kernel).
2162 subroutine accel_kernel_end(this)
2163 type(accel_kernel_t), intent(inout) :: this
2164#ifdef HAVE_OPENCL
2165 integer :: ierr
2166#endif
2167
2168 push_sub(accel_kernel_end)
2169
2170#ifdef HAVE_CUDA
2171 call cuda_free_arg_array(this%arguments)
2172 call cuda_release_kernel(this%cuda_kernel)
2173 ! modules are not released here, since they are not associated to a kernel
2174#endif
2175
2176#ifdef HAVE_OPENCL
2177 call clreleasekernel(this%kernel, ierr)
2178 if (ierr /= cl_success) call opencl_print_error(ierr, "release_kernel")
2179#endif
2180 this%initialized = .false.
2181
2182 pop_sub(accel_kernel_end)
2183 end subroutine accel_kernel_end
2184
2185 !------------------------------------------------------------
2186
!! Lazily build a kernel on first use: if `this` is not yet
!! initialized, compile it via accel_kernel_build and push it onto the
!! global `head` list so accel_kernel_global_end can clean it up.
2187 subroutine accel_kernel_start_call(this, file_name, kernel_name, flags)
2188 type(accel_kernel_t), target, intent(inout) :: this
2189 character(len=*), intent(in) :: file_name
2190 character(len=*), intent(in) :: kernel_name
2191 character(len=*), optional, intent(in) :: flags
2192
2193 push_sub(accel_kernel_start_call)
2195 if (.not. this%initialized) then
2196 call accel_kernel_build(this, file_name, kernel_name, flags)
!! prepend to the global registry (requires the `target` attribute)
2197 this%next => head
2198 head => this
2199 end if
2200
2202 end subroutine accel_kernel_start_call
2203
2204 !--------------------------------------------------------------
2205
!! Total global (device) memory in bytes, as recorded at accel_init.
2206 integer(int64) pure function accel_global_memory_size() result(size)
2207
2208 size = accel%global_memory_size
2209
2210 end function accel_global_memory_size
2211
2212 !--------------------------------------------------------------
2213
!! Per-workgroup local (shared) memory in bytes, as recorded at accel_init.
2214 integer(int64) pure function accel_local_memory_size() result(size)
2215
2216 size = accel%local_memory_size
2217
2218 end function accel_local_memory_size
2219
2220 !--------------------------------------------------------------
!! Maximum grid extent usable along launch dimension `dim`
!! (1-based). Returns 0 when no accelerator support is compiled in.
2222 integer pure function accel_max_size_per_dim(dim) result(size)
2223 integer, intent(in) :: dim
2224
2225 size = 0
2226#ifdef HAVE_OPENCL
2227 size = 32768 ! Setting here arbitrarily higher dimensions to 32768, as 2**30 leads to a
2228 ! value of zero when multiplied by 2048 and converted to integer 4.
2229 if (dim == 1) size = 2**30
2230#endif
!! CUDA uses the same limits as the OpenCL branch above
2231#ifdef HAVE_CUDA
2232 size = 32768
2233 if (dim == 1) size = 2**30
2234#endif
2235 end function accel_max_size_per_dim
2236
2237 ! ------------------------------------------------------
2238
!! Select the CUDA stream identified by `stream_number` as current and
!! point the cuBLAS handle at it. No-op when acceleration is disabled
!! or for non-CUDA builds.
2239 subroutine accel_set_stream(stream_number)
2240 integer, intent(in) :: stream_number
2241
2242 push_sub(accel_set_stream)
2243
2244 if (accel_is_enabled()) then
2245#ifdef HAVE_CUDA
2246 call cuda_set_stream(accel%cuda_stream, stream_number)
!! keep cuBLAS operating on the newly selected stream
2247 call cublas_set_stream(accel%cublas_handle, accel%cuda_stream)
2248#endif
2249 end if
2250
2251 pop_sub(accel_set_stream)
2252 end subroutine accel_set_stream
2253
2254 ! ------------------------------------------------------
2255
!! Query the number of the currently selected CUDA stream into
!! `stream_number`. When acceleration is disabled or CUDA is not
!! compiled in, the argument is left unchanged (hence intent(inout)).
2256 subroutine accel_get_stream(stream_number)
2257 integer, intent(inout) :: stream_number
2258
2259 push_sub(accel_get_stream)
2260
2261 if (accel_is_enabled()) then
2262#ifdef HAVE_CUDA
2263 call cuda_get_stream(stream_number)
2264#endif
2265 end if
2266
2267 pop_sub(accel_get_stream)
2268 end subroutine accel_get_stream
2269
2270 ! ------------------------------------------------------
2271
2274
2275 if (accel_is_enabled()) then
2276#ifdef HAVE_CUDA
2277 call cuda_synchronize_all_streams()
2278#endif
2279 end if
2280
2282 end subroutine accel_synchronize_all_streams
2283
2284 function daccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
2285 type(c_ptr), intent(in) :: buffer
2286 integer(int64), intent(in) :: offset
2287 type(c_ptr) :: buffer_offset
2288
2290#ifdef HAVE_CUDA
2291 call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)
2292#else
2293 ! this is needed to make the compiler happy for non-GPU compilations
2294 buffer_offset = buffer
2295#endif
2298
2299 function zaccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
2300 type(c_ptr), intent(in) :: buffer
2301 integer(int64), intent(in) :: offset
2302 type(c_ptr) :: buffer_offset
2303
2305#ifdef HAVE_CUDA
2306 call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)
2307#else
2308 ! this is needed to make the compiler happy for non-GPU compilations
2309 buffer_offset = buffer
2310#endif
!! Release/clean a raw device pointer obtained from the CUDA backend;
!! no-op for non-CUDA builds.
2314 subroutine accel_clean_pointer(buffer)
2315 type(c_ptr), intent(in) :: buffer
2316
2317 push_sub(accel_clean_pointer)
2318#ifdef HAVE_CUDA
2319 call cuda_clean_pointer(buffer)
2320#endif
2321 pop_sub(accel_clean_pointer)
2322 end subroutine accel_clean_pointer
2323
!! Compute launch sizes for kernels that unfold an array across warps:
!! the grid is expanded by the warp size (and the thread block set to
!! one warp), except on AMD/HIP where the unfolding is skipped.
2327 subroutine accel_get_unfolded_size(size, grid_size, thread_block_size)
2328 integer(int64), intent(in) :: size
2329 integer(int64), intent(out) :: grid_size
2330 integer(int64), intent(out) :: thread_block_size
2331
2332 push_sub(accel_get_unfolded_size)
2333#ifdef __HIP_PLATFORM_AMD__
2334 ! not beneficial for AMD chips
2335 grid_size = size
2336 thread_block_size = size
2337#else
2338 grid_size = size * accel%warp_size
2339 thread_block_size = accel%warp_size
2340#endif
2342 end subroutine accel_get_unfolded_size
2343
2344#include "undef.F90"
2345#include "real.F90"
2346#include "accel_inc.F90"
2347
2348#include "undef.F90"
2349#include "complex.F90"
2350#include "accel_inc.F90"
2351
2352#include "undef.F90"
2353#include "integer.F90"
2354#include "accel_inc.F90"
2355
2356#include "undef.F90"
2357#include "integer8.F90"
2358#include "accel_inc.F90"
2359
2360end module accel_oct_m
2361
2362!! Local Variables:
2363!! mode: f90
2364!! coding: utf-8
2365!! End:
subroutine device_info()
Definition: accel.F90:687
subroutine laccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4339
integer, parameter opencl_accelerator
Definition: accel.F90:393
type(accel_kernel_t), target, save, public kernel_density_real
Definition: accel.F90:291
subroutine zaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2872
integer, parameter opencl_default
Definition: accel.F90:393
type(accel_kernel_t), target, save, public kernel_vpsi_complex
Definition: accel.F90:278
type(accel_kernel_t), target, save, public dkernel_batch_axpy
Definition: accel.F90:299
subroutine, public accel_clean_pointer(buffer)
Definition: accel.F90:1494
subroutine accel_kernel_global_end()
Definition: accel.F90:1319
subroutine zaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2416
subroutine, public accel_get_unfolded_size(size, grid_size, thread_block_size)
Get unfolded size: some kernels (e.g. projectors) unfold the array across warps as an optimization....
Definition: accel.F90:1507
subroutine laccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:4079
subroutine iaccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:3107
pure logical function, public accel_allow_cpu_only()
Definition: accel.F90:428
subroutine daccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2059
logical pure function, public accel_use_shared_mem()
Definition: accel.F90:1298
subroutine zaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2781
subroutine zaccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2917
subroutine laccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3861
subroutine daccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:1795
type(accel_kernel_t), target, save, public kernel_vpsi_spinors
Definition: accel.F90:279
subroutine laccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4226
subroutine zaccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:2545
subroutine zaccel_write_buffer_single(this, data, async)
Definition: accel.F90:2314
subroutine daccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:1881
type(accel_kernel_t), target, save, public kernel_ghost_reorder
Definition: accel.F90:290
subroutine iaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3616
subroutine zaccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:2603
subroutine laccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4250
subroutine iaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3239
type(accel_kernel_t), target, save, public zkernel_batch_axpy
Definition: accel.F90:300
subroutine zaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2634
integer, parameter cl_plat_nvidia
Definition: accel.F90:400
subroutine iaccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:3088
subroutine zaccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2944
subroutine iaccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:3162
subroutine, public accel_kernel_start_call(this, file_name, kernel_name, flags)
Definition: accel.F90:1375
subroutine iaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3138
subroutine iaccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:3666
subroutine iaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3457
subroutine zaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:2478
integer, parameter cl_plat_ati
Definition: accel.F90:400
subroutine, public accel_get_stream(stream_number)
Definition: accel.F90:1436
subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
Definition: accel.F90:852
subroutine zaccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:2677
integer(int64) pure function, public accel_global_memory_size()
Definition: accel.F90:1394
subroutine laccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:3811
type(accel_kernel_t), target, save, public zkernel_ax_function_py
Definition: accel.F90:302
subroutine daccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:1862
subroutine daccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:1663
subroutine zaccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:2764
subroutine daccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2172
subroutine iaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3356
subroutine accel_set_kernel_arg_local(kernel, narg, type, size)
Definition: accel.F90:1021
subroutine daccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2150
integer(int64) function accel_padded_size_i8(nn)
Definition: accel.F90:822
subroutine laccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3990
subroutine daccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:1605
subroutine iaccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:3639
subroutine zaccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:2658
subroutine iaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3571
subroutine, public accel_finish()
Definition: accel.F90:984
subroutine opencl_check_bandwidth()
Definition: accel.F90:1239
subroutine accel_kernel_global_init()
Definition: accel.F90:1306
subroutine zaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2735
subroutine zaccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:2366
subroutine iaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3503
subroutine laccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:4362
subroutine, public accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
Definition: accel.F90:956
type(accel_kernel_t), target, save, public zzmul
Definition: accel.F90:306
subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
Definition: accel.F90:1154
subroutine laccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:4209
subroutine daccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:1644
subroutine daccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:1936
subroutine zaccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:2584
subroutine daccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:1912
subroutine laccel_write_buffer_single(this, data, async)
Definition: accel.F90:3759
subroutine daccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2083
subroutine iaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:3418
subroutine zaccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:2440
subroutine zaccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:2459
subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
Definition: accel.F90:1203
subroutine zaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2849
logical pure function, public accel_buffer_is_allocated(this)
Definition: accel.F90:976
integer, parameter, public accel_mem_read_write
Definition: accel.F90:195
subroutine, public clfft_print_error(ierr, name)
Definition: accel.F90:1136
subroutine daccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2195
subroutine accel_kernel_end(this)
Definition: accel.F90:1360
type(accel_kernel_t), target, save, public dkernel_ax_function_py
Definition: accel.F90:301
subroutine laccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3962
subroutine zaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2805
subroutine daccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:1694
subroutine laccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:4141
type(c_ptr) function, public daccel_get_pointer_with_offset(buffer, offset)
Definition: accel.F90:1464
subroutine iaccel_write_buffer_single(this, data, async)
Definition: accel.F90:3036
subroutine iaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3527
integer pure function, public accel_max_size_per_dim(dim)
Definition: accel.F90:1410
subroutine iaccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3267
subroutine daccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:1823
type(accel_kernel_t), target, save, public dzmul
Definition: accel.F90:305
subroutine iaccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:3399
subroutine laccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:3885
subroutine zaccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:2385
subroutine laccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:4103
subroutine laccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4317
subroutine iaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3549
subroutine zaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2517
subroutine daccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:2042
subroutine iaccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:3306
subroutine accel_kernel_run_8(kernel, globalsizes, localsizes)
Definition: accel.F90:1051
type(accel_kernel_t), target, save, public kernel_vpsi_spinors_complex
Definition: accel.F90:280
subroutine, public accel_kernel_build(this, file_name, kernel_name, flags)
Definition: accel.F90:1340
subroutine, public accel_init(base_grp, namespace)
Definition: accel.F90:438
subroutine, public accel_end(namespace)
Definition: accel.F90:745
subroutine laccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3772
subroutine, public accel_synchronize_all_streams()
Definition: accel.F90:1452
subroutine, public accel_set_stream(stream_number)
Definition: accel.F90:1419
subroutine, public accel_release_buffer(this, async)
Definition: accel.F90:918
subroutine laccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:4048
type(accel_kernel_t), target, save, public zunpack
Definition: accel.F90:289
subroutine daccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2222
subroutine laccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4294
subroutine iaccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:3380
integer, parameter cl_plat_amd
Definition: accel.F90:400
integer(int32) function accel_padded_size_i4(nn)
Definition: accel.F90:843
subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
Definition: accel.F90:1219
subroutine laccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:3904
subroutine iaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3594
subroutine iaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:3200
pure logical function, public accel_is_enabled()
Definition: accel.F90:418
subroutine daccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:1718
subroutine daccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:1756
integer, parameter cl_plat_intel
Definition: accel.F90:400
subroutine iaccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:3181
integer, parameter, public accel_mem_write_only
Definition: accel.F90:195
subroutine daccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2013
subroutine laccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:4122
subroutine daccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:1955
type(accel_kernel_t), target, save, public kernel_vpsi
Definition: accel.F90:277
subroutine daccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2127
subroutine accel_kernel_run_4(kernel, globalsizes, localsizes)
Definition: accel.F90:1087
subroutine laccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:4389
subroutine iaccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:3325
subroutine laccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:3830
subroutine laccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:3923
subroutine zaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2827
type(c_ptr) function, public zaccel_get_pointer_with_offset(buffer, offset)
Definition: accel.F90:1479
subroutine laccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:4180
subroutine daccel_write_buffer_single(this, data, async)
Definition: accel.F90:1592
subroutine daccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:1737
integer function, public accel_kernel_workgroup_size(kernel)
Definition: accel.F90:1104
integer, parameter opencl_cpu
Definition: accel.F90:393
subroutine zaccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:2327
subroutine, public clblas_print_error(ierr, name)
Definition: accel.F90:1121
type(accel_t), public accel
Definition: accel.F90:270
subroutine laccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4272
subroutine iaccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:3486
subroutine daccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:1974
subroutine iaccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3049
subroutine zaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2894
subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
Definition: accel.F90:865
subroutine laccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:4029
type(accel_kernel_t), target, save, public dunpack
Definition: accel.F90:288
integer(int64) pure function, public accel_local_memory_size()
Definition: accel.F90:1402
subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
Definition: accel.F90:1004
integer pure function, public accel_max_workgroup_size()
Definition: accel.F90:1098
subroutine zaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:2696
subroutine daccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2105
type(accel_kernel_t), pointer head
Definition: accel.F90:412
subroutine, public alloc_cache_put(alloc_cache, size, loc, put)
subroutine, public alloc_cache_get(alloc_cache, size, found, loc)
integer(int64), parameter, public alloc_cache_any_size
real(real64), parameter, public m_zero
Definition: global.F90:190
complex(real64), parameter, public m_z0
Definition: global.F90:200
complex(real64), parameter, public m_z1
Definition: global.F90:201
real(real64), parameter, public m_one
Definition: global.F90:191
This module is intended to contain "only mathematical" functions and procedures.
Definition: math.F90:117
subroutine, public messages_print_with_emphasis(msg, iunit, namespace)
Definition: messages.F90:904
character(len=512), private msg
Definition: messages.F90:167
subroutine, public messages_warning(no_lines, all_nodes, namespace)
Definition: messages.F90:531
subroutine, public messages_obsolete_variable(namespace, name, rep)
Definition: messages.F90:1029
subroutine, public messages_new_line()
Definition: messages.F90:1118
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
Definition: messages.F90:162
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
Definition: messages.F90:416
subroutine, public messages_input_error(namespace, var, details, row, column)
Definition: messages.F90:697
subroutine, public messages_info(no_lines, iunit, debug_only, stress, all_nodes, namespace)
Definition: messages.F90:600
subroutine, public profiling_out(label)
Increment out counter and sum up difference between entry and exit time.
Definition: profiling.F90:625
subroutine, public profiling_in(label, exclude)
Increment in counter and save entry time.
Definition: profiling.F90:554
type(type_t), public type_float
Definition: types.F90:135
type(type_t), public type_cmplx
Definition: types.F90:136
integer pure function, public types_get_size(this)
Definition: types.F90:154
This module defines the unit system, used for input and output.
type(unit_t), public unit_gigabytes
For larger amounts of data (natural code units are bytes)
type(unit_t), public unit_megabytes
For large amounts of data (natural code units are bytes)
type(unit_t), public unit_kilobytes
For small amounts of data (natural code units are bytes)
int true(void)