Octopus
accel.F90
1!! Copyright (C) 2010-2016 X. Andrade
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17!!
18
19#include "global.h"
20
21#if defined(HAVE_OPENCL) && defined(HAVE_CUDA)
22#error "Cannot compile with OpenCL and Cuda support at the same time"
23#endif
24
25#if defined(HAVE_OPENCL) || defined(HAVE_CUDA)
26#define HAVE_ACCEL 1
27#endif
28
29module accel_oct_m
31#ifdef HAVE_OPENCL
32 use cl
33#endif
34#if defined(HAVE_CLBLAS) || defined(HAVE_CLBLAST)
35 use clblas_oct_m
36#endif
37 use cuda_oct_m
38#ifdef HAVE_CLFFT
39 use clfft
40#endif
41 use debug_oct_m
42 use global_oct_m
43 use iso_c_binding, only: c_size_t
44 use, intrinsic :: iso_fortran_env
45 use loct_oct_m
47 use mpi_oct_m
49 use types_oct_m
50 use parser_oct_m
53
54 implicit none
55
56 private
57
58 public :: &
63 accel_t, &
66 accel_init, &
67 accel_end, &
96
97#ifdef HAVE_OPENCL
98 integer, public, parameter :: &
99 ACCEL_MEM_READ_ONLY = cl_mem_read_only, &
100 ACCEL_MEM_READ_WRITE = cl_mem_read_write, &
101 ACCEL_MEM_WRITE_ONLY = cl_mem_write_only
102#else
103 integer, public, parameter :: &
104 ACCEL_MEM_READ_ONLY = 0, &
107#endif
108
109 type accel_context_t
110 ! Components are public by default
111#ifdef HAVE_OPENCL
112 type(cl_context) :: cl_context
113#elif defined(HAVE_CUDA)
114 type(c_ptr) :: cuda_context
115#else
116 integer :: dummy
117#endif
118 end type accel_context_t
119
120 type accel_device_t
121 ! Components are public by default
122#ifdef HAVE_OPENCL
123 type(cl_device_id) :: cl_device
124#elif defined(HAVE_CUDA)
125 type(c_ptr) :: cuda_device
126#else
127 integer :: dummy
128#endif
129 end type accel_device_t
130
131 type accel_t
132 ! Components are public by default
133 type(accel_context_t) :: context
134 type(accel_device_t) :: device
135#ifdef HAVE_OPENCL
136 type(cl_command_queue) :: command_queue
137#endif
138 type(c_ptr) :: cublas_handle
139 type(c_ptr) :: cuda_stream
140 type(c_ptr) :: module_map
141 integer :: max_workgroup_size
142 integer(int64) :: local_memory_size
143 integer(int64) :: global_memory_size
144 logical :: enabled
145 logical :: allow_CPU_only
146 logical :: shared_mem
147 logical :: cuda_mpi
148 integer :: warp_size
149 logical :: initialize_buffers
150 character(len=32) :: debug_flag
151 integer(int64) :: max_block_dim(3)
152 integer(int64) :: max_grid_dim(3)
153 end type accel_t
154
155 type accel_mem_t
156 ! Components are public by default
157#ifdef HAVE_OPENCL
158 type(cl_mem) :: mem
159#else
160 type(c_ptr) :: mem
161#endif
162 integer(c_size_t) :: size = 0
163 type(type_t) :: type
164 integer :: flags = 0
165 logical :: allocated = .false.
166 end type accel_mem_t
167
168 type accel_kernel_t
169 ! Components are public by default
170#ifdef HAVE_OPENCL
171 type(cl_kernel) :: kernel
172#endif
173#ifdef HAVE_CUDA
174 type(c_ptr) :: cuda_kernel
175 type(c_ptr) :: cuda_module
176 type(c_ptr) :: arguments
177#endif
178 integer(int64) :: cuda_shared_mem
179 logical :: initialized = .false.
180 type(accel_kernel_t), pointer :: next
181 integer :: arg_count
182 character(len=128) :: kernel_name
183 end type accel_kernel_t
184
185 type(accel_t), public :: accel
186
187 ! the kernels
188 type(accel_kernel_t), public, target, save :: kernel_vpsi
189 type(accel_kernel_t), public, target, save :: kernel_vpsi_complex
190 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
191 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors_complex
192 type(accel_kernel_t), public, target, save :: kernel_daxpy
193 type(accel_kernel_t), public, target, save :: kernel_zaxpy
194 type(accel_kernel_t), public, target, save :: kernel_copy
195 type(accel_kernel_t), public, target, save :: dpack
196 type(accel_kernel_t), public, target, save :: zpack
197 type(accel_kernel_t), public, target, save :: dunpack
198 type(accel_kernel_t), public, target, save :: zunpack
199 type(accel_kernel_t), public, target, save :: kernel_ghost_reorder
200 type(accel_kernel_t), public, target, save :: kernel_density_real
201 type(accel_kernel_t), public, target, save :: kernel_density_complex
202 type(accel_kernel_t), public, target, save :: kernel_density_spinors
203 type(accel_kernel_t), public, target, save :: kernel_phase
204 type(accel_kernel_t), public, target, save :: kernel_phase_spiral
205 type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
206 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
207 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors
208 type(accel_kernel_t), public, target, save :: dkernel_batch_axpy
209 type(accel_kernel_t), public, target, save :: zkernel_batch_axpy
210 type(accel_kernel_t), public, target, save :: dkernel_ax_function_py
211 type(accel_kernel_t), public, target, save :: zkernel_ax_function_py
212 type(accel_kernel_t), public, target, save :: dkernel_batch_dotp
213 type(accel_kernel_t), public, target, save :: zkernel_batch_dotp
214 type(accel_kernel_t), public, target, save :: dzmul
215 type(accel_kernel_t), public, target, save :: zzmul
216 type(accel_kernel_t), public, target, save :: set_one
217
218 ! kernels used locally
220
223 end interface accel_padded_size
231 end interface accel_kernel_run
251 end interface accel_write_buffer
252
262 end interface accel_read_buffer
265 module procedure &
272 end interface accel_set_kernel_arg
290 integer, parameter :: &
297 integer, parameter :: &
302 cl_plat_intel = -5
303
304 ! a "convenience" public variable
305 integer, public :: cl_status
307 integer :: buffer_alloc_count
308 integer(int64) :: allocated_mem
309 type(accel_kernel_t), pointer :: head
310 type(alloc_cache_t) :: memcache
311
312contains
313
314 pure logical function accel_is_enabled() result(enabled)
315#ifdef HAVE_ACCEL
316 enabled = accel%enabled
317#else
318 enabled = .false.
319#endif
320 end function accel_is_enabled
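 ! Illustrative guard (a sketch, not part of the public API description): device code
 ! paths are typically wrapped as
 !
 !   if (accel_is_enabled()) then
 !     ! ... device path using the accel buffers and kernels below ...
 !   else
 !     ! ... host fallback ...
 !   end if
 !
 ! so that builds without HAVE_ACCEL compile the same callers unchanged.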
321
322 ! ------------------------------------------
323
324 pure logical function accel_allow_cpu_only() result(allow)
325#ifdef HAVE_ACCEL
326 allow = accel%allow_CPU_only
327#else
328 allow = .true.
329#endif
330 end function accel_allow_cpu_only
331
332 ! ------------------------------------------
333
334 subroutine accel_init(base_grp, namespace)
335 type(mpi_grp_t), intent(inout) :: base_grp
336 type(namespace_t), intent(in) :: namespace
337
338 logical :: disable, default, run_benchmark
339 integer :: idevice, iplatform
340#ifdef HAVE_OPENCL
341 integer :: device_type
342 integer :: cl_status, idev
343 integer :: ndevices, ret_devices, nplatforms, iplat
344 character(len=256) :: device_name
345 type(cl_platform_id) :: platform_id
346 type(cl_program) :: prog
347 type(cl_platform_id), allocatable :: allplatforms(:)
348 type(cl_device_id), allocatable :: alldevices(:)
349#endif
350#ifdef HAVE_CUDA
351 integer :: dim
352#ifdef HAVE_MPI
353 character(len=256) :: sys_name
354#endif
355#endif
356
357 push_sub(accel_init)
358
359 buffer_alloc_count = 0
360
361 !%Variable DisableAccel
362 !%Type logical
363 !%Default yes
364 !%Section Execution::Accel
365 !%Description
366 !% If Octopus was compiled with OpenCL or CUDA support, it will
367 !% try to initialize and use an accelerator device. By setting this
368 !% variable to <tt>yes</tt>, you force Octopus not to use an accelerator even if it is available.
369 !%End
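 ! Example input-file usage (assuming the usual Octopus inp syntax):
 !
 !   DisableAccel = yes
 !
 ! forces a CPU-only run even if the binary was built with CUDA or OpenCL support.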
370 call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel')
371#ifdef HAVE_ACCEL
372 default = .false.
373#else
374 default = .true.
375#endif
376 call parse_variable(namespace, 'DisableAccel', default, disable)
377 accel%enabled = .not. disable
378
379#ifndef HAVE_ACCEL
380 if (accel%enabled) then
381 message(1) = 'Octopus was compiled without OpenCL or Cuda support.'
383 end if
384#endif
385
386 if (.not. accel_is_enabled()) then
387 pop_sub(accel_init)
388 return
389 end if
391 !%Variable AccelPlatform
392 !%Type integer
393 !%Default 0
394 !%Section Execution::Accel
395 !%Description
396 !% This variable selects the OpenCL platform that Octopus will
397 !% use. You can give an explicit platform number or use one of
398 !% the options that select a particular vendor
399 !% implementation. Platform 0 is used by default.
400 !%
401 !% This variable has no effect for CUDA.
402 !%Option amd -2
403 !% Use the AMD OpenCL platform.
404 !%Option nvidia -3
405 !% Use the Nvidia OpenCL platform.
406 !%Option ati -4
407 !% Use the ATI (old AMD) OpenCL platform.
408 !%Option intel -5
409 !% Use the Intel OpenCL platform.
410 !%End
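 ! Example input-file usage (assuming the usual Octopus inp syntax): either a platform
 ! index or one of the named options can be given, e.g.
 !
 !   AccelPlatform = nvidia
 !
 ! selects the Nvidia OpenCL platform independently of its position in the platform list.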
411 call parse_variable(namespace, 'AccelPlatform', 0, iplatform)
412
413 call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform')
414
415 !%Variable AccelDevice
416 !%Type integer
417 !%Default gpu
418 !%Section Execution::Accel
419 !%Description
420 !% This variable selects the OpenCL or CUDA accelerator device
421 !% that Octopus will use. You can specify one of the options below
422 !% or a numerical id to select a specific device.
423 !%
424 !% Values >= 0 select the device to be used. In MPI-enabled runs,
425 !% devices are distributed among ranks in a round-robin fashion, starting at this value.
426 !%Option gpu -1
427 !% If available, Octopus will use a GPU.
428 !%Option cpu -2
429 !% If available, Octopus will use a CPU (only for OpenCL).
430 !%Option accelerator -3
431 !% If available, Octopus will use an accelerator (only for OpenCL).
432 !%Option accel_default -4
433 !% Octopus will use the default device specified by the
434 !% implementation.
435 !%End
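 ! Illustrative worked example of the distribution described above: with 4 MPI ranks,
 ! 2 visible devices and AccelDevice = 0, ranks 0,1,2,3 end up on devices 0,1,0,1.
 ! The OpenCL path below uses idevice = mod(rank, ndevices); the CUDA path passes the
 ! rank to cuda_init, which is expected to apply a similar round-robin mapping.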
436 call parse_variable(namespace, 'AccelDevice', opencl_gpu, idevice)
437
438 call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice')
439
440 if (idevice < opencl_default) then
441 call messages_write('Invalid AccelDevice')
442 call messages_fatal()
443 end if
444
445 call messages_print_with_emphasis(msg="GPU acceleration", namespace=namespace)
446
447#ifdef HAVE_CUDA
448 if (idevice<0) idevice = 0
449 call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
450 idevice, base_grp%rank)
451#ifdef HAVE_MPI
452 call loct_sysname(sys_name)
453 write(message(1), '(A,I5,A,I5,2A)') "Rank ", base_grp%rank, " uses device number ", idevice, &
454 " on ", trim(sys_name)
455 call messages_info(1, all_nodes = .true.)
456#endif
457
458 ! shared memory is enabled for the CUDA interface
459 accel%shared_mem = .true.
460
461 call cublas_init(accel%cublas_handle, accel%cuda_stream)
462#endif
463
464#ifdef HAVE_OPENCL
465 call profiling_in('CL_INIT')
466
467 call clgetplatformids(nplatforms, cl_status)
468 if (cl_status /= cl_success) call opencl_print_error(cl_status, "GetPlatformIDs")
469
470 safe_allocate(allplatforms(1:nplatforms))
471
472 call clgetplatformids(allplatforms, iplat, cl_status)
473 if (cl_status /= cl_success) call opencl_print_error(cl_status, "GetPlatformIDs")
474
475 call messages_write('Info: Available CL platforms: ')
476 call messages_write(nplatforms)
477 call messages_info()
478
479 do iplat = 1, nplatforms
480
481 call clgetplatforminfo(allplatforms(iplat), cl_platform_name, device_name, cl_status)
482
483 if (iplatform < 0) then
484 if (iplatform == get_platform_id(device_name)) iplatform = iplat - 1
485 end if
486
487 if (iplatform == iplat - 1) then
488 call messages_write(' * Platform ')
489 else
490 call messages_write(' Platform ')
491 end if
492
493 call messages_write(iplat - 1)
494 call messages_write(' : '//device_name)
495 call clgetplatforminfo(allplatforms(iplat), cl_platform_version, device_name, cl_status)
496 call messages_write(' ('//trim(device_name)//')')
497 call messages_info()
498 end do
499
500 call messages_info()
501
502 if (iplatform >= nplatforms .or. iplatform < 0) then
503 call messages_write('Requested CL platform does not exist')
504 if (iplatform > 0) then
505 call messages_write('(platform = ')
506 call messages_write(iplatform)
507 call messages_write(').')
508 end if
509 call messages_fatal()
510 end if
511
512 platform_id = allplatforms(iplatform + 1)
513
514 safe_deallocate_a(allplatforms)
515
516 call clgetdeviceids(platform_id, cl_device_type_all, ndevices, cl_status)
517
518 call messages_write('Info: Available CL devices: ')
519 call messages_write(ndevices)
520 call messages_info()
521
522 safe_allocate(alldevices(1:ndevices))
523
524 ! list all devices
525
526 call clgetdeviceids(platform_id, cl_device_type_all, alldevices, ret_devices, cl_status)
527
528 do idev = 1, ndevices
529 call messages_write(' Device ')
530 call messages_write(idev - 1)
531 call clgetdeviceinfo(alldevices(idev), cl_device_name, device_name, cl_status)
532 call messages_write(' : '//device_name)
533 call messages_info()
534 end do
535
536 select case (idevice)
537 case (opencl_gpu)
538 device_type = cl_device_type_gpu
539 case (opencl_cpu)
540 device_type = cl_device_type_cpu
541 case (opencl_accelerator)
542 device_type = cl_device_type_accelerator
543 case (opencl_default)
544 device_type = cl_device_type_default
545 case default
546 device_type = cl_device_type_all
547 end select
548
549 ! now get a list of the selected type
550 call clgetdeviceids(platform_id, device_type, alldevices, ret_devices, cl_status)
551
552 if (ret_devices < 1) then
553 ! we didn't find a device of the selected type, so we ask for the default device
554 call clgetdeviceids(platform_id, cl_device_type_default, alldevices, ret_devices, cl_status)
555
556 if (ret_devices < 1) then
557 ! if this does not work, we ask for all devices
558 call clgetdeviceids(platform_id, cl_device_type_all, alldevices, ret_devices, cl_status)
559 end if
560
561 if (ret_devices < 1) then
562 call messages_write('Cannot find an OpenCL device')
563 call messages_fatal()
564 end if
565 end if
566
567 ! the number of devices of the selected type can be smaller than the total
568 ndevices = ret_devices
569
570 if (idevice < 0) then
571 if (base_grp%size > 1) then
572 ! with MPI we have to select the device so that multiple GPUs in one
573 ! node are correctly distributed among the ranks
574 call select_device(idevice)
575 else
576 idevice = 0
577 end if
578 end if
579
580 if (idevice >= ndevices) then
581 call messages_write('Requested CL device does not exist (device = ')
582 call messages_write(idevice)
583 call messages_write(', platform = ')
584 call messages_write(iplatform)
585 call messages_write(').')
586 call messages_fatal()
587 end if
588
589 accel%device%cl_device = alldevices(idevice + 1)
590
591 ! create the context
592 accel%context%cl_context = clcreatecontext(platform_id, accel%device%cl_device, cl_status)
593 if (cl_status /= cl_success) call opencl_print_error(cl_status, "CreateContext")
594
595 safe_deallocate_a(alldevices)
596
597 accel%command_queue = clcreatecommandqueue(accel%context%cl_context, accel%device%cl_device, &
598 cl_queue_profiling_enable, cl_status)
599 if (cl_status /= cl_success) call opencl_print_error(cl_status, "CreateCommandQueue")
600
601 call clgetdeviceinfo(accel%device%cl_device, cl_device_type, device_type, cl_status)
602
603 select case (device_type)
604 case (cl_device_type_gpu)
605 accel%shared_mem = .true.
606 case (cl_device_type_cpu, cl_device_type_accelerator)
607 accel%shared_mem = .false.
608 case default
609 accel%shared_mem = .false.
610 end select
611
612#ifdef HAVE_CLBLAS
613 call clblassetup(cl_status)
614 if (cl_status /= clblassuccess) call clblas_print_error(cl_status, 'clblasSetup')
615#endif
616
617#ifdef HAVE_CLFFT
618 call clfftsetup(cl_status)
619 if (cl_status /= clfft_success) call clfft_print_error(cl_status, 'clfftSetup')
620#endif
621
622 call profiling_out('CL_INIT')
623#endif
624
625 ! Get some device information that we will need later
626
627 ! total memory
628#ifdef HAVE_OPENCL
629 call clgetdeviceinfo(accel%device%cl_device, cl_device_global_mem_size, accel%global_memory_size, cl_status)
630 call clgetdeviceinfo(accel%device%cl_device, cl_device_local_mem_size, accel%local_memory_size, cl_status)
631 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_work_group_size, accel%max_workgroup_size, cl_status)
632 accel%warp_size = 1
633#endif
634#ifdef HAVE_CUDA
635 call cuda_device_total_memory(accel%device%cuda_device, accel%global_memory_size)
636 call cuda_device_shared_memory(accel%device%cuda_device, accel%local_memory_size)
637 call cuda_device_max_threads_per_block(accel%device%cuda_device, accel%max_workgroup_size)
638 call cuda_device_get_warpsize(accel%device%cuda_device, accel%warp_size)
639 call cuda_device_max_block_dim_x(accel%device%cuda_device, dim)
640 accel%max_block_dim(1) = int(dim, int64)
641 call cuda_device_max_block_dim_y(accel%device%cuda_device, dim)
642 accel%max_block_dim(2) = int(dim, int64)
643 call cuda_device_max_block_dim_z(accel%device%cuda_device, dim)
644 accel%max_block_dim(3) = int(dim, int64)
645 call cuda_device_max_grid_dim_x(accel%device%cuda_device, dim)
646 accel%max_grid_dim(1) = int(dim, int64)
647 call cuda_device_max_grid_dim_y(accel%device%cuda_device, dim)
648 accel%max_grid_dim(2) = int(dim, int64)
649 call cuda_device_max_grid_dim_z(accel%device%cuda_device, dim)
650 accel%max_grid_dim(3) = int(dim, int64)
651#endif
652
653 if (mpi_grp_is_root(base_grp)) call device_info()
654
655 ! initialize the cache used to speed up allocations
656 call alloc_cache_init(memcache, nint(0.25_real64*accel%global_memory_size, int64))
657
658 ! now initialize the kernels
660
661#if defined(HAVE_HIP)
662 accel%debug_flag = "-g"
663#elif defined(HAVE_CUDA)
664 accel%debug_flag = "-lineinfo"
665#elif defined(HAVE_OPENCL)
666 accel%debug_flag = "-g"
667#endif
668
669 call accel_kernel_start_call(set_zero, 'set_zero.cl', "set_zero")
670 call accel_kernel_start_call(set_one, 'set_one.cl', "set_one")
671 call accel_kernel_start_call(kernel_vpsi, 'vpsi.cl', "vpsi")
672 call accel_kernel_start_call(kernel_vpsi_complex, 'vpsi.cl', "vpsi_complex")
673 call accel_kernel_start_call(kernel_vpsi_spinors, 'vpsi.cl', "vpsi_spinors")
674 call accel_kernel_start_call(kernel_vpsi_spinors_complex, 'vpsi.cl', "vpsi_spinors_complex")
675 call accel_kernel_start_call(kernel_daxpy, 'axpy.cl', "daxpy", flags = '-DRTYPE_DOUBLE')
676 call accel_kernel_start_call(kernel_zaxpy, 'axpy.cl', "zaxpy", flags = '-DRTYPE_COMPLEX')
677 call accel_kernel_start_call(dkernel_batch_axpy, 'axpy.cl', "dbatch_axpy_function", &
678 flags = ' -DRTYPE_DOUBLE')
679 call accel_kernel_start_call(zkernel_batch_axpy, 'axpy.cl', "zbatch_axpy_function", &
680 flags = '-DRTYPE_COMPLEX')
681 call accel_kernel_start_call(dkernel_ax_function_py, 'axpy.cl', "dbatch_ax_function_py", &
682 flags = '-DRTYPE_DOUBLE')
683 call accel_kernel_start_call(zkernel_ax_function_py, 'axpy.cl', "zbatch_ax_function_py", &
684 flags = '-DRTYPE_COMPLEX')
685 call accel_kernel_start_call(dkernel_batch_dotp, 'mesh_batch_single.cl', "dbatch_mf_dotp")
686 call accel_kernel_start_call(zkernel_batch_dotp, 'mesh_batch_single.cl', "zbatch_mf_dotp")
687 call accel_kernel_start_call(dpack, 'pack.cl', "dpack")
688 call accel_kernel_start_call(zpack, 'pack.cl', "zpack")
689 call accel_kernel_start_call(dunpack, 'pack.cl', "dunpack")
690 call accel_kernel_start_call(zunpack, 'pack.cl', "zunpack")
691 call accel_kernel_start_call(kernel_copy, 'copy.cl', "copy")
692 call accel_kernel_start_call(kernel_ghost_reorder, 'ghost.cl', "ghost_reorder")
693 call accel_kernel_start_call(kernel_density_real, 'density.cl', "density_real")
694 call accel_kernel_start_call(kernel_density_complex, 'density.cl', "density_complex")
695 call accel_kernel_start_call(kernel_density_spinors, 'density.cl', "density_spinors")
696 call accel_kernel_start_call(kernel_phase, 'phase.cl', "phase")
697 call accel_kernel_start_call(dkernel_dot_matrix, 'mesh_batch.cl', "ddot_matrix")
698 call accel_kernel_start_call(zkernel_dot_matrix, 'mesh_batch.cl', "zdot_matrix")
699 call accel_kernel_start_call(zkernel_dot_matrix_spinors, 'mesh_batch.cl', "zdot_matrix_spinors")
700
701
702 call accel_kernel_start_call(dzmul, 'mul.cl', "dzmul", flags = '-DRTYPE_DOUBLE')
703 call accel_kernel_start_call(zzmul, 'mul.cl', "zzmul", flags = '-DRTYPE_COMPLEX')
704
705 !%Variable AccelBenchmark
706 !%Type logical
707 !%Default no
708 !%Section Execution::Accel
709 !%Description
710 !% If this variable is set to yes, Octopus will run some
711 !% routines to benchmark the performance of the accelerator device.
712 !%End
713 call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark)
714
715 call messages_obsolete_variable(namespace, 'OpenCLBenchmark', 'AccelBenchmark')
716
717 if (run_benchmark) then
719 end if
720
721 !%Variable GPUAwareMPI
722 !%Type logical
723 !%Section Execution::Accel
724 !%Description
725 !% If Octopus was compiled with GPU support and MPI support and if the MPI
726 !% implementation is GPU-aware (i.e., it supports communication using device pointers),
727 !% this switch can be set to true to use the GPU-aware MPI features. The advantage
728 !% of this approach is that it can do, e.g., peer-to-peer copies between devices without
729 !% going through the host memory.
730 !% The default is false, except when the configure switch --enable-cudampi is set, in which
731 !% case this variable is set to true.
732 !%End
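 ! Example input-file usage (assuming the usual Octopus inp syntax):
 !
 !   GPUAwareMPI = yes
 !
 ! This only helps if the linked MPI library really supports communication with device
 ! pointers; otherwise the warning emitted below applies.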
733#ifdef HAVE_CUDA_MPI
734 default = .true.
735#else
736 default = .false.
737#endif
738 call parse_variable(namespace, 'GPUAwareMPI', default, accel%cuda_mpi)
739 if (accel%cuda_mpi) then
740#ifndef HAVE_CUDA_MPI
741 call messages_write("Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")
742 call messages_warning()
743#endif
744 call messages_write("Using GPU-aware MPI.")
745 call messages_info()
746 end if
747
748
749 !%Variable AllowCPUonly
750 !%Type logical
751 !%Section Execution::Accel
752 !%Description
753 !% To prevent wasting resources, the code will normally stop when GPU acceleration is disabled due to
754 !% incomplete implementations or incompatibilities. Setting AllowCPUonly = yes overrides this and allows
755 !% execution to continue on the CPU in these cases.
756 !%End
757#if defined (HAVE_ACCEL)
758 default = .false.
759#else
760 default = .true.
761#endif
762 call parse_variable(namespace, 'AllowCPUonly', default, accel%allow_CPU_only)
763
764
765 !%Variable InitializeGPUBuffers
766 !%Type logical
767 !%Section Execution::Accel
768 !%Description
769 !% Initialize new GPU buffers to zero on creation (use only for debugging, as it has a performance impact!).
770 !%End
771 call parse_variable(namespace, 'InitializeGPUBuffers', .false., accel%initialize_buffers)
772
773
774 call messages_print_with_emphasis(namespace=namespace)
775
776 pop_sub(accel_init)
777
778 contains
779
780#if defined(HAVE_OPENCL)
781 subroutine select_device(idevice)
782 integer, intent(inout) :: idevice
783 integer :: irank
784 character(len=256) :: device_name
785
786 push_sub(accel_init.select_device)
787
788 idevice = mod(base_grp%rank, ndevices)
789
790 call base_grp%barrier()
791 call messages_write('Info: CL device distribution:')
792 call messages_info()
793 do irank = 0, base_grp%size - 1
794 if (irank == base_grp%rank) then
795 call clgetdeviceinfo(alldevices(idevice + 1), cl_device_name, device_name, cl_status)
796 call messages_write(' MPI node ')
797 call messages_write(base_grp%rank)
798 call messages_write(' -> CL device ')
799 call messages_write(idevice)
800 call messages_write(' : '//device_name)
801 call messages_info(all_nodes = .true.)
802 end if
803 call base_grp%barrier()
804 end do
805
807 end subroutine select_device
808#endif
809
810 subroutine device_info()
811#ifdef HAVE_OPENCL
812 integer(int64) :: val
813#endif
814#ifdef HAVE_CUDA
815 integer :: version
816#endif
817 integer :: major, minor
818 character(len=256) :: val_str
819
820 push_sub(accel_init.device_info)
821
822 call messages_new_line()
823 call messages_write('Selected device:')
824 call messages_new_line()
825
826#ifdef HAVE_OPENCL
827 call messages_write(' Framework : OpenCL')
828#endif
829#ifdef HAVE_CUDA
830#ifdef __HIP_PLATFORM_AMD__
831 call messages_write(' Framework : ROCm')
832#else
833 call messages_write(' Framework : CUDA')
834#endif
835#endif
836 call messages_info()
837
838#ifdef HAVE_CUDA
839 call messages_write(' Device type : GPU', new_line = .true.)
840#ifdef __HIP_PLATFORM_AMD__
841 call messages_write(' Device vendor : AMD Corporation', new_line = .true.)
842#else
843 call messages_write(' Device vendor : NVIDIA Corporation', new_line = .true.)
844#endif
845#endif
846
847#ifdef HAVE_OPENCL
848 call clgetdeviceinfo(accel%device%cl_device, cl_device_type, val, cl_status)
849 call messages_write(' Device type :')
850 select case (int(val, int32))
851 case (cl_device_type_gpu)
852 call messages_write(' GPU')
853 case (cl_device_type_cpu)
854 call messages_write(' CPU')
855 case (cl_device_type_accelerator)
856 call messages_write(' accelerator')
857 end select
858 call messages_new_line()
860 call clgetdeviceinfo(accel%device%cl_device, cl_device_vendor, val_str, cl_status)
861 call messages_write(' Device vendor : '//trim(val_str))
862 call messages_new_line()
863#endif
864
865#ifdef HAVE_OPENCL
866 call clgetdeviceinfo(accel%device%cl_device, cl_device_name, val_str, cl_status)
867#endif
868#ifdef HAVE_CUDA
869 call cuda_device_name(accel%device%cuda_device, val_str)
870#endif
871 call messages_write(' Device name : '//trim(val_str))
872 call messages_new_line()
873
874#ifdef HAVE_CUDA
875 call cuda_device_capability(accel%device%cuda_device, major, minor)
876#endif
877 call messages_write(' Cuda capabilities :')
878 call messages_write(major, fmt = '(i2)')
879 call messages_write('.')
880 call messages_write(minor, fmt = '(i1)')
881 call messages_new_line()
882
883 ! VERSION
884#ifdef HAVE_OPENCL
885 call clgetdeviceinfo(accel%device%cl_device, cl_driver_version, val_str, cl_status)
886 call messages_write(' Driver version : '//trim(val_str))
887#endif
888#ifdef HAVE_CUDA
889 call cuda_driver_version(version)
890 call messages_write(' Driver version : ')
891 call messages_write(version)
892#endif
893 call messages_new_line()
894
895
896#ifdef HAVE_OPENCL
897 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_compute_units, val, cl_status)
898 call messages_write(' Compute units :')
899 call messages_write(val)
900 call messages_new_line()
901
902 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_clock_frequency, val, cl_status)
903 call messages_write(' Clock frequency :')
904 call messages_write(val)
905 call messages_write(' MHz')
906 call messages_new_line()
907#endif
908
909 call messages_write(' Device memory :')
910 call messages_write(accel%global_memory_size, units=unit_megabytes)
911 call messages_new_line()
912
913 call messages_write(' Local/shared memory :')
914 call messages_write(accel%local_memory_size, units=unit_kilobytes)
915 call messages_new_line()
916
917
918#ifdef HAVE_OPENCL
919 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_mem_alloc_size, val, cl_status)
920 call messages_write(' Max alloc size :')
921 call messages_write(val, units = unit_megabytes)
922 call messages_new_line()
923
924 call clgetdeviceinfo(accel%device%cl_device, cl_device_global_mem_cache_size, val, cl_status)
925 call messages_write(' Device cache :')
926 call messages_write(val, units = unit_kilobytes)
927 call messages_new_line()
928
929 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_constant_buffer_size, val, cl_status)
930 call messages_write(' Constant memory :')
931 call messages_write(val, units = unit_kilobytes)
932 call messages_new_line()
933#endif
934
935 call messages_write(' Max. group/block size :')
936 call messages_write(accel%max_workgroup_size)
937 call messages_new_line()
938
939
940#ifdef HAVE_OPENCL
941 call messages_write(' Extension cl_khr_fp64 :')
942 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64"))
943 call messages_new_line()
944
945 call messages_write(' Extension cl_amd_fp64 :')
946 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64"))
947 call messages_new_line()
948
949 call messages_write(' Extension cl_khr_int64_base_atomics :')
950 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_int64_base_atomics"))
951 call messages_new_line()
952
953#endif
954
955 call messages_info()
956
957
958 pop_sub(accel_init.device_info)
959 end subroutine device_info
960
961 end subroutine accel_init
962
963 ! ------------------------------------------
964#ifdef HAVE_OPENCL
965 integer function get_platform_id(platform_name) result(platform_id)
966 character(len=*), intent(in) :: platform_name
967
968 platform_id = cl_plat_invalid
969 if (index(platform_name, 'AMD') > 0) platform_id = cl_plat_amd
970 if (index(platform_name, 'ATI') > 0) platform_id = cl_plat_ati
971 if (index(platform_name, 'NVIDIA') > 0) platform_id = cl_plat_nvidia
972 if (index(platform_name, 'Intel') > 0) platform_id = cl_plat_intel
973 end function get_platform_id
974#endif
975 ! ------------------------------------------
976
977 subroutine accel_end(namespace)
978 type(namespace_t), intent(in) :: namespace
979
980#ifdef HAVE_OPENCL
981 integer :: ierr
982#endif
983 integer(int64) :: hits, misses
984 real(real64) :: volume_hits, volume_misses
985 logical :: found
986 type(accel_mem_t) :: tmp
987
988 push_sub(accel_end)
989
990 if (accel_is_enabled()) then
991
992 do
993 call alloc_cache_get(memcache, alloc_cache_any_size, found, tmp%mem)
994 if (.not. found) exit
995
996#ifdef HAVE_OPENCL
997 call clreleasememobject(tmp%mem, ierr)
998 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseMemObject")
999#endif
1000#ifdef HAVE_CUDA
1001 call cuda_mem_free(tmp%mem)
1002#endif
1003 end do
1004
1005 call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)
1006
1007 call messages_print_with_emphasis(msg="Acceleration-device allocation cache", namespace=namespace)
1008
1009 call messages_new_line()
1010 call messages_write(' Number of allocations =')
1011 call messages_write(hits + misses, new_line = .true.)
1012 call messages_write(' Volume of allocations =')
1013 call messages_write(volume_hits + volume_misses, fmt = 'f18.1', units = unit_gigabytes, align_left = .true., &
1014 new_line = .true.)
1015 call messages_write(' Hit ratio =')
1016 if (hits + misses > 0) then
1017 call messages_write(hits/real(hits + misses, real64)*100, fmt='(f6.1)', align_left = .true.)
1018 else
1019 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
1020 end if
1021 call messages_write('%', new_line = .true.)
1022 call messages_write(' Volume hit ratio =')
1023 if (volume_hits + volume_misses > 0) then
1024 call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt='(f6.1)', align_left = .true.)
1025 else
1026 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
1027 end if
1028 call messages_write('%')
1029 call messages_new_line()
1030 call messages_info()
1031
1032 call messages_print_with_emphasis(namespace=namespace)
1033 end if
1034
1036
1037#ifdef HAVE_CLBLAS
1038 call clblasteardown()
1039#endif
1040
1041#ifdef HAVE_CLFFT
1042 call clfftteardown()
1043#endif
1044
1045 if (accel_is_enabled()) then
1046#ifdef HAVE_CUDA
1047 call cublas_end(accel%cublas_handle)
1048 if (.not. accel%cuda_mpi) then ! CUDA aware MPI finalize will do the cleanup
1049 call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
1050 end if
1051#endif
1052
1053#ifdef HAVE_OPENCL
1054 call clreleasecommandqueue(accel%command_queue, ierr)
1056 if (ierr /= cl_success) call opencl_print_error(ierr, "ReleaseCommandQueue")
1057 call clreleasecontext(accel%context%cl_context, cl_status)
1058#endif
1059
1060 if (buffer_alloc_count /= 0) then
1061 call messages_write('Accel:')
1062 call messages_write(real(allocated_mem, real64) , fmt = 'f12.1', units = unit_megabytes, align_left = .true.)
1063 call messages_write(' in ')
1064 call messages_write(buffer_alloc_count)
1065 call messages_write(' buffers were not deallocated.')
1066 call messages_fatal()
1067 end if
1068
1069 end if
1070
1071 pop_sub(accel_end)
1072 end subroutine accel_end
1073
1074 ! ------------------------------------------
1075
1076 integer(int64) function accel_padded_size_i8(nn) result(psize)
1077 integer(int64), intent(in) :: nn
1078
1079 integer(int64) :: modnn, bsize
1080
1081 psize = nn
1082
1083 if (accel_is_enabled()) then
1084
1085 bsize = accel_max_workgroup_size()
1086
1087 psize = nn
1088 modnn = mod(nn, bsize)
1089 if (modnn /= 0) psize = psize + bsize - modnn
1090
1091 end if
1092
1093 end function accel_padded_size_i8
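 ! Worked example of the padding above: with a maximum workgroup size of 256 and
 ! nn = 1000, modnn = mod(1000, 256) = 232 and psize = 1000 + 256 - 232 = 1024, i.e.
 ! the smallest multiple of the workgroup size that is >= nn. If nn is already a
 ! multiple of the workgroup size, psize = nn.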
1094
1095 ! ------------------------------------------
1096
1097 integer(int32) function accel_padded_size_i4(nn) result(psize)
1098 integer(int32), intent(in) :: nn
1099
1100 psize = int(accel_padded_size_i8(int(nn, int64)), int32)
1101
1102 end function accel_padded_size_i4
1103
1104 ! ------------------------------------------
1105
1106 subroutine accel_create_buffer_4(this, flags, type, size, set_zero)
1107 type(accel_mem_t), intent(inout) :: this
1108 integer, intent(in) :: flags
1109 type(type_t), intent(in) :: type
1110 integer, intent(in) :: size
1111 logical, optional, intent(in) :: set_zero
1112
1113 call accel_create_buffer_8(this, flags, type, int(size, int64), set_zero)
1114 end subroutine accel_create_buffer_4
1115
1116 ! ------------------------------------------
1117
1118 subroutine accel_create_buffer_8(this, flags, type, size, set_zero)
1119 type(accel_mem_t), intent(inout) :: this
1120 integer, intent(in) :: flags
1121 type(type_t), intent(in) :: type
1122 integer(int64), intent(in) :: size
1123 logical, optional, intent(in) :: set_zero
1124
1125 integer(int64) :: fsize
1126 logical :: found
1127#ifdef HAVE_OPENCL
1128 integer :: ierr
1129#endif
1130
1131 push_sub(accel_create_buffer_8)
1132
1133 this%type = type
1134 this%size = size
1135 this%flags = flags
1136 fsize = int(size, int64)*types_get_size(type)
1137 this%allocated = .true.
1138
1139 if (fsize > 0) then
1140
1141 call alloc_cache_get(memcache, fsize, found, this%mem)
1142
1143 if (.not. found) then
1144#ifdef HAVE_OPENCL
1145 this%mem = clcreatebuffer(accel%context%cl_context, flags, fsize, ierr)
1146 if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateBuffer")
1147#endif
1148#ifdef HAVE_CUDA
1149 call cuda_mem_alloc(this%mem, fsize)
1150#endif
1151 end if
1152
1153 buffer_alloc_count = buffer_alloc_count + 1
1154 allocated_mem = allocated_mem + fsize
1155
1156 end if
1157
1158 if (optional_default(set_zero, accel%initialize_buffers)) then
1159 call accel_set_buffer_to_zero_i8(this, type, size)
1160 end if
1161
1162 pop_sub(accel_create_buffer_8)
1163 end subroutine accel_create_buffer_8
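 ! Minimal buffer life-cycle sketch (illustrative only; it assumes the generic
 ! interfaces accel_write_buffer/accel_read_buffer declared above and TYPE_FLOAT
 ! from types_oct_m):
 !
 !   type(accel_mem_t) :: buf
 !   real(real64)      :: data(1000)
 !
 !   call accel_create_buffer_4(buf, ACCEL_MEM_READ_WRITE, TYPE_FLOAT, size(data))
 !   call accel_write_buffer(buf, size(data), data)   ! host -> device copy
 !   ! ... set kernel arguments and run kernels on buf ...
 !   call accel_read_buffer(buf, size(data), data)    ! device -> host copy
 !   call accel_finish()
 !   call accel_release_buffer(buf)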
1164
1165 ! ------------------------------------------
1166
1167 subroutine accel_release_buffer(this)
1168 type(accel_mem_t), intent(inout) :: this
1169
1170#ifdef HAVE_OPENCL
1171 integer :: ierr
1172#endif
1173 logical :: put
1174 integer(int64) :: fsize
1176 push_sub(accel_release_buffer)
1177
1178 if (this%size > 0) then
1179
1180 fsize = int(this%size, int64)*types_get_size(this%type)
1181
1182 call alloc_cache_put(memcache, fsize, this%mem, put)
1183
1184 if (.not. put) then
1185#ifdef HAVE_OPENCL
1186 call clreleasememobject(this%mem, ierr)
1187 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseMemObject")
1188#endif
1189#ifdef HAVE_CUDA
1190 call cuda_mem_free(this%mem)
1191#endif
1192 end if
1193
1194 buffer_alloc_count = buffer_alloc_count - 1
1195 allocated_mem = allocated_mem - fsize
1197 end if
1198
1199 this%size = 0
1200 this%flags = 0
1201
1202 this%allocated = .false.
1203
1204 pop_sub(accel_release_buffer)
1205 end subroutine accel_release_buffer
1206
1207 ! ------------------------------------------
1208
1209 logical pure function accel_buffer_is_allocated(this) result(allocated)
1210 type(accel_mem_t), intent(in) :: this
1211
1212 allocated = this%allocated
1213 end function accel_buffer_is_allocated
1214
1215 ! -----------------------------------------
1216
1217 subroutine accel_finish()
1218#ifdef HAVE_OPENCL
1219 integer :: ierr
1220#endif
1221
1222 ! no push_sub, called too frequently
1223
1224 if (accel_is_enabled()) then
1225#ifdef HAVE_OPENCL
1226 call clfinish(accel%command_queue, ierr)
1227 if (ierr /= cl_success) call opencl_print_error(ierr, 'clFinish')
1228#endif
1229#ifdef HAVE_CUDA
1231#endif
1232 end if
1233 end subroutine accel_finish
1234
1235 ! ------------------------------------------
1236
1237 subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
1238 type(accel_kernel_t), intent(inout) :: kernel
1239 integer, intent(in) :: narg
1240 type(accel_mem_t), intent(in) :: buffer
1241
1242#ifdef HAVE_OPENCL
1243 integer :: ierr
1244#endif
1246 assert(accel_buffer_is_allocated(buffer))
1247
1248 ! no push_sub, called too frequently
1249#ifdef HAVE_OPENCL
1250 call clsetkernelarg(kernel%kernel, narg, buffer%mem, ierr)
1251 if (ierr /= cl_success) call opencl_print_error(ierr, "clSetKernelArg_buf")
1252#endif
1253
1254#ifdef HAVE_CUDA
1255 call cuda_kernel_set_arg_buffer(kernel%arguments, buffer%mem, narg)
1256#endif
1257
1258 end subroutine accel_set_kernel_arg_buffer
1259
1260 ! ------------------------------------------
1261
1262 subroutine accel_set_kernel_arg_local(kernel, narg, type, size)
1263 type(accel_kernel_t), intent(inout) :: kernel
1264 integer, intent(in) :: narg
1265 type(type_t), intent(in) :: type
1266 integer, intent(in) :: size
1267
1268#ifdef HAVE_OPENCL
1269 integer :: ierr
1270#endif
1271 integer(int64) :: size_in_bytes
1272
1274
1275
1276 size_in_bytes = int(size, int64)*types_get_size(type)
1277
1278 if (size_in_bytes > accel%local_memory_size) then
1279 write(message(1), '(a,f12.6,a)') "CL Error: requested local memory: ", real(size_in_bytes, real64) /1024.0, " KiB"
1280 write(message(2), '(a,f12.6,a)') " available local memory: ", real(accel%local_memory_size, real64) /1024.0, " KiB"
1281 call messages_fatal(2)
1282 else if (size_in_bytes <= 0) then
1283 write(message(1), '(a,i10)') "CL Error: invalid local memory size: ", size_in_bytes
1284 call messages_fatal(1)
1285 end if
1286
1287#ifdef HAVE_CUDA
1288 kernel%cuda_shared_mem = size_in_bytes
1289#endif
1290
1291#ifdef HAVE_OPENCL
1292 call clsetkernelarglocal(kernel%kernel, narg, size_in_bytes, ierr)
1293 if (ierr /= cl_success) call opencl_print_error(ierr, "set_kernel_arg_local")
1294#endif
1297 end subroutine accel_set_kernel_arg_local
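 ! Illustrative use of per-workgroup scratch memory (a sketch; the argument index, the
 ! TYPE_FLOAT type and wgsize are assumed from the caller): a reduction kernel would
 ! typically request one element per thread,
 !
 !   call accel_set_kernel_arg_local(kernel, 2, TYPE_FLOAT, wgsize)
 !
 ! which becomes CUDA dynamic shared memory (kernel%cuda_shared_mem above) or an OpenCL
 ! local argument of that size.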
1298
1299 ! ------------------------------------------
1300
1301 subroutine accel_kernel_run_8(kernel, globalsizes, localsizes)
1302 type(accel_kernel_t), intent(inout) :: kernel
1303 integer(int64), intent(in) :: globalsizes(:)
1304 integer(int64), intent(in) :: localsizes(:)
1305
1306 integer :: dim
1307#ifdef HAVE_OPENCL
1308 integer :: ierr
1309#endif
1310 integer(int64) :: gsizes(1:3)
1311 integer(int64) :: lsizes(1:3)
1312
1313 ! no push_sub, called too frequently
1314
1315 ! cuda needs all dimensions
1316 gsizes = 1
1317 lsizes = 1
1318
1319 dim = ubound(globalsizes, dim=1)
1320
1321 assert(dim == ubound(localsizes, dim=1))
1322
1323 ! if one size is zero, there is nothing to do
1324 if (any(globalsizes == 0)) return
1325
1326 assert(all(localsizes > 0))
1327 assert(all(localsizes <= accel_max_workgroup_size()))
1328 assert(all(mod(globalsizes, localsizes) == 0))
1329
1330 gsizes(1:dim) = globalsizes(1:dim)
1331 lsizes(1:dim) = localsizes(1:dim)
1332
1333#ifdef HAVE_OPENCL
1334 call clenqueuendrangekernel(accel%command_queue, kernel%kernel, gsizes(1:dim), lsizes(1:dim), ierr)
1335 if (ierr /= cl_success) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
1336#endif
1337
1338#ifdef HAVE_CUDA
1339 ! Maximum dimension of a block
1340 if (any(lsizes(1:3) > accel%max_block_dim(1:3))) then
1341 message(1) = "Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
1342 message(2) = "The following conditions should be fulfilled:"
1343 write(message(3), "(A, I8, A, I8)") "Dim 1: ", lsizes(1), " <= ", accel%max_block_dim(1)
1344 write(message(4), "(A, I8, A, I8)") "Dim 2: ", lsizes(2), " <= ", accel%max_block_dim(2)
1345 write(message(5), "(A, I8, A, I8)") "Dim 3: ", lsizes(3), " <= ", accel%max_block_dim(3)
1346 message(6) = "This is an internal error, please contact the developers."
1347 call messages_fatal(6)
1348 end if
1349
1350
1351 ! Maximum number of threads per block
1352 if (product(lsizes) > accel_max_workgroup_size()) then
1353 message(1) = "Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
1354 message(2) = "The following condition should be fulfilled:"
1355 write(message(3), "(I8, A, I8)") product(lsizes), " <= ", accel_max_workgroup_size()
1356 message(4) = "This is an internal error, please contact the developers."
1357 call messages_fatal(4)
1358 end if
1359
1360 gsizes(1:3) = gsizes(1:3)/lsizes(1:3)
1361
1362 ! Maximum dimensions of the grid of thread block
1363 if (any(gsizes(1:3) > accel%max_grid_dim(1:3))) then
1364 message(1) = "Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
1365 message(2) = "The following conditions should be fulfilled:"
1366 write(message(3), "(A, I8, A, I10)") "Dim 1: ", gsizes(1), " <= ", accel%max_grid_dim(1)
1367 write(message(4), "(A, I8, A, I10)") "Dim 2: ", gsizes(2), " <= ", accel%max_grid_dim(2)
1368 write(message(5), "(A, I8, A, I10)") "Dim 3: ", gsizes(3), " <= ", accel%max_grid_dim(3)
1369 message(6) = "This is an internal error, please contact the developers."
1370 call messages_fatal(6)
1371 end if
1372
1373 call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), lsizes(1), kernel%cuda_shared_mem, kernel%arguments)
1374
1375 kernel%cuda_shared_mem = 0
1376#endif
1377
1378 end subroutine accel_kernel_run_8
1380 ! -----------------------------------------------
1381
1382 subroutine accel_kernel_run_4(kernel, globalsizes, localsizes)
1383 type(accel_kernel_t), intent(inout) :: kernel
1384 integer, intent(in) :: globalsizes(:)
1385 integer, intent(in) :: localsizes(:)
1386
1387 call accel_kernel_run_8(kernel, int(globalsizes, int64), int(localsizes, int64))
1388
1389 end subroutine accel_kernel_run_4
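 ! Typical launch-size computation for the run calls above (a sketch; np and kernel
 ! are assumed to come from the caller):
 !
 !   integer :: wgsize
 !   integer(int64) :: np, pnp
 !
 !   wgsize = accel_kernel_workgroup_size(kernel)
 !   pnp = ((np + wgsize - 1)/wgsize)*wgsize      ! round np up to a multiple of wgsize
 !   call accel_kernel_run(kernel, [pnp], [int(wgsize, int64)])
 !
 ! This satisfies the assertions above: localsizes > 0, localsizes <= the maximum
 ! workgroup size, and mod(globalsizes, localsizes) == 0.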
1390
1391 ! -----------------------------------------------
1392
1393 integer pure function accel_max_workgroup_size() result(max_workgroup_size)
1394 max_workgroup_size = accel%max_workgroup_size
1395 end function accel_max_workgroup_size
1396
1397 ! -----------------------------------------------
1398
1399 integer function accel_kernel_workgroup_size(kernel) result(workgroup_size)
1400 type(accel_kernel_t), intent(inout) :: kernel
1401
1402#ifdef HAVE_OPENCL
1403 integer(int64) :: workgroup_size8
1404 integer :: ierr
1405#endif
1406#ifdef HAVE_CUDA
1407 integer :: max_workgroup_size
1408#endif
1409
1410 workgroup_size = 0
1411
1412#ifdef HAVE_OPENCL
1413 call clgetkernelworkgroupinfo(kernel%kernel, accel%device%cl_device, cl_kernel_work_group_size, workgroup_size8, ierr)
1414 if (ierr /= cl_success) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
1415 workgroup_size = workgroup_size8
1416#endif
1417
1418#ifdef HAVE_CUDA
1419 call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_workgroup_size)
1420 if (debug%info .and. max_workgroup_size /= accel%max_workgroup_size) then
1421 write(message(1), "(A, I5, A, I5, A)") "This kernel can use at most ", max_workgroup_size, &
1422 " threads per block, fewer than the device maximum of ", accel%max_workgroup_size, "."
1423 call messages_info(1)
1424 end if
1425 ! recommended number of threads per block is 256 according to the CUDA best practice guide
1426 ! see https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#thread-and-block-heuristics
1427 workgroup_size = 256
1428 ! make sure we do not use more threads per block than available for this kernel
1429 workgroup_size = min(workgroup_size, max_workgroup_size)
1430#endif
1431
1432 end function accel_kernel_workgroup_size
1433
1434 ! -----------------------------------------------
1435
1436#ifdef HAVE_OPENCL
1437 subroutine opencl_build_program(prog, filename, flags)
1438 type(cl_program), intent(inout) :: prog
1439 character(len=*), intent(in) :: filename
1440 character(len=*), optional, intent(in) :: flags
1441
1442 character(len = 1000) :: string
1443 character(len = 256) :: share_string
1444 integer :: ierr, ierrlog, iunit, irec, newlen
1445
1446 push_sub(opencl_build_program)
1447
1448 string = '#include "'//trim(filename)//'"'
1449
1450 call messages_write("Building CL program '"//trim(filename)//"'.")
1451 call messages_info(debug_only=.true.)
1452
1453 prog = clcreateprogramwithsource(accel%context%cl_context, trim(string), ierr)
1454 if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateProgramWithSource")
1455
1456 ! build the compilation flags
1457 string='-w'
1458 ! full optimization
1459 string=trim(string)//' -cl-denorms-are-zero'
1460 ! The following flag gives an error with the Xeon Phi
1461 ! string=trim(string)//' -cl-strict-aliasing'
1462 string=trim(string)//' -cl-mad-enable'
1463 string=trim(string)//' -cl-unsafe-math-optimizations'
1464 string=trim(string)//' -cl-finite-math-only'
1465 string=trim(string)//' -cl-fast-relaxed-math'
1466
1467 share_string='-I'//trim(conf%share)//'/opencl/'
1468
1469 if (f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64")) then
1470 string = trim(string)//' -DEXT_KHR_FP64'
1471 else if (f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64")) then
1472 string = trim(string)//' -DEXT_AMD_FP64'
1473 else
1474 call messages_write('Octopus requires an OpenCL device with double-precision support.')
1475 call messages_fatal()
1476 end if
1478 if (accel_use_shared_mem()) then
1479 string = trim(string)//' -DSHARED_MEM'
1480 end if
1481
1482 if (present(flags)) then
1483 string = trim(string)//' '//trim(flags)
1484 end if
1485
1486 call messages_write("Debug info: compilation flags '"//trim(string), new_line = .true.)
1487 call messages_write(' '//trim(share_string)//"'.")
1488 call messages_info(debug_only=.true.)
1489
1490 string = trim(string)//' '//trim(share_string)
1491
1492 call clbuildprogram(prog, trim(string), ierr)
1493
1494 if(ierr /= cl_success) then
1495 call clgetprogrambuildinfo(prog, accel%device%cl_device, cl_program_build_log, string, ierrlog)
1496 if (ierrlog /= cl_success) call opencl_print_error(ierrlog, "clGetProgramBuildInfo")
1497
1498 ! CL_PROGRAM_BUILD_LOG seems to have a useless '\n' in it
1499 newlen = scan(string, achar(010), back = .true.) - 1
1500 if (newlen >= 0) string = string(1:newlen)
1501
1502 if (len(trim(string)) > 0) write(stderr, '(a)') trim(string)
1503
1504 call opencl_print_error(ierr, "clBuildProgram")
1505 end if
1506
1507 pop_sub(opencl_build_program)
1508 end subroutine opencl_build_program
1509#endif
1510
1511 ! -----------------------------------------------
1512#ifdef HAVE_OPENCL
1513 subroutine opencl_release_program(prog)
1514 type(cl_program), intent(inout) :: prog
1516 integer :: ierr
1517
1518 push_sub(opencl_release_program)
1519
1520 call clreleaseprogram(prog, ierr)
1521 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseProgram")
1522
1523 pop_sub(opencl_release_program)
1524 end subroutine opencl_release_program
1525#endif
1526
1527 ! -----------------------------------------------
1528
1529#ifdef HAVE_OPENCL
1530 subroutine opencl_release_kernel(prog)
1531 type(cl_kernel), intent(inout) :: prog
1532
1533 integer :: ierr
1534
1535 push_sub(opencl_release_kernel)
1536
1537#ifdef HAVE_OPENCL
1538 call clreleasekernel(prog, ierr)
1539 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseKernel")
1540#endif
1541
1542 pop_sub(opencl_release_kernel)
1543 end subroutine opencl_release_kernel
1544#endif
1545
1546#ifdef HAVE_OPENCL
1547 ! -----------------------------------------------
1548 subroutine opencl_create_kernel(kernel, prog, name)
1549 type(cl_kernel), intent(inout) :: kernel
1550 type(cl_program), intent(inout) :: prog
1551 character(len=*), intent(in) :: name
1552
1553 integer :: ierr
1554
1555 push_sub(opencl_create_kernel)
1556 call profiling_in("CL_BUILD_KERNEL", exclude = .true.)
1557
1558#ifdef HAVE_OPENCL
1559 kernel = clcreatekernel(prog, name, ierr)
1560 if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateKernel")
1561#endif
1562
1563 call profiling_out("CL_BUILD_KERNEL")
1564 pop_sub(opencl_create_kernel)
1565 end subroutine opencl_create_kernel
1566#endif
1567
1568 ! ------------------------------------------------
1569#ifdef HAVE_OPENCL
1570 subroutine opencl_print_error(ierr, name)
1571 integer, intent(in) :: ierr
1572 character(len=*), intent(in) :: name
1573
1574 character(len=40) :: errcode
1575
1576 push_sub(opencl_print_error)
1577
1578 select case (ierr)
1579 case (cl_success); errcode = 'CL_SUCCESS '
1580 case (cl_device_not_found); errcode = 'CL_DEVICE_NOT_FOUND '
1581 case (cl_device_not_available); errcode = 'CL_DEVICE_NOT_AVAILABLE '
1582 case (cl_compiler_not_available); errcode = 'CL_COMPILER_NOT_AVAILABLE '
1583 case (cl_mem_object_allocation_failure); errcode = 'CL_MEM_OBJECT_ALLOCATION_FAILURE '
1584 case (cl_out_of_resources); errcode = 'CL_OUT_OF_RESOURCES '
1585 case (cl_out_of_host_memory); errcode = 'CL_OUT_OF_HOST_MEMORY '
1586 case (cl_profiling_info_not_available); errcode = 'CL_PROFILING_INFO_NOT_AVAILABLE '
1587 case (cl_mem_copy_overlap); errcode = 'CL_MEM_COPY_OVERLAP '
1588 case (cl_image_format_mismatch); errcode = 'CL_IMAGE_FORMAT_MISMATCH '
1589 case (cl_image_format_not_supported); errcode = 'CL_IMAGE_FORMAT_NOT_SUPPORTED '
1590 case (cl_build_program_failure); errcode = 'CL_BUILD_PROGRAM_FAILURE '
1591 case (cl_map_failure); errcode = 'CL_MAP_FAILURE '
1592 case (cl_invalid_value); errcode = 'CL_INVALID_VALUE '
1593 case (cl_invalid_device_type); errcode = 'CL_INVALID_DEVICE_TYPE '
1594 case (cl_invalid_platform); errcode = 'CL_INVALID_PLATFORM '
1595 case (cl_invalid_device); errcode = 'CL_INVALID_DEVICE '
1596 case (cl_invalid_context); errcode = 'CL_INVALID_CONTEXT '
1597 case (cl_invalid_queue_properties); errcode = 'CL_INVALID_QUEUE_PROPERTIES '
1598 case (cl_invalid_command_queue); errcode = 'CL_INVALID_COMMAND_QUEUE '
1599 case (cl_invalid_host_ptr); errcode = 'CL_INVALID_HOST_PTR '
1600 case (cl_invalid_mem_object); errcode = 'CL_INVALID_MEM_OBJECT '
1601 case (cl_invalid_image_format_descriptor); errcode = 'CL_INVALID_IMAGE_FORMAT_DESCRIPTOR '
1602 case (cl_invalid_image_size); errcode = 'CL_INVALID_IMAGE_SIZE '
1603 case (cl_invalid_sampler); errcode = 'CL_INVALID_SAMPLER '
1604 case (cl_invalid_binary); errcode = 'CL_INVALID_BINARY '
1605 case (cl_invalid_build_options); errcode = 'CL_INVALID_BUILD_OPTIONS '
1606 case (cl_invalid_program); errcode = 'CL_INVALID_PROGRAM '
1607 case (cl_invalid_program_executable); errcode = 'CL_INVALID_PROGRAM_EXECUTABLE '
1608 case (cl_invalid_kernel_name); errcode = 'CL_INVALID_KERNEL_NAME '
1609 case (cl_invalid_kernel_definition); errcode = 'CL_INVALID_KERNEL_DEFINITION '
1610 case (cl_invalid_kernel); errcode = 'CL_INVALID_KERNEL '
1611 case (cl_invalid_arg_index); errcode = 'CL_INVALID_ARG_INDEX '
1612 case (cl_invalid_arg_value); errcode = 'CL_INVALID_ARG_VALUE '
1613 case (cl_invalid_arg_size); errcode = 'CL_INVALID_ARG_SIZE '
1614 case (cl_invalid_kernel_args); errcode = 'CL_INVALID_KERNEL_ARGS '
1615 case (cl_invalid_work_dimension); errcode = 'CL_INVALID_WORK_DIMENSION '
1616 case (cl_invalid_work_group_size); errcode = 'CL_INVALID_WORK_GROUP_SIZE '
1617 case (cl_invalid_work_item_size); errcode = 'CL_INVALID_WORK_ITEM_SIZE '
1618 case (cl_invalid_global_offset); errcode = 'CL_INVALID_GLOBAL_OFFSET '
1619 case (cl_invalid_event_wait_list); errcode = 'CL_INVALID_EVENT_WAIT_LIST '
1620 case (cl_invalid_event); errcode = 'CL_INVALID_EVENT '
1621 case (cl_invalid_operation); errcode = 'CL_INVALID_OPERATION '
1622 case (cl_invalid_gl_object); errcode = 'CL_INVALID_GL_OBJECT '
1623 case (cl_invalid_buffer_size); errcode = 'CL_INVALID_BUFFER_SIZE '
1624 case (cl_invalid_mip_level); errcode = 'CL_INVALID_MIP_LEVEL '
1625 case (cl_invalid_global_work_size); errcode = 'CL_INVALID_GLOBAL_WORK_SIZE '
1626 case (cl_platform_not_found_khr); errcode = 'CL_PLATFORM_NOT_FOUND_KHR'
1627 case default
1628 write(errcode, '(i10)') ierr
1629 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1630 end select
1631
1632 message(1) = 'OpenCL '//trim(name)//' '//trim(errcode)
1633 call messages_fatal(1)
1634
1635 pop_sub(opencl_print_error)
1636 end subroutine opencl_print_error
1637#endif
1638
1639 ! ----------------------------------------------------
1640
1641 subroutine clblas_print_error(ierr, name)
1642 integer, intent(in) :: ierr
1643 character(len=*), intent(in) :: name
1644
1645 character(len=40) :: errcode
1646
1647 push_sub(clblas_print_error)
1648#if defined(HAVE_CLBLAS) || defined(HAVE_CLBLAST)
1649 select case (ierr)
1650 case (clblassuccess); errcode = 'clblasSuccess'
1651 case (clblasinvalidvalue); errcode = 'clblasInvalidValue'
1652 case (clblasinvalidcommandqueue); errcode = 'clblasInvalidCommandQueue'
1653 case (clblasinvalidcontext); errcode = 'clblasInvalidContext'
1654 case (clblasinvalidmemobject); errcode = 'clblasInvalidMemObject'
1655 case (clblasinvaliddevice); errcode = 'clblasInvalidDevice'
1656 case (clblasinvalideventwaitlist); errcode = 'clblasInvalidEventWaitList'
1657 case (clblasoutofresources); errcode = 'clblasOutOfResources'
1658 case (clblasoutofhostmemory); errcode = 'clblasOutOfHostMemory'
1659 case (clblasinvalidoperation); errcode = 'clblasInvalidOperation'
1660 case (clblascompilernotavailable); errcode = 'clblasCompilerNotAvailable'
1661 case (clblasbuildprogramfailure); errcode = 'clblasBuildProgramFailure'
1662 case (clblasnotimplemented); errcode = 'clblasNotImplemented'
1663 case (clblasnotinitialized); errcode = 'clblasNotInitialized'
1664 case (clblasinvalidmata); errcode = 'clblasInvalidMatA'
1665 case (clblasinvalidmatb); errcode = 'clblasInvalidMatB'
1666 case (clblasinvalidmatc); errcode = 'clblasInvalidMatC'
1667 case (clblasinvalidvecx); errcode = 'clblasInvalidVecX'
1668 case (clblasinvalidvecy); errcode = 'clblasInvalidVecY'
1669 case (clblasinvaliddim); errcode = 'clblasInvalidDim'
1670 case (clblasinvalidleaddima); errcode = 'clblasInvalidLeadDimA'
1671 case (clblasinvalidleaddimb); errcode = 'clblasInvalidLeadDimB'
1672 case (clblasinvalidleaddimc); errcode = 'clblasInvalidLeadDimC'
1673 case (clblasinvalidincx); errcode = 'clblasInvalidIncX'
1674 case (clblasinvalidincy); errcode = 'clblasInvalidIncY'
1675 case (clblasinsufficientmemmata); errcode = 'clblasInsufficientMemMatA'
1676 case (clblasinsufficientmemmatb); errcode = 'clblasInsufficientMemMatB'
1677 case (clblasinsufficientmemmatc); errcode = 'clblasInsufficientMemMatC'
1678 case (clblasinsufficientmemvecx); errcode = 'clblasInsufficientMemVecX'
1679 case (clblasinsufficientmemvecy); errcode = 'clblasInsufficientMemVecY'
1680#ifdef HAVE_CLBLAST
1681 case (clblastinsufficientmemorytemp); errcode = 'clblastInsufficientMemoryTemp'
1682 case (clblastinvalidbatchcount); errcode = 'clblastInvalidBatchCount'
1683 case (clblastinvalidoverridekernel); errcode = 'clblastInvalidOverrideKernel'
1684 case (clblastmissingoverrideparameter); errcode = 'clblastMissingOverrideParameter'
1685 case (clblastinvalidlocalmemusage); errcode = 'clblastInvalidLocalMemUsage'
1686 case (clblastnohalfprecision); errcode = 'clblastNoHalfPrecision'
1687 case (clblastnodoubleprecision); errcode = 'clblastNoDoublePrecision'
1688 case (clblastinvalidvectorscalar); errcode = 'clblastInvalidVectorScalar'
1689 case (clblastinsufficientmemoryscalar); errcode = 'clblastInsufficientMemoryScalar'
1690 case (clblastdatabaseerror); errcode = 'clblastDatabaseError'
1691 case (clblastunknownerror); errcode = 'clblastUnknownError'
1692 case (clblastunexpectederror); errcode = 'clblastUnexpectedError'
1693#endif
1694
1695 case default
1696 write(errcode, '(i10)') ierr
1697 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1698 end select
1699#endif
1700
1701 message(1) = 'Error in calling clblas routine '//trim(name)//' : '//trim(errcode)
1702 call messages_fatal(1)
1703
1704 pop_sub(clblas_print_error)
1705 end subroutine clblas_print_error
1706
1707 ! ----------------------------------------------------
1708 subroutine clfft_print_error(ierr, name)
1709 integer, intent(in) :: ierr
1710 character(len=*), intent(in) :: name
1711
1712 character(len=40) :: errcode
1713
1714 push_sub(clfft_print_error)
1715#ifdef HAVE_CLFFT
1716 select case (ierr)
1717 case (clfft_invalid_global_work_size); errcode = 'CLFFT_INVALID_GLOBAL_WORK_SIZE'
1718 case (clfft_invalid_mip_level); errcode = 'CLFFT_INVALID_MIP_LEVEL'
1719 case (clfft_invalid_buffer_size); errcode = 'CLFFT_INVALID_BUFFER_SIZE'
1720 case (clfft_invalid_gl_object); errcode = 'CLFFT_INVALID_GL_OBJECT'
1721 case (clfft_invalid_operation); errcode = 'CLFFT_INVALID_OPERATION'
1722 case (clfft_invalid_event); errcode = 'CLFFT_INVALID_EVENT'
1723 case (clfft_invalid_event_wait_list); errcode = 'CLFFT_INVALID_EVENT_WAIT_LIST'
1724 case (clfft_invalid_global_offset); errcode = 'CLFFT_INVALID_GLOBAL_OFFSET'
1725 case (clfft_invalid_work_item_size); errcode = 'CLFFT_INVALID_WORK_ITEM_SIZE'
1726 case (clfft_invalid_work_group_size); errcode = 'CLFFT_INVALID_WORK_GROUP_SIZE'
1727 case (clfft_invalid_work_dimension); errcode = 'CLFFT_INVALID_WORK_DIMENSION'
1728 case (clfft_invalid_kernel_args); errcode = 'CLFFT_INVALID_KERNEL_ARGS'
1729 case (clfft_invalid_arg_size); errcode = 'CLFFT_INVALID_ARG_SIZE'
1730 case (clfft_invalid_arg_value); errcode = 'CLFFT_INVALID_ARG_VALUE'
1731 case (clfft_invalid_arg_index); errcode = 'CLFFT_INVALID_ARG_INDEX'
1732 case (clfft_invalid_kernel); errcode = 'CLFFT_INVALID_KERNEL'
1733 case (clfft_invalid_kernel_definition); errcode = 'CLFFT_INVALID_KERNEL_DEFINITION'
1734 case (clfft_invalid_kernel_name); errcode = 'CLFFT_INVALID_KERNEL_NAME'
1735 case (clfft_invalid_program_executable); errcode = 'CLFFT_INVALID_PROGRAM_EXECUTABLE'
1736 case (clfft_invalid_program); errcode = 'CLFFT_INVALID_PROGRAM'
1737 case (clfft_invalid_build_options); errcode = 'CLFFT_INVALID_BUILD_OPTIONS'
1738 case (clfft_invalid_binary); errcode = 'CLFFT_INVALID_BINARY'
1739 case (clfft_invalid_sampler); errcode = 'CLFFT_INVALID_SAMPLER'
1740 case (clfft_invalid_image_size); errcode = 'CLFFT_INVALID_IMAGE_SIZE'
1741 case (clfft_invalid_image_format_descriptor); errcode = 'CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR'
1742 case (clfft_invalid_mem_object); errcode = 'CLFFT_INVALID_MEM_OBJECT'
1743 case (clfft_invalid_host_ptr); errcode = 'CLFFT_INVALID_HOST_PTR'
1744 case (clfft_invalid_command_queue); errcode = 'CLFFT_INVALID_COMMAND_QUEUE'
1745 case (clfft_invalid_queue_properties); errcode = 'CLFFT_INVALID_QUEUE_PROPERTIES'
1746 case (clfft_invalid_context); errcode = 'CLFFT_INVALID_CONTEXT'
1747 case (clfft_invalid_device); errcode = 'CLFFT_INVALID_DEVICE'
1748 case (clfft_invalid_platform); errcode = 'CLFFT_INVALID_PLATFORM'
1749 case (clfft_invalid_device_type); errcode = 'CLFFT_INVALID_DEVICE_TYPE'
1750 case (clfft_invalid_value); errcode = 'CLFFT_INVALID_VALUE'
1751 case (clfft_map_failure); errcode = 'CLFFT_MAP_FAILURE'
1752 case (clfft_build_program_failure); errcode = 'CLFFT_BUILD_PROGRAM_FAILURE'
1753 case (clfft_image_format_not_supported); errcode = 'CLFFT_IMAGE_FORMAT_NOT_SUPPORTED'
1754 case (clfft_image_format_mismatch); errcode = 'CLFFT_IMAGE_FORMAT_MISMATCH'
1755 case (clfft_mem_copy_overlap); errcode = 'CLFFT_MEM_COPY_OVERLAP'
1756 case (clfft_profiling_info_not_available); errcode = 'CLFFT_PROFILING_INFO_NOT_AVAILABLE'
1757 case (clfft_out_of_host_memory); errcode = 'CLFFT_OUT_OF_HOST_MEMORY'
1758 case (clfft_out_of_resources); errcode = 'CLFFT_OUT_OF_RESOURCES'
1759 case (clfft_mem_object_allocation_failure); errcode = 'CLFFT_MEM_OBJECT_ALLOCATION_FAILURE'
1760 case (clfft_compiler_not_available); errcode = 'CLFFT_COMPILER_NOT_AVAILABLE'
1761 case (clfft_device_not_available); errcode = 'CLFFT_DEVICE_NOT_AVAILABLE'
1762 case (clfft_device_not_found); errcode = 'CLFFT_DEVICE_NOT_FOUND'
1763 case (clfft_success); errcode = 'CLFFT_SUCCESS'
1764 case (clfft_bugcheck); errcode = 'CLFFT_BUGCHECK'
1765 case (clfft_notimplemented); errcode = 'CLFFT_NOTIMPLEMENTED'
1766 case (clfft_file_not_found); errcode = 'CLFFT_FILE_NOT_FOUND'
1767 case (clfft_file_create_failure); errcode = 'CLFFT_FILE_CREATE_FAILURE'
1768 case (clfft_version_mismatch); errcode = 'CLFFT_VERSION_MISMATCH'
1769 case (clfft_invalid_plan); errcode = 'CLFFT_INVALID_PLAN'
1770 case (clfft_device_no_double); errcode = 'CLFFT_DEVICE_NO_DOUBLE'
1771 case (clfft_endstatus); errcode = 'CLFFT_ENDSTATUS'
1772 case default
1773 write(errcode, '(i10)') ierr
1774 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1775 end select
1776#endif
1777
1778 message(1) = 'clfft '//trim(name)//' '//trim(errcode)
1779 call messages_fatal(1)
1780
1781 pop_sub(clfft_print_error)
1782 end subroutine clfft_print_error
1783
1784 ! ----------------------------------------------------
1785
1786#ifdef HAVE_OPENCL
1787 logical function f90_cl_device_has_extension(device, extension) result(has)
1788 type(cl_device_id), intent(inout) :: device
1789 character(len=*), intent(in) :: extension
1790
1791 integer :: cl_status
1792 character(len=2048) :: all_extensions
1793
1794#ifdef HAVE_OPENCL
1795 call clgetdeviceinfo(device, cl_device_extensions, all_extensions, cl_status)
1796#endif
1797
1798 has = index(all_extensions, extension) /= 0
1799
1800 end function f90_cl_device_has_extension
1801#endif
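! Illustrative use of the extension check above (the extension name is only an
! example, not taken from this source):
!   if (f90_cl_device_has_extension(accel%device%cl_device, 'cl_khr_fp64')) then
!     ! the device advertises double-precision support
!   end if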
1802
1803 ! ---------------------------------------------------------
1804
1805 integer(int64) pure function opencl_pad(size, blk) result(pad)
1806 integer(int64), intent(in) :: size
1807 integer, intent(in) :: blk
1808
1809 integer(int64) :: mm
1810
1811 mm = mod(size, blk)
1812 if (mm == 0) then
1813 pad = size
1814 else
1815 pad = size + blk - mm
1816 end if
1817 end function opencl_pad
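! Worked example for opencl_pad (editorial illustration): the result is size
! rounded up to the next multiple of blk, e.g.
!   opencl_pad(1000_int64, 256) = 1024   (1000 = 3*256 + 232, padded up by 24)
!   opencl_pad(1024_int64, 256) = 1024   (already a multiple, returned unchanged)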
1818
1819 ! ----------------------------------------------------
1820
1821 subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
1822 type(accel_mem_t), intent(inout) :: buffer
1823 type(type_t), intent(in) :: type
1824 integer(int64), intent(in) :: nval
1825 integer(int64), optional, intent(in) :: offset
1826 logical, optional, intent(in) :: async
1827
1828 integer :: bsize
1829 integer(int64) :: nval_real, offset_real
1830
1831 push_sub(accel_set_buffer_to_zero_i8)
1832
1833 if (nval > 0) then
1834
1835 nval_real = nval
1836 if (type == type_cmplx) nval_real = nval_real * 2
1837 if (present(offset)) then
1838 offset_real = offset
1839 if (type == type_cmplx) offset_real = offset_real * 2
1840 else
1841 offset_real = 0_int64
1842 end if
1843
1844 assert(nval_real > 0)
1845
1846 call accel_set_kernel_arg(set_zero, 0, nval_real)
1847 call accel_set_kernel_arg(set_zero, 1, offset_real)
1848 call accel_set_kernel_arg(set_zero, 2, buffer)
1849
1850 bsize = accel_kernel_workgroup_size(set_zero)
1851
1852 call accel_kernel_run(set_zero, (/ opencl_pad(nval_real, bsize) /), (/ int(bsize, int64) /))
1853
1854 if (.not. optional_default(async, .false.)) call accel_finish()
1855
1856 end if
1857
1858 pop_sub(accel_set_buffer_to_zero_i8)
1859 end subroutine accel_set_buffer_to_zero_i8
1860
1861 ! ----------------------------------------------------
1862
1863 subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
1864 type(accel_mem_t), intent(inout) :: buffer
1865 type(type_t), intent(in) :: type
1866 integer(int32), intent(in) :: nval
1867 integer(int32), optional, intent(in) :: offset
1868 logical, optional, intent(in) :: async
1869
1870 push_sub(accel_set_buffer_to_zero_i4)
1871
1872 if (present(offset)) then
1873 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), int(offset, int64), async=async)
1874 else
1875 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), async=async)
1876 end if
1877
1878 pop_sub(accel_set_buffer_to_zero_i4)
1879 end subroutine accel_set_buffer_to_zero_i4
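! Illustrative calls (buf and n are hypothetical, not from this source): zero
! the first n complex entries of a device buffer,
!   call accel_set_buffer_to_zero_i4(buf, type_cmplx, n)                 ! blocks until the kernel has finished
!   call accel_set_buffer_to_zero_i4(buf, type_cmplx, n, async=.true.)   ! caller synchronizes later with accel_finish()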
1880
1881 ! ----------------------------------------------------
1882
1883 subroutine opencl_check_bandwidth()
1884 integer :: itime
1885 integer, parameter :: times = 10
1886 integer :: size
1887 real(real64) :: time, stime
1888 real(real64) :: read_bw, write_bw
1889 type(accel_mem_t) :: buff
1890 real(real64), allocatable :: data(:)
1891
1892 call messages_new_line()
1893 call messages_write('Info: Benchmarking the bandwidth between main memory and device memory')
1894 call messages_new_line()
1895 call messages_info()
1896
1897 call messages_write(' Buffer size Write bw Read bw')
1898 call messages_new_line()
1899 call messages_write(' [MiB] [MiB/s] [MiB/s]')
1900 call messages_info()
1901
1902 size = 15000
1903 do
1904 safe_allocate(data(1:size))
1905 call accel_create_buffer(buff, accel_mem_read_write, type_float, size)
1906
1907 stime = loct_clock()
1908 do itime = 1, times
1909 call accel_write_buffer(buff, size, data)
1910 call accel_finish()
1911 end do
1912 time = (loct_clock() - stime)/real(times, real64)
1913
1914 write_bw = real(size, real64) *8.0_real64/time
1915
1916 stime = loct_clock()
1917 do itime = 1, times
1918 call accel_read_buffer(buff, size, data)
1919 end do
1920 call accel_finish()
1921
1922 time = (loct_clock() - stime)/real(times, real64)
1923 read_bw = real(size, real64) *8.0_real64/time
1924
1925 call messages_write(size*8.0_real64/1024.0_real64**2)
1926 call messages_write(write_bw/1024.0_real64**2, fmt = '(f10.1)')
1927 call messages_write(read_bw/1024.0_real64**2, fmt = '(f10.1)')
1928 call messages_info()
1929
1930 call accel_release_buffer(buff)
1931
1932 safe_deallocate_a(data)
1933
1934 size = int(size*2.0)
1935
1936 if (size > 50000000) exit
1937 end do
1938 end subroutine opencl_check_bandwidth
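! Editorial note on the benchmark above: each element is a real64, so "size"
! elements correspond to size*8 bytes; dividing by the averaged transfer time
! gives bytes/s, and the final division by 1024**2 converts to MiB/s. For
! example, 2**20 doubles (8 MiB) transferred in 1 ms are reported as 8000 MiB/s.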
1939
1940 ! ----------------------------------------------------
1942 logical pure function accel_use_shared_mem() result(use_shared_mem)
1943
1944 use_shared_mem = accel%shared_mem
1945
1946 end function accel_use_shared_mem
1947
1948 !------------------------------------------------------------
1949
1950 subroutine accel_kernel_global_init()
1951
1952 push_sub(accel_kernel_global_init)
1953
1954 nullify(head)
1955
1956 call cuda_module_map_init(accel%module_map)
1957
1958 pop_sub(accel_kernel_global_init)
1959 end subroutine accel_kernel_global_init
1960
1961 !------------------------------------------------------------
1962
1963 subroutine accel_kernel_global_end()
1964 type(accel_kernel_t), pointer :: next_head
1965
1966 push_sub(accel_kernel_global_end)
1967
1968 do
1969 if (.not. associated(head)) exit
1970 next_head => head%next
1971 call accel_kernel_end(head)
1972 head => next_head
1973 end do
1974
1975 if (accel_is_enabled()) then
1976 call cuda_module_map_end(accel%module_map)
1977 end if
1978
1979 pop_sub(accel_kernel_global_end)
1980 end subroutine accel_kernel_global_end
1981
1982 !------------------------------------------------------------
1983
1984 subroutine accel_kernel_build(this, file_name, kernel_name, flags)
1985 type(accel_kernel_t), intent(inout) :: this
1986 character(len=*), intent(in) :: file_name
1987 character(len=*), intent(in) :: kernel_name
1988 character(len=*), optional, intent(in) :: flags
1989
1990#ifdef HAVE_OPENCL
1991 type(cl_program) :: prog
1992#endif
1993#ifdef HAVE_CUDA
1994 character(len=1000) :: all_flags
1995#endif
1996
1997 push_sub(accel_kernel_build)
1998
1999 call profiling_in("ACCEL_COMPILE", exclude = .true.)
2000
2001#ifdef HAVE_CUDA
2002 all_flags = '-I'//trim(conf%share)//'/opencl/'//" "//trim(accel%debug_flag)
2003
2004 if (accel_use_shared_mem()) then
2005 all_flags = trim(all_flags)//' -DSHARED_MEM'
2006 end if
2007
2008 if (present(flags)) then
2009 all_flags = trim(all_flags)//' '//trim(flags)
2010 end if
2011
2012 call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, trim(file_name), trim(all_flags))
2013
2014 call cuda_create_kernel(this%cuda_kernel, this%cuda_module, trim(kernel_name))
2015 call cuda_alloc_arg_array(this%arguments)
2016
2017 this%cuda_shared_mem = 0
2018#endif
2019
2020#ifdef HAVE_OPENCL
2021 call opencl_build_program(prog, trim(conf%share)//'/opencl/'//trim(file_name), flags = flags)
2022 call opencl_create_kernel(this%kernel, prog, trim(kernel_name))
2023 call opencl_release_program(prog)
2024#endif
2025
2026 this%initialized = .true.
2027 this%kernel_name = trim(kernel_name)
2029 call profiling_out("ACCEL_COMPILE")
2030
2031 pop_sub(accel_kernel_build)
2032 end subroutine accel_kernel_build
2033
2034 !------------------------------------------------------------
2035
2036 subroutine accel_kernel_end(this)
2037 type(accel_kernel_t), intent(inout) :: this
2038#ifdef HAVE_OPENCL
2039 integer :: ierr
2040#endif
2042 push_sub(accel_kernel_end)
2043
2044#ifdef HAVE_CUDA
2045 call cuda_free_arg_array(this%arguments)
2046 call cuda_release_kernel(this%cuda_kernel)
2047 ! modules are not released here, since they are not associated with a kernel
2048#endif
2049
2050#ifdef HAVE_OPENCL
2051 call clreleasekernel(this%kernel, ierr)
2052 if (ierr /= cl_success) call opencl_print_error(ierr, "release_kernel")
2053#endif
2054 this%initialized = .false.
2055
2056 pop_sub(accel_kernel_end)
2057 end subroutine accel_kernel_end
2058
2059 !------------------------------------------------------------
2060
2061 subroutine accel_kernel_start_call(this, file_name, kernel_name, flags)
2062 type(accel_kernel_t), target, intent(inout) :: this
2063 character(len=*), intent(in) :: file_name
2064 character(len=*), intent(in) :: kernel_name
2065 character(len=*), optional, intent(in) :: flags
2066
2067 push_sub(accel_kernel_start_call)
2068
2069 if (.not. this%initialized) then
2070 call accel_kernel_build(this, file_name, kernel_name, flags)
2071 this%next => head
2072 head => this
2073 end if
2074
2075 pop_sub(accel_kernel_start_call)
2076 end subroutine accel_kernel_start_call
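! Illustrative life cycle of a kernel handled by the routines above (sketch;
! the file name, kernel name, buffer and sizes are hypothetical, not from this
! source):
!   type(accel_kernel_t), save :: kern
!   call accel_kernel_start_call(kern, 'mykernel.cl', 'my_kernel')  ! compiled once, then cached in the global list
!   call accel_set_kernel_arg(kern, 0, some_buffer)
!   call accel_kernel_run(kern, (/ global_size /), (/ local_size /))
!   call accel_finish()
! All kernels registered this way are released by accel_kernel_global_end().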
2077
2078 !--------------------------------------------------------------
2079
2080 integer(int64) pure function accel_global_memory_size() result(size)
2081
2082 size = accel%global_memory_size
2083
2084 end function accel_global_memory_size
2085
2086 !--------------------------------------------------------------
2087
2088 integer(int64) pure function accel_local_memory_size() result(size)
2089
2090 size = accel%local_memory_size
2091
2092 end function accel_local_memory_size
2093
2094 !--------------------------------------------------------------
2095
2096 integer pure function accel_max_size_per_dim(dim) result(size)
2097 integer, intent(in) :: dim
2098
2099 size = 0
2100#ifdef HAVE_OPENCL
2101 size = 32768 ! The higher dimensions are arbitrarily set to 32768, since 2**30 becomes zero
2102 ! when multiplied by 2048 and converted to a 4-byte integer (2**30*2048 = 2**41, whose low 32 bits are all zero).
2103 if (dim == 1) size = 2**30
2104#endif
2105#ifdef HAVE_CUDA
2106 size = 32768
2107 if (dim == 1) size = 2**30
2108#endif
2109 end function accel_max_size_per_dim
2110
2111 ! ------------------------------------------------------
2112
2113 subroutine accel_set_stream(stream_number)
2114 integer, intent(in) :: stream_number
2115
2116 push_sub(accel_set_stream)
2117
2118 if (accel_is_enabled()) then
2119#ifdef HAVE_CUDA
2120 call cuda_set_stream(accel%cuda_stream, stream_number)
2121 call cublas_set_stream(accel%cublas_handle, accel%cuda_stream)
2122#endif
2123 end if
2124
2125 pop_sub(accel_set_stream)
2126 end subroutine accel_set_stream
2127
2128 ! ------------------------------------------------------
2129
2130 subroutine accel_get_stream(stream_number)
2131 integer, intent(inout) :: stream_number
2132
2133 push_sub(accel_get_stream)
2134
2135 if (accel_is_enabled()) then
2136#ifdef HAVE_CUDA
2137 call cuda_get_stream(stream_number)
2138#endif
2139 end if
2140
2141 pop_sub(accel_get_stream)
2142 end subroutine accel_get_stream
2143
2144 ! ------------------------------------------------------
2145
2146 subroutine accel_synchronize_all_streams()
2147 push_sub(accel_synchronize_all_streams)
2148
2149 if (accel_is_enabled()) then
2150#ifdef HAVE_CUDA
2151 call cuda_synchronize_all_streams()
2152#endif
2153 end if
2154
2155 pop_sub(accel_synchronize_all_streams)
2156 end subroutine accel_synchronize_all_streams
2157
2158 function daccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
2159 type(c_ptr), intent(in) :: buffer
2160 integer(int64), intent(in) :: offset
2161 type(c_ptr) :: buffer_offset
2162
2164#ifdef HAVE_CUDA
2165 call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)
2166#else
2167 ! this is needed to make the compiler happy for non-GPU compilations
2168 buffer_offset = buffer
2169#endif
2171 end function daccel_get_pointer_with_offset
2172
2173 function zaccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
2174 type(c_ptr), intent(in) :: buffer
2175 integer(int64), intent(in) :: offset
2176 type(c_ptr) :: buffer_offset
2177
2179#ifdef HAVE_CUDA
2180 call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)
2181#else
2182 ! this is needed to make the compiler happy for non-GPU compilations
2183 buffer_offset = buffer
2184#endif
2186 end function zaccel_get_pointer_with_offset
2187
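! Editorial note: the complex (z) variant shifts by 2*offset because one
! complex number occupies two real(real64) words in the underlying buffer, so
! an offset of k complex elements corresponds to 2*k real elements.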
2188 subroutine accel_clean_pointer(buffer)
2189 type(c_ptr), intent(in) :: buffer
2190
2191 push_sub(accel_clean_pointer)
2192#ifdef HAVE_CUDA
2193 call cuda_clean_pointer(buffer)
2194#endif
2195 pop_sub(accel_clean_pointer)
2196 end subroutine accel_clean_pointer
2197
2201 subroutine accel_get_unfolded_size(size, grid_size, thread_block_size)
2202 integer(int64), intent(in) :: size
2203 integer(int64), intent(out) :: grid_size
2204 integer(int64), intent(out) :: thread_block_size
2205
2206 push_sub(accel_get_unfolded_size)
2207#ifdef __HIP_PLATFORM_AMD__
2208 ! not beneficial for AMD chips
2209 grid_size = size
2210 thread_block_size = size
2211#else
2212 grid_size = size * accel%warp_size
2213 thread_block_size = accel%warp_size
2214#endif
2215 pop_sub(accel_get_unfolded_size)
2216 end subroutine accel_get_unfolded_size
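! Worked example (editorial illustration): with size = 1000 and a warp size of
! 32, the non-AMD branch returns grid_size = 32000 and thread_block_size = 32,
! i.e. every logical element is assigned a full warp of threads.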
2217
2218#include "undef.F90"
2219#include "real.F90"
2220#include "accel_inc.F90"
2221
2222#include "undef.F90"
2223#include "complex.F90"
2224#include "accel_inc.F90"
2225
2226#include "undef.F90"
2227#include "integer.F90"
2228#include "accel_inc.F90"
2229
2230#include "undef.F90"
2231#include "integer8.F90"
2232#include "accel_inc.F90"
2233
2234end module accel_oct_m
2235
2236!! Local Variables:
2237!! mode: f90
2238!! coding: utf-8
2239!! End: