accel.F90
1!! Copyright (C) 2010-2016 X. Andrade
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17!!
18
19#include "global.h"
20
21#if defined(HAVE_OPENCL) && defined(HAVE_CUDA)
22#error "Cannot compile with OpenCL and CUDA support at the same time"
23#endif
24
25#if defined(HAVE_OPENCL) || defined(HAVE_CUDA)
26#define HAVE_ACCEL 1
27#endif
28
29module accel_oct_m
31#ifdef HAVE_OPENCL
32 use cl
33#endif
34#if defined(HAVE_CLBLAS) || defined(HAVE_CLBLAST)
35 use clblas_oct_m
36#endif
37 use cuda_oct_m
38#ifdef HAVE_CLFFT
39 use clfft
40#endif
41 use debug_oct_m
42 use global_oct_m
43 use iso_c_binding, only: c_size_t
44 use, intrinsic :: iso_fortran_env
45 use loct_oct_m
47 use mpi_oct_m
49 use types_oct_m
50 use parser_oct_m
53
54 implicit none
55
56 private
57
58 public :: &
63 accel_t, &
66 accel_init, &
67 accel_end, &
96
97#ifdef HAVE_OPENCL
98 integer, public, parameter :: &
99 ACCEL_MEM_READ_ONLY = cl_mem_read_only, &
100 ACCEL_MEM_READ_WRITE = cl_mem_read_write, &
101 ACCEL_MEM_WRITE_ONLY = cl_mem_write_only
102#else
103 integer, public, parameter :: &
104 ACCEL_MEM_READ_ONLY = 0, &
107#endif
108
109 type accel_context_t
110 ! Components are public by default
111#ifdef HAVE_OPENCL
112 type(cl_context) :: cl_context
113#elif defined(HAVE_CUDA)
114 type(c_ptr) :: cuda_context
115#else
116 integer :: dummy
117#endif
118 end type accel_context_t
119
120 type accel_device_t
121 ! Components are public by default
122#ifdef HAVE_OPENCL
123 type(cl_device_id) :: cl_device
124#elif defined(HAVE_CUDA)
125 type(c_ptr) :: cuda_device
126#else
127 integer :: dummy
128#endif
129 end type accel_device_t
130
131 type accel_t
132 ! Components are public by default
133 type(accel_context_t) :: context
134 type(accel_device_t) :: device
135#ifdef HAVE_OPENCL
136 type(cl_command_queue) :: command_queue
137#endif
138 type(c_ptr) :: cublas_handle
139 type(c_ptr) :: cuda_stream
140 type(c_ptr) :: module_map
141 integer :: max_workgroup_size
142 integer(int64) :: local_memory_size
143 integer(int64) :: global_memory_size
144 logical :: enabled
145 logical :: allow_CPU_only
146 logical :: shared_mem
147 logical :: cuda_mpi
148 integer :: warp_size
149 logical :: initialize_buffers
150 character(len=32) :: debug_flag
151 integer(int64) :: max_block_dim(3)
152 integer(int64) :: max_grid_dim(3)
153 end type accel_t
154
155 type accel_mem_t
156 ! Components are public by default
157#ifdef HAVE_OPENCL
158 type(cl_mem) :: mem
159#else
160 type(c_ptr) :: mem
161#endif
162 integer(c_size_t) :: size = 0
163 type(type_t) :: type
164 integer :: flags = 0
165 logical :: allocated = .false.
166 end type accel_mem_t
167
168 type accel_kernel_t
169 ! Components are public by default
170#ifdef HAVE_OPENCL
171 type(cl_kernel) :: kernel
172#endif
173#ifdef HAVE_CUDA
174 type(c_ptr) :: cuda_kernel
175 type(c_ptr) :: cuda_module
176 type(c_ptr) :: arguments
177#endif
178 integer(int64) :: cuda_shared_mem
179 logical :: initialized = .false.
180 type(accel_kernel_t), pointer :: next
181 integer :: arg_count
182 character(len=128) :: kernel_name
183 end type accel_kernel_t
184
185 type(accel_t), public :: accel
186
187 ! the kernels
188 type(accel_kernel_t), public, target, save :: kernel_vpsi
189 type(accel_kernel_t), public, target, save :: kernel_vpsi_complex
190 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
191 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors_complex
192 type(accel_kernel_t), public, target, save :: kernel_daxpy
193 type(accel_kernel_t), public, target, save :: kernel_zaxpy
194 type(accel_kernel_t), public, target, save :: kernel_copy
195 type(accel_kernel_t), public, target, save :: dpack
196 type(accel_kernel_t), public, target, save :: zpack
197 type(accel_kernel_t), public, target, save :: dunpack
198 type(accel_kernel_t), public, target, save :: zunpack
199 type(accel_kernel_t), public, target, save :: kernel_ghost_reorder
200 type(accel_kernel_t), public, target, save :: kernel_density_real
201 type(accel_kernel_t), public, target, save :: kernel_density_complex
202 type(accel_kernel_t), public, target, save :: kernel_density_spinors
203 type(accel_kernel_t), public, target, save :: kernel_phase
204 type(accel_kernel_t), public, target, save :: kernel_phase_spiral
205 type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
206 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
207 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors
208 type(accel_kernel_t), public, target, save :: dkernel_batch_axpy
209 type(accel_kernel_t), public, target, save :: zkernel_batch_axpy
210 type(accel_kernel_t), public, target, save :: dkernel_ax_function_py
211 type(accel_kernel_t), public, target, save :: zkernel_ax_function_py
212 type(accel_kernel_t), public, target, save :: dkernel_batch_dotp
213 type(accel_kernel_t), public, target, save :: zkernel_batch_dotp
214 type(accel_kernel_t), public, target, save :: dzmul
215 type(accel_kernel_t), public, target, save :: zzmul
216 type(accel_kernel_t), public, target, save :: set_one
217
218 ! kernels used locally
219 type(accel_kernel_t), target, save :: set_zero
220 type(accel_kernel_t), target, save :: set_zero_int
222 interface accel_padded_size
224 end interface accel_padded_size
232 end interface accel_kernel_run
238 interface accel_write_buffer
252 end interface accel_write_buffer
254 interface accel_read_buffer
263 end interface accel_read_buffer
266 module procedure &
291 integer, parameter :: &
298 integer, parameter :: &
302 cl_plat_ati = -4, &
303 cl_plat_intel = -5
305 ! a "convenience" public variable
306 integer, public :: cl_status
308 integer :: buffer_alloc_count
309 integer(int64) :: allocated_mem
310 type(accel_kernel_t), pointer :: head
311 type(alloc_cache_t) :: memcache
312
313contains
314
315 pure logical function accel_is_enabled() result(enabled)
316#ifdef HAVE_ACCEL
317 enabled = accel%enabled
318#else
319 enabled = .false.
320#endif
321 end function accel_is_enabled
322
323 ! ------------------------------------------
324
325 pure logical function accel_allow_cpu_only() result(allow)
326#ifdef HAVE_ACCEL
327 allow = accel%allow_CPU_only
328#else
329 allow = .true.
330#endif
331 end function accel_allow_cpu_only
332
333 ! ------------------------------------------
334
335 subroutine accel_init(base_grp, namespace)
336 type(mpi_grp_t), intent(inout) :: base_grp
337 type(namespace_t), intent(in) :: namespace
338
339 logical :: disable, default, run_benchmark
340 integer :: idevice, iplatform
341#ifdef HAVE_OPENCL
342 integer :: device_type
343 integer :: cl_status, idev
344 integer :: ndevices, ret_devices, nplatforms, iplat
345 character(len=256) :: device_name
346 type(cl_platform_id) :: platform_id
347 type(cl_program) :: prog
348 type(cl_platform_id), allocatable :: allplatforms(:)
349 type(cl_device_id), allocatable :: alldevices(:)
350#endif
351#ifdef HAVE_CUDA
352 integer :: dim
353#ifdef HAVE_MPI
354 character(len=256) :: sys_name
355#endif
356#endif
357
358 push_sub(accel_init)
359
360 buffer_alloc_count = 0
361
362 !%Variable DisableAccel
363 !%Type logical
364 !%Default yes
365 !%Section Execution::Accel
366 !%Description
367 !% If Octopus was compiled with OpenCL or CUDA support, it will
368 !% try to initialize and use an accelerator device. By setting this
369 !% variable to <tt>yes</tt> you force Octopus not to use an accelerator even if one is available.
370 !%End
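    ! Example input (a minimal sketch, assuming the usual Octopus input-file syntax):
    !
    !   DisableAccel = yes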
371 call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel')
372#ifdef HAVE_ACCEL
373 default = .false.
374#else
375 default = .true.
376#endif
377 call parse_variable(namespace, 'DisableAccel', default, disable)
378 accel%enabled = .not. disable
379
380#ifndef HAVE_ACCEL
381 if (accel%enabled) then
382 message(1) = 'Octopus was compiled without OpenCL or CUDA support.'
384 end if
385#endif
386
387 if (.not. accel_is_enabled()) then
388 pop_sub(accel_init)
389 return
390 end if
392 !%Variable AccelPlatform
393 !%Type integer
394 !%Default 0
395 !%Section Execution::Accel
396 !%Description
397 !% This variable selects the OpenCL platform that Octopus will
398 !% use. You can give an explicit platform number or use one of
399 !% the options that select a particular vendor
400 !% implementation. Platform 0 is used by default.
401 !%
402 !% This variable has no effect for CUDA.
403 !%Option amd -2
404 !% Use the AMD OpenCL platform.
405 !%Option nvidia -3
406 !% Use the Nvidia OpenCL platform.
407 !%Option ati -4
408 !% Use the ATI (old AMD) OpenCL platform.
409 !%Option intel -5
410 !% Use the Intel OpenCL platform.
411 !%End
412 call parse_variable(namespace, 'AccelPlatform', 0, iplatform)
413
414 call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform')
415
416 !%Variable AccelDevice
417 !%Type integer
418 !%Default gpu
419 !%Section Execution::Accel
420 !%Description
421 !% This variable selects the OpenCL or CUDA accelerator device
422 !% that Octopus will use. You can specify one of the options below
423 !% or a numerical id to select a specific device.
424 !%
425 !% Values >= 0 select the device to be used. In MPI-enabled runs,
426 !% devices are distributed among ranks in a round-robin fashion, starting at this value.
427 !%Option gpu -1
428 !% If available, Octopus will use a GPU.
429 !%Option cpu -2
430 !% If available, Octopus will use a CPU (only for OpenCL).
431 !%Option accelerator -3
432 !% If available, Octopus will use an accelerator (only for OpenCL).
433 !%Option accel_default -4
434 !% Octopus will use the default device specified by the implementation.
436 !%End
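    ! Example input (a sketch, assuming the usual Octopus input-file syntax; the explicit
    ! id 0 is only an illustration):
    !
    !   AccelDevice = gpu
    !   # or, to pin a specific device:
    !   AccelDevice = 0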
437 call parse_variable(namespace, 'AccelDevice', opencl_gpu, idevice)
438
439 call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice')
440
441 if (idevice < opencl_default) then
442 call messages_write('Invalid AccelDevice')
443 call messages_fatal()
444 end if
445
446 call messages_print_with_emphasis(msg="GPU acceleration", namespace=namespace)
447
448#ifdef HAVE_CUDA
449 if (idevice<0) idevice = 0
450 call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
451 idevice, base_grp%rank)
452#ifdef HAVE_MPI
453 call loct_sysname(sys_name)
454 write(message(1), '(A,I5,A,I5,2A)') "Rank ", base_grp%rank, " uses device number ", idevice, &
455 " on ", trim(sys_name)
456 call messages_info(1, all_nodes = .true.)
457#endif
458
459 ! shared memory is supported through our CUDA interface
460 accel%shared_mem = .true.
461
462 call cublas_init(accel%cublas_handle, accel%cuda_stream)
463#endif
464
465#ifdef HAVE_OPENCL
466 call profiling_in('CL_INIT')
467
468 call clgetplatformids(nplatforms, cl_status)
469 if (cl_status /= cl_success) call opencl_print_error(cl_status, "GetPlatformIDs")
470
471 safe_allocate(allplatforms(1:nplatforms))
472
473 call clgetplatformids(allplatforms, iplat, cl_status)
474 if (cl_status /= cl_success) call opencl_print_error(cl_status, "GetPlatformIDs")
475
476 call messages_write('Info: Available CL platforms: ')
477 call messages_write(nplatforms)
478 call messages_info()
479
480 do iplat = 1, nplatforms
481
482 call clgetplatforminfo(allplatforms(iplat), cl_platform_name, device_name, cl_status)
483
484 if (iplatform < 0) then
485 if (iplatform == get_platform_id(device_name)) iplatform = iplat - 1
486 end if
487
488 if (iplatform == iplat - 1) then
489 call messages_write(' * Platform ')
490 else
491 call messages_write(' Platform ')
492 end if
493
494 call messages_write(iplat - 1)
495 call messages_write(' : '//device_name)
496 call clgetplatforminfo(allplatforms(iplat), cl_platform_version, device_name, cl_status)
497 call messages_write(' ('//trim(device_name)//')')
498 call messages_info()
499 end do
500
501 call messages_info()
502
503 if (iplatform >= nplatforms .or. iplatform < 0) then
504 call messages_write('Requested CL platform does not exist')
505 if (iplatform > 0) then
506 call messages_write('(platform = ')
507 call messages_write(iplatform)
508 call messages_write(').')
509 end if
510 call messages_fatal()
511 end if
512
513 platform_id = allplatforms(iplatform + 1)
514
515 safe_deallocate_a(allplatforms)
516
517 call clgetdeviceids(platform_id, cl_device_type_all, ndevices, cl_status)
518
519 call messages_write('Info: Available CL devices: ')
520 call messages_write(ndevices)
521 call messages_info()
522
523 safe_allocate(alldevices(1:ndevices))
524
525 ! list all devices
526
527 call clgetdeviceids(platform_id, cl_device_type_all, alldevices, ret_devices, cl_status)
528
529 do idev = 1, ndevices
530 call messages_write(' Device ')
531 call messages_write(idev - 1)
532 call clgetdeviceinfo(alldevices(idev), cl_device_name, device_name, cl_status)
533 call messages_write(' : '//device_name)
534 call messages_info()
535 end do
536
537 select case (idevice)
538 case (opencl_gpu)
539 device_type = cl_device_type_gpu
540 case (opencl_cpu)
541 device_type = cl_device_type_cpu
542 case (opencl_accelerator)
543 device_type = cl_device_type_accelerator
544 case (opencl_default)
545 device_type = cl_device_type_default
546 case default
547 device_type = cl_device_type_all
548 end select
549
550 ! now get a list of the selected type
551 call clgetdeviceids(platform_id, device_type, alldevices, ret_devices, cl_status)
552
553 if (ret_devices < 1) then
554 ! we didn't find a device of the selected type, so we ask for the default device
555 call clgetdeviceids(platform_id, cl_device_type_default, alldevices, ret_devices, cl_status)
556
557 if (ret_devices < 1) then
558 ! if this does not work, we ask for all devices
559 call clgetdeviceids(platform_id, cl_device_type_all, alldevices, ret_devices, cl_status)
560 end if
561
562 if (ret_devices < 1) then
563 call messages_write('Cannot find an OpenCL device')
564 call messages_fatal()
565 end if
566 end if
567
568 ! the number of returned devices can be smaller than the number queried above
569 ndevices = ret_devices
570
571 if (idevice < 0) then
572 if (base_grp%size > 1) then
573 ! with MPI we have to select the device so multiple GPUs in one
574 ! node are correctly distributed
575 call select_device(idevice)
576 else
577 idevice = 0
578 end if
579 end if
580
581 if (idevice >= ndevices) then
582 call messages_write('Requested CL device does not exist (device = ')
583 call messages_write(idevice)
584 call messages_write(', platform = ')
585 call messages_write(iplatform)
586 call messages_write(').')
587 call messages_fatal()
588 end if
589
590 accel%device%cl_device = alldevices(idevice + 1)
591
592 ! create the context
593 accel%context%cl_context = clcreatecontext(platform_id, accel%device%cl_device, cl_status)
594 if (cl_status /= cl_success) call opencl_print_error(cl_status, "CreateContext")
595
596 safe_deallocate_a(alldevices)
597
598 accel%command_queue = clcreatecommandqueue(accel%context%cl_context, accel%device%cl_device, &
599 cl_queue_profiling_enable, cl_status)
600 if (cl_status /= cl_success) call opencl_print_error(cl_status, "CreateCommandQueue")
601
602 call clgetdeviceinfo(accel%device%cl_device, cl_device_type, device_type, cl_status)
603
604 select case (device_type)
605 case (cl_device_type_gpu)
606 accel%shared_mem = .true.
607 case (cl_device_type_cpu, cl_device_type_accelerator)
608 accel%shared_mem = .false.
609 case default
610 accel%shared_mem = .false.
611 end select
612
613#ifdef HAVE_CLBLAS
614 call clblassetup(cl_status)
615 if (cl_status /= clblassuccess) call clblas_print_error(cl_status, 'clblasSetup')
616#endif
617
618#ifdef HAVE_CLFFT
619 call clfftsetup(cl_status)
620 if (cl_status /= clfft_success) call clfft_print_error(cl_status, 'clfftSetup')
621#endif
622
623 call profiling_out('CL_INIT')
624#endif
625
626 ! Get some device information that we will need later
627
628 ! total memory
629#ifdef HAVE_OPENCL
630 call clgetdeviceinfo(accel%device%cl_device, cl_device_global_mem_size, accel%global_memory_size, cl_status)
631 call clgetdeviceinfo(accel%device%cl_device, cl_device_local_mem_size, accel%local_memory_size, cl_status)
632 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_work_group_size, accel%max_workgroup_size, cl_status)
633 accel%warp_size = 1
634#endif
635#ifdef HAVE_CUDA
636 call cuda_device_total_memory(accel%device%cuda_device, accel%global_memory_size)
637 call cuda_device_shared_memory(accel%device%cuda_device, accel%local_memory_size)
638 call cuda_device_max_threads_per_block(accel%device%cuda_device, accel%max_workgroup_size)
639 call cuda_device_get_warpsize(accel%device%cuda_device, accel%warp_size)
640 call cuda_device_max_block_dim_x(accel%device%cuda_device, dim)
641 accel%max_block_dim(1) = int(dim, int64)
642 call cuda_device_max_block_dim_y(accel%device%cuda_device, dim)
643 accel%max_block_dim(2) = int(dim, int64)
644 call cuda_device_max_block_dim_z(accel%device%cuda_device, dim)
645 accel%max_block_dim(3) = int(dim, int64)
646 call cuda_device_max_grid_dim_x(accel%device%cuda_device, dim)
647 accel%max_grid_dim(1) = int(dim, int64)
648 call cuda_device_max_grid_dim_y(accel%device%cuda_device, dim)
649 accel%max_grid_dim(2) = int(dim, int64)
650 call cuda_device_max_grid_dim_z(accel%device%cuda_device, dim)
651 accel%max_grid_dim(3) = int(dim, int64)
652#endif
653
654 if (mpi_grp_is_root(base_grp)) call device_info()
655
656 ! initialize the cache used to speed up allocations
657 call alloc_cache_init(memcache, nint(0.25_real64*accel%global_memory_size, int64))
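    ! (that is, the cache may hold at most roughly one quarter of the device's global memory)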
658
659 ! now initialize the kernels
661
662#if defined(HAVE_HIP)
663 accel%debug_flag = "-g"
664#elif defined(HAVE_CUDA)
665 accel%debug_flag = "-lineinfo"
666#elif defined(HAVE_OPENCL)
667 accel%debug_flag = "-g"
668#endif
669
670 call accel_kernel_start_call(set_zero, 'set_zero.cl', "set_zero")
671 call accel_kernel_start_call(set_zero_int, 'set_zero.cl', "set_zero_int")
672 call accel_kernel_start_call(set_one, 'set_one.cl', "set_one")
673 call accel_kernel_start_call(kernel_vpsi, 'vpsi.cl', "vpsi")
674 call accel_kernel_start_call(kernel_vpsi_complex, 'vpsi.cl', "vpsi_complex")
675 call accel_kernel_start_call(kernel_vpsi_spinors, 'vpsi.cl', "vpsi_spinors")
676 call accel_kernel_start_call(kernel_vpsi_spinors_complex, 'vpsi.cl', "vpsi_spinors_complex")
677 call accel_kernel_start_call(kernel_daxpy, 'axpy.cl', "daxpy", flags = '-DRTYPE_DOUBLE')
678 call accel_kernel_start_call(kernel_zaxpy, 'axpy.cl', "zaxpy", flags = '-DRTYPE_COMPLEX')
679 call accel_kernel_start_call(dkernel_batch_axpy, 'axpy.cl', "dbatch_axpy_function", &
680 flags = '-DRTYPE_DOUBLE')
681 call accel_kernel_start_call(zkernel_batch_axpy, 'axpy.cl', "zbatch_axpy_function", &
682 flags = '-DRTYPE_COMPLEX')
683 call accel_kernel_start_call(dkernel_ax_function_py, 'axpy.cl', "dbatch_ax_function_py", &
684 flags = '-DRTYPE_DOUBLE')
685 call accel_kernel_start_call(zkernel_ax_function_py, 'axpy.cl', "zbatch_ax_function_py", &
686 flags = '-DRTYPE_COMPLEX')
687 call accel_kernel_start_call(dkernel_batch_dotp, 'mesh_batch_single.cl', "dbatch_mf_dotp")
688 call accel_kernel_start_call(zkernel_batch_dotp, 'mesh_batch_single.cl', "zbatch_mf_dotp")
689 call accel_kernel_start_call(dpack, 'pack.cl', "dpack")
690 call accel_kernel_start_call(zpack, 'pack.cl', "zpack")
691 call accel_kernel_start_call(dunpack, 'pack.cl', "dunpack")
692 call accel_kernel_start_call(zunpack, 'pack.cl', "zunpack")
693 call accel_kernel_start_call(kernel_copy, 'copy.cl', "copy")
694 call accel_kernel_start_call(kernel_ghost_reorder, 'ghost.cl', "ghost_reorder")
695 call accel_kernel_start_call(kernel_density_real, 'density.cl', "density_real")
696 call accel_kernel_start_call(kernel_density_complex, 'density.cl', "density_complex")
697 call accel_kernel_start_call(kernel_density_spinors, 'density.cl', "density_spinors")
698 call accel_kernel_start_call(kernel_phase, 'phase.cl', "phase")
699 call accel_kernel_start_call(dkernel_dot_matrix, 'mesh_batch.cl', "ddot_matrix")
700 call accel_kernel_start_call(zkernel_dot_matrix, 'mesh_batch.cl', "zdot_matrix")
701 call accel_kernel_start_call(zkernel_dot_matrix_spinors, 'mesh_batch.cl', "zdot_matrix_spinors")
702
703
704 call accel_kernel_start_call(dzmul, 'mul.cl', "dzmul", flags = '-DRTYPE_DOUBLE')
705 call accel_kernel_start_call(zzmul, 'mul.cl', "zzmul", flags = '-DRTYPE_COMPLEX')
706
707 !%Variable AccelBenchmark
708 !%Type logical
709 !%Default no
710 !%Section Execution::Accel
711 !%Description
712 !% If this variable is set to yes, Octopus will run some
713 !% routines to benchmark the performance of the accelerator device.
714 !%End
715 call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark)
716
717 call messages_obsolete_variable(namespace, 'OpenCLBenchmark', 'AccelBenchmark')
718
719 if (run_benchmark) then
721 end if
722
723 !%Variable GPUAwareMPI
724 !%Type logical
725 !%Section Execution::Accel
726 !%Description
727 !% If Octopus was compiled with GPU support and MPI support and if the MPI
728 !% implementation is GPU-aware (i.e., it supports communication using device pointers),
729 !% this switch can be set to true to use the GPU-aware MPI features. The advantage
730 !% of this approach is that it can do, e.g., peer-to-peer copies between devices without
731 !% going through the host memory.
732 !% The default is false, except when the configure switch --enable-cudampi is set, in which
733 !% case this variable is set to true.
734 !%End
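    ! Example (a sketch, assuming the usual Octopus input-file syntax and an MPI library
    ! built with GPU-awareness enabled):
    !
    !   GPUAwareMPI = yes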
735#ifdef HAVE_CUDA_MPI
736 default = .true.
737#else
738 default = .false.
739#endif
740 call parse_variable(namespace, 'GPUAwareMPI', default, accel%cuda_mpi)
741 if (accel%cuda_mpi) then
742#ifndef HAVE_CUDA_MPI
743 call messages_write("Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")
744 call messages_warning()
745#endif
746 call messages_write("Using GPU-aware MPI.")
747 call messages_info()
748 end if
749
750
751 !%Variable AllowCPUonly
752 !%Type logical
753 !%Section Execution::Accel
754 !%Description
755 !% In order to prevent waste of resources, the code will normally stop when the GPU is disabled due to
756 !% incomplete implementations or incompatibilities. Setting AllowCPUonly = yes overrides this and
757 !% allows execution to continue on the CPU in these cases.
758 !%End
759#if defined (HAVE_ACCEL)
760 default = .false.
761#else
762 default = .true.
763#endif
764 call parse_variable(namespace, 'AllowCPUonly', default, accel%allow_CPU_only)
765
766
767 !%Variable InitializeGPUBuffers
768 !%Type logical
769 !%Section Execution::Accel
770 !%Description
771 !% Initialize new GPU buffers to zero on creation (use only for debugging, as it has a performance impact!).
772 !%End
773 call parse_variable(namespace, 'InitializeGPUBuffers', .false., accel%initialize_buffers)
774
775
776 call messages_print_with_emphasis(namespace=namespace)
777
778 pop_sub(accel_init)
779
780 contains
781
782#if defined(HAVE_OPENCL)
783 subroutine select_device(idevice)
784 integer, intent(inout) :: idevice
785 integer :: irank
786 character(len=256) :: device_name
787
788 push_sub(accel_init.select_device)
789
790 idevice = mod(base_grp%rank, ndevices)
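      ! For example, with 4 MPI ranks on a node exposing 2 CL devices, ranks 0 and 2
      ! are mapped to device 0 and ranks 1 and 3 to device 1.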
791
792 call base_grp%barrier()
793 call messages_write('Info: CL device distribution:')
794 call messages_info()
795 do irank = 0, base_grp%size - 1
796 if (irank == base_grp%rank) then
797 call clgetdeviceinfo(alldevices(idevice + 1), cl_device_name, device_name, cl_status)
798 call messages_write(' MPI node ')
799 call messages_write(base_grp%rank)
800 call messages_write(' -> CL device ')
801 call messages_write(idevice)
802 call messages_write(' : '//device_name)
803 call messages_info(all_nodes = .true.)
804 end if
805 call base_grp%barrier()
806 end do
807
809 end subroutine select_device
810#endif
811
812 subroutine device_info()
813#ifdef HAVE_OPENCL
814 integer(int64) :: val
815#endif
816#ifdef HAVE_CUDA
817 integer :: version
818#endif
819 integer :: major, minor
820 character(len=256) :: val_str
821
822 push_sub(accel_init.device_info)
823
824 call messages_new_line()
825 call messages_write('Selected device:')
826 call messages_new_line()
827
828#ifdef HAVE_OPENCL
829 call messages_write(' Framework : OpenCL')
830#endif
831#ifdef HAVE_CUDA
832#ifdef __HIP_PLATFORM_AMD__
833 call messages_write(' Framework : ROCm')
834#else
835 call messages_write(' Framework : CUDA')
836#endif
837#endif
838 call messages_info()
839
840#ifdef HAVE_CUDA
841 call messages_write(' Device type : GPU', new_line = .true.)
842#ifdef __HIP_PLATFORM_AMD__
843 call messages_write(' Device vendor : AMD Corporation', new_line = .true.)
844#else
845 call messages_write(' Device vendor : NVIDIA Corporation', new_line = .true.)
846#endif
847#endif
848
849#ifdef HAVE_OPENCL
850 call clgetdeviceinfo(accel%device%cl_device, cl_device_type, val, cl_status)
851 call messages_write(' Device type :')
852 select case (int(val, int32))
853 case (cl_device_type_gpu)
854 call messages_write(' GPU')
855 case (cl_device_type_cpu)
856 call messages_write(' CPU')
857 case (cl_device_type_accelerator)
858 call messages_write(' accelerator')
859 end select
860 call messages_new_line()
862 call clgetdeviceinfo(accel%device%cl_device, cl_device_vendor, val_str, cl_status)
863 call messages_write(' Device vendor : '//trim(val_str))
864 call messages_new_line()
865#endif
866
867#ifdef HAVE_OPENCL
868 call clgetdeviceinfo(accel%device%cl_device, cl_device_name, val_str, cl_status)
869#endif
870#ifdef HAVE_CUDA
871 call cuda_device_name(accel%device%cuda_device, val_str)
872#endif
873 call messages_write(' Device name : '//trim(val_str))
874 call messages_new_line()
875
876#ifdef HAVE_CUDA
877 call cuda_device_capability(accel%device%cuda_device, major, minor)
878#endif
879 call messages_write(' Cuda capabilities :')
880 call messages_write(major, fmt = '(i2)')
881 call messages_write('.')
882 call messages_write(minor, fmt = '(i1)')
883 call messages_new_line()
884
885 ! VERSION
886#ifdef HAVE_OPENCL
887 call clgetdeviceinfo(accel%device%cl_device, cl_driver_version, val_str, cl_status)
888 call messages_write(' Driver version : '//trim(val_str))
889#endif
890#ifdef HAVE_CUDA
891 call cuda_driver_version(version)
892 call messages_write(' Driver version : ')
893 call messages_write(version)
894#endif
895 call messages_new_line()
896
897
898#ifdef HAVE_OPENCL
899 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_compute_units, val, cl_status)
900 call messages_write(' Compute units :')
901 call messages_write(val)
902 call messages_new_line()
903
904 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_clock_frequency, val, cl_status)
905 call messages_write(' Clock frequency :')
906 call messages_write(val)
907 call messages_write(' MHz')
908 call messages_new_line()
909#endif
910
911 call messages_write(' Device memory :')
912 call messages_write(accel%global_memory_size, units=unit_megabytes)
913 call messages_new_line()
914
915 call messages_write(' Local/shared memory :')
916 call messages_write(accel%local_memory_size, units=unit_kilobytes)
917 call messages_new_line()
918
919
920#ifdef HAVE_OPENCL
921 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_mem_alloc_size, val, cl_status)
922 call messages_write(' Max alloc size :')
923 call messages_write(val, units = unit_megabytes)
924 call messages_new_line()
925
926 call clgetdeviceinfo(accel%device%cl_device, cl_device_global_mem_cache_size, val, cl_status)
927 call messages_write(' Device cache :')
928 call messages_write(val, units = unit_kilobytes)
929 call messages_new_line()
930
931 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_constant_buffer_size, val, cl_status)
932 call messages_write(' Constant memory :')
933 call messages_write(val, units = unit_kilobytes)
934 call messages_new_line()
935#endif
936
937 call messages_write(' Max. group/block size :')
938 call messages_write(accel%max_workgroup_size)
939 call messages_new_line()
940
941
942#ifdef HAVE_OPENCL
943 call messages_write(' Extension cl_khr_fp64 :')
944 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64"))
945 call messages_new_line()
946
947 call messages_write(' Extension cl_amd_fp64 :')
948 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64"))
949 call messages_new_line()
950
951 call messages_write(' Extension cl_khr_int64_base_atomics :')
952 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_int64_base_atomics"))
953 call messages_new_line()
954
955#endif
956
957 call messages_info()
958
959
960 pop_sub(accel_init.device_info)
961 end subroutine device_info
962
963 end subroutine accel_init
964
965 ! ------------------------------------------
966#ifdef HAVE_OPENCL
967 integer function get_platform_id(platform_name) result(platform_id)
968 character(len=*), intent(in) :: platform_name
969
970 platform_id = cl_plat_invalid
971 if (index(platform_name, 'AMD') > 0) platform_id = cl_plat_amd
972 if (index(platform_name, 'ATI') > 0) platform_id = cl_plat_ati
973 if (index(platform_name, 'NVIDIA') > 0) platform_id = cl_plat_nvidia
974 if (index(platform_name, 'Intel') > 0) platform_id = cl_plat_intel
975 end function get_platform_id
976#endif
977 ! ------------------------------------------
978
979 subroutine accel_end(namespace)
980 type(namespace_t), intent(in) :: namespace
981
982#ifdef HAVE_OPENCL
983 integer :: ierr
984#endif
985 integer(int64) :: hits, misses
986 real(real64) :: volume_hits, volume_misses
987 logical :: found
988 type(accel_mem_t) :: tmp
989
990 push_sub(accel_end)
991
992 if (accel_is_enabled()) then
993
994 do
995 call alloc_cache_get(memcache, alloc_cache_any_size, found, tmp%mem)
996 if (.not. found) exit
997
998#ifdef HAVE_OPENCL
999 call clreleasememobject(tmp%mem, ierr)
1000 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseMemObject")
1001#endif
1002#ifdef HAVE_CUDA
1003 call cuda_mem_free(tmp%mem)
1004#endif
1005 end do
1006
1007 call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)
1008
1009 call messages_print_with_emphasis(msg="Acceleration-device allocation cache", namespace=namespace)
1010
1011 call messages_new_line()
1012 call messages_write(' Number of allocations =')
1013 call messages_write(hits + misses, new_line = .true.)
1014 call messages_write(' Volume of allocations =')
1015 call messages_write(volume_hits + volume_misses, fmt = 'f18.1', units = unit_gigabytes, align_left = .true., &
1016 new_line = .true.)
1017 call messages_write(' Hit ratio =')
1018 if (hits + misses > 0) then
1019 call messages_write(hits/real(hits + misses, real64)*100, fmt='(f6.1)', align_left = .true.)
1020 else
1021 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
1022 end if
1023 call messages_write('%', new_line = .true.)
1024 call messages_write(' Volume hit ratio =')
1025 if (volume_hits + volume_misses > 0) then
1026 call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt='(f6.1)', align_left = .true.)
1027 else
1028 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
1029 end if
1030 call messages_write('%')
1031 call messages_new_line()
1032 call messages_info()
1033
1034 call messages_print_with_emphasis(namespace=namespace)
1035 end if
1036
1038
1039#ifdef HAVE_CLBLAS
1040 call clblasteardown()
1041#endif
1042
1043#ifdef HAVE_CLFFT
1044 call clfftteardown()
1045#endif
1046
1047 if (accel_is_enabled()) then
1048#ifdef HAVE_CUDA
1049 call cublas_end(accel%cublas_handle)
1050 if (.not. accel%cuda_mpi) then ! CUDA aware MPI finalize will do the cleanup
1051 call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
1052 end if
1053#endif
1054
1055#ifdef HAVE_OPENCL
1056 call clreleasecommandqueue(accel%command_queue, ierr)
1058 if (ierr /= cl_success) call opencl_print_error(ierr, "ReleaseCommandQueue")
1059 call clreleasecontext(accel%context%cl_context, cl_status)
1060#endif
1061
1062 if (buffer_alloc_count /= 0) then
1063 call messages_write('Accel:')
1064 call messages_write(real(allocated_mem, real64) , fmt = 'f12.1', units = unit_megabytes, align_left = .true.)
1065 call messages_write(' in ')
1066 call messages_write(buffer_alloc_count)
1067 call messages_write(' buffers were not deallocated.')
1068 call messages_fatal()
1069 end if
1070
1071 end if
1072
1073 pop_sub(accel_end)
1074 end subroutine accel_end
1075
1076 ! ------------------------------------------
1077
1078 integer(int64) function accel_padded_size_i8(nn) result(psize)
1079 integer(int64), intent(in) :: nn
1080
1081 integer(int64) :: modnn, bsize
1082
1083 psize = nn
1084
1085 if (accel_is_enabled()) then
1086
1087 bsize = accel_max_workgroup_size()
1088
1089 psize = nn
1090 modnn = mod(nn, bsize)
1091 if (modnn /= 0) psize = psize + bsize - modnn
1092
1093 end if
1094
1095 end function accel_padded_size_i8
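  ! For example, with a maximum workgroup size of 256, a request of nn = 1000 is padded
  ! to 1024: mod(1000, 256) = 232, so 256 - 232 = 24 extra elements make the result an
  ! exact multiple of the workgroup size.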
1096
1097 ! ------------------------------------------
1098
1099 integer(int32) function accel_padded_size_i4(nn) result(psize)
1100 integer(int32), intent(in) :: nn
1101
1102 psize = int(accel_padded_size_i8(int(nn, int64)), int32)
1103
1104 end function accel_padded_size_i4
1105
1106 ! ------------------------------------------
1107
1108 subroutine accel_create_buffer_4(this, flags, type, size, set_zero)
1109 type(accel_mem_t), intent(inout) :: this
1110 integer, intent(in) :: flags
1111 type(type_t), intent(in) :: type
1112 integer, intent(in) :: size
1113 logical, optional, intent(in) :: set_zero
1114
1115 call accel_create_buffer_8(this, flags, type, int(size, int64), set_zero)
1116 end subroutine accel_create_buffer_4
1117
1118 ! ------------------------------------------
1119
1120 subroutine accel_create_buffer_8(this, flags, type, size, set_zero)
1121 type(accel_mem_t), intent(inout) :: this
1122 integer, intent(in) :: flags
1123 type(type_t), intent(in) :: type
1124 integer(int64), intent(in) :: size
1125 logical, optional, intent(in) :: set_zero
1126
1127 integer(int64) :: fsize
1128 logical :: found
1129#ifdef HAVE_OPENCL
1130 integer :: ierr
1131#endif
1132
1133 push_sub(accel_create_buffer_8)
1134
1135 this%type = type
1136 this%size = size
1137 this%flags = flags
1138 fsize = int(size, int64)*types_get_size(type)
1139 this%allocated = .true.
1140
1141 if (fsize > 0) then
1142
1143 call alloc_cache_get(memcache, fsize, found, this%mem)
1144
1145 if (.not. found) then
1146#ifdef HAVE_OPENCL
1147 this%mem = clcreatebuffer(accel%context%cl_context, flags, fsize, ierr)
1148 if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateBuffer")
1149#endif
1150#ifdef HAVE_CUDA
1151 call cuda_mem_alloc(this%mem, fsize)
1152#endif
1153 end if
1154
1155 buffer_alloc_count = buffer_alloc_count + 1
1156 allocated_mem = allocated_mem + fsize
1157
1158 end if
1159
1160 if (optional_default(set_zero, accel%initialize_buffers)) then
1161 call accel_set_buffer_to_zero_i8(this, type, size)
1162 end if
1163
1164 pop_sub(accel_create_buffer_8)
1165 end subroutine accel_create_buffer_8
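  ! Typical buffer lifecycle (a sketch; the generic names accel_create_buffer,
  ! accel_write_buffer, accel_read_buffer and accel_release_buffer are assumed to be
  ! the public interfaces declared above, and TYPE_FLOAT, np, host_array and buf are
  ! placeholders):
  !
  !   call accel_create_buffer(buf, ACCEL_MEM_READ_WRITE, TYPE_FLOAT, np)
  !   call accel_write_buffer(buf, np, host_array)
  !   ! ... launch kernels that operate on buf ...
  !   call accel_read_buffer(buf, np, host_array)
  !   call accel_release_buffer(buf)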
1166
1167 ! ------------------------------------------
1168
1169 subroutine accel_release_buffer(this)
1170 type(accel_mem_t), intent(inout) :: this
1171
1172#ifdef HAVE_OPENCL
1173 integer :: ierr
1174#endif
1175 logical :: put
1176 integer(int64) :: fsize
1178 push_sub(accel_release_buffer)
1179
1180 if (this%size > 0) then
1181
1182 fsize = int(this%size, int64)*types_get_size(this%type)
1183
1184 call alloc_cache_put(memcache, fsize, this%mem, put)
1185
1186 if (.not. put) then
1187#ifdef HAVE_OPENCL
1188 call clreleasememobject(this%mem, ierr)
1189 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseMemObject")
1190#endif
1191#ifdef HAVE_CUDA
1192 call cuda_mem_free(this%mem)
1193#endif
1194 end if
1195
1196 buffer_alloc_count = buffer_alloc_count - 1
1197 allocated_mem = allocated_mem - fsize
1199 end if
1200
1201 this%size = 0
1202 this%flags = 0
1203
1204 this%allocated = .false.
1205
1206 pop_sub(accel_release_buffer)
1207 end subroutine accel_release_buffer
1208
1209 ! ------------------------------------------
1210
1211 logical pure function accel_buffer_is_allocated(this) result(allocated)
1212 type(accel_mem_t), intent(in) :: this
1213
1214 allocated = this%allocated
1215 end function accel_buffer_is_allocated
1216
1217 ! -----------------------------------------
1218
1219 subroutine accel_finish()
1220#ifdef HAVE_OPENCL
1221 integer :: ierr
1222#endif
1223
1224 ! no push_sub, called too frequently
1225
1226 if (accel_is_enabled()) then
1227#ifdef HAVE_OPENCL
1228 call clfinish(accel%command_queue, ierr)
1229 if (ierr /= cl_success) call opencl_print_error(ierr, 'clFinish')
1230#endif
1231#ifdef HAVE_CUDA
1233#endif
1234 end if
1235 end subroutine accel_finish
1236
1237 ! ------------------------------------------
1238
1239 subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
1240 type(accel_kernel_t), intent(inout) :: kernel
1241 integer, intent(in) :: narg
1242 type(accel_mem_t), intent(in) :: buffer
1243
1244#ifdef HAVE_OPENCL
1245 integer :: ierr
1246#endif
1248 assert(accel_buffer_is_allocated(buffer))
1249
1250 ! no push_sub, called too frequently
1251#ifdef HAVE_OPENCL
1252 call clsetkernelarg(kernel%kernel, narg, buffer%mem, ierr)
1253 if (ierr /= cl_success) call opencl_print_error(ierr, "clSetKernelArg_buf")
1254#endif
1255
1256#ifdef HAVE_CUDA
1257 call cuda_kernel_set_arg_buffer(kernel%arguments, buffer%mem, narg)
1258#endif
1259
1260 end subroutine accel_set_kernel_arg_buffer
1261
1262 ! ------------------------------------------
1263
1264 subroutine accel_set_kernel_arg_local(kernel, narg, type, size)
1265 type(accel_kernel_t), intent(inout) :: kernel
1266 integer, intent(in) :: narg
1267 type(type_t), intent(in) :: type
1268 integer, intent(in) :: size
1269
1270#ifdef HAVE_OPENCL
1271 integer :: ierr
1272#endif
1273 integer(int64) :: size_in_bytes
1274
1276
1277
1278 size_in_bytes = int(size, int64)*types_get_size(type)
1279
1280 if (size_in_bytes > accel%local_memory_size) then
1281 write(message(1), '(a,f12.6,a)') "CL Error: requested local memory: ", real(size_in_bytes, real64) /1024.0, " Kb"
1282 write(message(2), '(a,f12.6,a)') " available local memory: ", real(accel%local_memory_size, real64) /1024.0, " Kb"
1283 call messages_fatal(2)
1284 else if (size_in_bytes <= 0) then
1285 write(message(1), '(a,i10)') "CL Error: invalid local memory size: ", size_in_bytes
1286 call messages_fatal(1)
1287 end if
1288
1289#ifdef HAVE_CUDA
1290 kernel%cuda_shared_mem = size_in_bytes
1291#endif
1292
1293#ifdef HAVE_OPENCL
1294 call clsetkernelarglocal(kernel%kernel, narg, size_in_bytes, ierr)
1295 if (ierr /= cl_success) call opencl_print_error(ierr, "set_kernel_arg_local")
1296#endif
1299 end subroutine accel_set_kernel_arg_local
1300
1301 ! ------------------------------------------
1302
1303 subroutine accel_kernel_run_8(kernel, globalsizes, localsizes)
1304 type(accel_kernel_t), intent(inout) :: kernel
1305 integer(int64), intent(in) :: globalsizes(:)
1306 integer(int64), intent(in) :: localsizes(:)
1307
1308 integer :: dim
1309#ifdef HAVE_OPENCL
1310 integer :: ierr
1311#endif
1312 integer(int64) :: gsizes(1:3)
1313 integer(int64) :: lsizes(1:3)
1314
1315 ! no push_sub, called too frequently
1316
1317 ! cuda needs all dimensions
1318 gsizes = 1
1319 lsizes = 1
1320
1321 dim = ubound(globalsizes, dim=1)
1322
1323 assert(dim == ubound(localsizes, dim=1))
1324
1325 ! if one size is zero, there is nothing to do
1326 if (any(globalsizes == 0)) return
1327
1328 assert(all(localsizes > 0))
1329 assert(all(localsizes <= accel_max_workgroup_size()))
1330 assert(all(mod(globalsizes, localsizes) == 0))
1331
1332 gsizes(1:dim) = globalsizes(1:dim)
1333 lsizes(1:dim) = localsizes(1:dim)
1334
1335#ifdef HAVE_OPENCL
1336 call clenqueuendrangekernel(accel%command_queue, kernel%kernel, gsizes(1:dim), lsizes(1:dim), ierr)
1337 if (ierr /= cl_success) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
1338#endif
1339
1340#ifdef HAVE_CUDA
1341 ! Maximum dimension of a block
1342 if (any(lsizes(1:3) > accel%max_block_dim(1:3))) then
1343 message(1) = "Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
1344 message(2) = "The following conditions should be fulfilled:"
1345 write(message(3), "(A, I8, A, I8)") "Dim 1: ", lsizes(1), " <= ", accel%max_block_dim(1)
1346 write(message(4), "(A, I8, A, I8)") "Dim 2: ", lsizes(2), " <= ", accel%max_block_dim(2)
1347 write(message(5), "(A, I8, A, I8)") "Dim 3: ", lsizes(3), " <= ", accel%max_block_dim(3)
1348 message(6) = "This is an internal error, please contact the developers."
1349 call messages_fatal(6)
1350 end if
1351
1352
1353 ! Maximum number of threads per block
1354 if (product(lsizes) > accel_max_workgroup_size()) then
1355 message(1) = "Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
1356 message(2) = "The following condition should be fulfilled:"
1357 write(message(3), "(I8, A, I8)") product(lsizes), " <= ", accel_max_workgroup_size()
1358 message(4) = "This is an internal error, please contact the developers."
1359 call messages_fatal(4)
1360 end if
1361
1362 gsizes(1:3) = gsizes(1:3)/lsizes(1:3)
1363
1364 ! Maximum dimensions of the grid of thread block
1365 if (any(gsizes(1:3) > accel%max_grid_dim(1:3))) then
1366 message(1) = "Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
1367 message(2) = "The following conditions should be fulfilled:"
1368 write(message(3), "(A, I8, A, I10)") "Dim 1: ", gsizes(1), " <= ", accel%max_grid_dim(1)
1369 write(message(4), "(A, I8, A, I10)") "Dim 2: ", gsizes(2), " <= ", accel%max_grid_dim(2)
1370 write(message(5), "(A, I8, A, I10)") "Dim 3: ", gsizes(3), " <= ", accel%max_grid_dim(3)
1371 message(6) = "This is an internal error, please contact the developers."
1372 call messages_fatal(6)
1373 end if
1374
1375 call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), lsizes(1), kernel%cuda_shared_mem, kernel%arguments)
1376
1377 kernel%cuda_shared_mem = 0
1378#endif
1379
1380 end subroutine accel_kernel_run_8
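  ! For example, globalsizes = (/1024, 8, 1/) with localsizes = (/256, 1, 1/) results,
  ! for CUDA, in a grid of 4 x 8 x 1 blocks of 256 x 1 x 1 threads each; for OpenCL the
  ! same global and local sizes are passed directly to clEnqueueNDRangeKernel.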
1382 ! -----------------------------------------------
1383
1384 subroutine accel_kernel_run_4(kernel, globalsizes, localsizes)
1385 type(accel_kernel_t), intent(inout) :: kernel
1386 integer, intent(in) :: globalsizes(:)
1387 integer, intent(in) :: localsizes(:)
1388
1389 call accel_kernel_run_8(kernel, int(globalsizes, int64), int(localsizes, int64))
1390
1391 end subroutine accel_kernel_run_4
1392
1393 ! -----------------------------------------------
1394
1395 integer pure function accel_max_workgroup_size() result(max_workgroup_size)
1396 max_workgroup_size = accel%max_workgroup_size
1397 end function accel_max_workgroup_size
1398
1399 ! -----------------------------------------------
1400
1401 integer function accel_kernel_workgroup_size(kernel) result(workgroup_size)
1402 type(accel_kernel_t), intent(inout) :: kernel
1403
1404#ifdef HAVE_OPENCL
1405 integer(int64) :: workgroup_size8
1406 integer :: ierr
1407#endif
1408#ifdef HAVE_CUDA
1409 integer :: max_workgroup_size
1410#endif
1411
1412 workgroup_size = 0
1413
1414#ifdef HAVE_OPENCL
1415 call clgetkernelworkgroupinfo(kernel%kernel, accel%device%cl_device, cl_kernel_work_group_size, workgroup_size8, ierr)
1416 if (ierr /= cl_success) call opencl_print_error(ierr, "clGetKernelWorkGroupInfo")
1417 workgroup_size = workgroup_size8
1418#endif
1419
1420#ifdef HAVE_CUDA
1421 call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_workgroup_size)
1422 if (debug%info .and. max_workgroup_size /= accel%max_workgroup_size) then
1423 write(message(1), "(A, I5, A, I5, A)") "This kernel can only use fewer threads per block (", max_workgroup_size, &
1424 ") than available on the device (", accel%max_workgroup_size, ")"
1425 call messages_info(1)
1426 end if
1427 ! recommended number of threads per block is 256 according to the CUDA best practice guide
1428 ! see https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#thread-and-block-heuristics
1429 workgroup_size = 256
1430 ! make sure we do not use more threads per block than available for this kernel
1431 workgroup_size = min(workgroup_size, max_workgroup_size)
1432#endif
1433
1434 end function accel_kernel_workgroup_size
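  ! A typical launch pads the global size so that it is divisible by the chosen
  ! workgroup size (a sketch; 'kernel', 'np' and 'wgsize' are placeholders, and the
  ! padded size is assumed to be a multiple of wgsize):
  !
  !   wgsize = accel_kernel_workgroup_size(kernel)
  !   call accel_kernel_run(kernel, (/accel_padded_size(np)/), (/wgsize/))
  !   call accel_finish()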
1435
1436 ! -----------------------------------------------
1437
1438#ifdef HAVE_OPENCL
1439 subroutine opencl_build_program(prog, filename, flags)
1440 type(cl_program), intent(inout) :: prog
1441 character(len=*), intent(in) :: filename
1442 character(len=*), optional, intent(in) :: flags
1443
1444 character(len = 1000) :: string
1445 character(len = 256) :: share_string
1446 integer :: ierr, ierrlog, iunit, irec, newlen
1447
1448 push_sub(opencl_build_program)
1449
1450 string = '#include "'//trim(filename)//'"'
1451
1452 call messages_write("Building CL program '"//trim(filename)//"'.")
1453 call messages_info(debug_only=.true.)
1454
1455 prog = clcreateprogramwithsource(accel%context%cl_context, trim(string), ierr)
1456 if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateProgramWithSource")
1457
1458 ! build the compilation flags
1459 string='-w'
1460 ! full optimization
1461 string=trim(string)//' -cl-denorms-are-zero'
1462 ! The following flag gives an error with the Xeon Phi
1463 ! string=trim(string)//' -cl-strict-aliasing'
1464 string=trim(string)//' -cl-mad-enable'
1465 string=trim(string)//' -cl-unsafe-math-optimizations'
1466 string=trim(string)//' -cl-finite-math-only'
1467 string=trim(string)//' -cl-fast-relaxed-math'
1468
1469 share_string='-I'//trim(conf%share)//'/opencl/'
1470
1471 if (f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64")) then
1472 string = trim(string)//' -DEXT_KHR_FP64'
1473 else if (f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64")) then
1474 string = trim(string)//' -DEXT_AMD_FP64'
1475 else
1476 call messages_write('Octopus requires an OpenCL device with double-precision support.')
1477 call messages_fatal()
1478 end if
1480 if (accel_use_shared_mem()) then
1481 string = trim(string)//' -DSHARED_MEM'
1482 end if
1483
1484 if (present(flags)) then
1485 string = trim(string)//' '//trim(flags)
1486 end if
1487
1488 call messages_write("Debug info: compilation flags '"//trim(string), new_line = .true.)
1489 call messages_write(' '//trim(share_string)//"'.")
1490 call messages_info(debug_only=.true.)
1491
1492 string = trim(string)//' '//trim(share_string)
1493
1494 call clbuildprogram(prog, trim(string), ierr)
1495
1496 if(ierr /= cl_success) then
1497 call clgetprogrambuildinfo(prog, accel%device%cl_device, cl_program_build_log, string, ierrlog)
1498 if (ierrlog /= cl_success) call opencl_print_error(ierrlog, "clGetProgramBuildInfo")
1499
1500 ! CL_PROGRAM_BUILD_LOG seems to have a useless '\n' in it
1501 newlen = scan(string, achar(010), back = .true.) - 1
1502 if (newlen >= 0) string = string(1:newlen)
1503
1504 if (len(trim(string)) > 0) write(stderr, '(a)') trim(string)
1505
1506 call opencl_print_error(ierr, "clBuildProgram")
1507 end if
1508
1509 pop_sub(opencl_build_program)
1510 end subroutine opencl_build_program
1511#endif
1512
1513 ! -----------------------------------------------
1514#ifdef HAVE_OPENCL
1515 subroutine opencl_release_program(prog)
1516 type(cl_program), intent(inout) :: prog
1518 integer :: ierr
1519
1520 push_sub(opencl_release_program)
1521
1522 call clreleaseprogram(prog, ierr)
1523 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseProgram")
1524
1525 pop_sub(opencl_release_program)
1526 end subroutine opencl_release_program
1527#endif
1528
1529 ! -----------------------------------------------
1530
1531#ifdef HAVE_OPENCL
1532 subroutine opencl_release_kernel(prog)
1533 type(cl_kernel), intent(inout) :: prog
1534
1535 integer :: ierr
1536
1537 push_sub(opencl_release_kernel)
1538
1539#ifdef HAVE_OPENCL
1540 call clreleasekernel(prog, ierr)
1541 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseKernel")
1542#endif
1543
1544 pop_sub(opencl_release_kernel)
1545 end subroutine opencl_release_kernel
1546#endif
1547
1548#ifdef HAVE_OPENCL
1549 ! -----------------------------------------------
1550 subroutine opencl_create_kernel(kernel, prog, name)
1551 type(cl_kernel), intent(inout) :: kernel
1552 type(cl_program), intent(inout) :: prog
1553 character(len=*), intent(in) :: name
1554
1555 integer :: ierr
1556
1557 push_sub(opencl_create_kernel)
1558 call profiling_in("CL_BUILD_KERNEL", exclude = .true.)
1559
1560#ifdef HAVE_OPENCL
1561 kernel = clcreatekernel(prog, name, ierr)
1562 if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateKernel")
1563#endif
1564
1565 call profiling_out("CL_BUILD_KERNEL")
1566 pop_sub(opencl_create_kernel)
1567 end subroutine opencl_create_kernel
1568#endif
1569
1570 ! ------------------------------------------------
1571#ifdef HAVE_OPENCL
1572 subroutine opencl_print_error(ierr, name)
1573 integer, intent(in) :: ierr
1574 character(len=*), intent(in) :: name
1575
1576 character(len=40) :: errcode
1577
1578 push_sub(opencl_print_error)
1579
1580 select case (ierr)
1581 case (cl_success); errcode = 'CL_SUCCESS '
1582 case (cl_device_not_found); errcode = 'CL_DEVICE_NOT_FOUND '
1583 case (cl_device_not_available); errcode = 'CL_DEVICE_NOT_AVAILABLE '
1584 case (cl_compiler_not_available); errcode = 'CL_COMPILER_NOT_AVAILABLE '
1585 case (cl_mem_object_allocation_failure); errcode = 'CL_MEM_OBJECT_ALLOCATION_FAILURE '
1586 case (cl_out_of_resources); errcode = 'CL_OUT_OF_RESOURCES '
1587 case (cl_out_of_host_memory); errcode = 'CL_OUT_OF_HOST_MEMORY '
1588 case (cl_profiling_info_not_available); errcode = 'CL_PROFILING_INFO_NOT_AVAILABLE '
1589 case (cl_mem_copy_overlap); errcode = 'CL_MEM_COPY_OVERLAP '
1590 case (cl_image_format_mismatch); errcode = 'CL_IMAGE_FORMAT_MISMATCH '
1591 case (cl_image_format_not_supported); errcode = 'CL_IMAGE_FORMAT_NOT_SUPPORTED '
1592 case (cl_build_program_failure); errcode = 'CL_BUILD_PROGRAM_FAILURE '
1593 case (cl_map_failure); errcode = 'CL_MAP_FAILURE '
1594 case (cl_invalid_value); errcode = 'CL_INVALID_VALUE '
1595 case (cl_invalid_device_type); errcode = 'CL_INVALID_DEVICE_TYPE '
1596 case (cl_invalid_platform); errcode = 'CL_INVALID_PLATFORM '
1597 case (cl_invalid_device); errcode = 'CL_INVALID_DEVICE '
1598 case (cl_invalid_context); errcode = 'CL_INVALID_CONTEXT '
1599 case (cl_invalid_queue_properties); errcode = 'CL_INVALID_QUEUE_PROPERTIES '
1600 case (cl_invalid_command_queue); errcode = 'CL_INVALID_COMMAND_QUEUE '
1601 case (cl_invalid_host_ptr); errcode = 'CL_INVALID_HOST_PTR '
1602 case (cl_invalid_mem_object); errcode = 'CL_INVALID_MEM_OBJECT '
1603 case (cl_invalid_image_format_descriptor); errcode = 'CL_INVALID_IMAGE_FORMAT_DESCRIPTOR '
1604 case (cl_invalid_image_size); errcode = 'CL_INVALID_IMAGE_SIZE '
1605 case (cl_invalid_sampler); errcode = 'CL_INVALID_SAMPLER '
1606 case (cl_invalid_binary); errcode = 'CL_INVALID_BINARY '
1607 case (cl_invalid_build_options); errcode = 'CL_INVALID_BUILD_OPTIONS '
1608 case (cl_invalid_program); errcode = 'CL_INVALID_PROGRAM '
1609 case (cl_invalid_program_executable); errcode = 'CL_INVALID_PROGRAM_EXECUTABLE '
1610 case (cl_invalid_kernel_name); errcode = 'CL_INVALID_KERNEL_NAME '
1611 case (cl_invalid_kernel_definition); errcode = 'CL_INVALID_KERNEL_DEFINITION '
1612 case (cl_invalid_kernel); errcode = 'CL_INVALID_KERNEL '
1613 case (cl_invalid_arg_index); errcode = 'CL_INVALID_ARG_INDEX '
1614 case (cl_invalid_arg_value); errcode = 'CL_INVALID_ARG_VALUE '
1615 case (cl_invalid_arg_size); errcode = 'CL_INVALID_ARG_SIZE '
1616 case (cl_invalid_kernel_args); errcode = 'CL_INVALID_KERNEL_ARGS '
1617 case (cl_invalid_work_dimension); errcode = 'CL_INVALID_WORK_DIMENSION '
1618 case (cl_invalid_work_group_size); errcode = 'CL_INVALID_WORK_GROUP_SIZE '
1619 case (cl_invalid_work_item_size); errcode = 'CL_INVALID_WORK_ITEM_SIZE '
1620 case (cl_invalid_global_offset); errcode = 'CL_INVALID_GLOBAL_OFFSET '
1621 case (cl_invalid_event_wait_list); errcode = 'CL_INVALID_EVENT_WAIT_LIST '
1622 case (cl_invalid_event); errcode = 'CL_INVALID_EVENT '
1623 case (cl_invalid_operation); errcode = 'CL_INVALID_OPERATION '
1624 case (cl_invalid_gl_object); errcode = 'CL_INVALID_GL_OBJECT '
1625 case (cl_invalid_buffer_size); errcode = 'CL_INVALID_BUFFER_SIZE '
1626 case (cl_invalid_mip_level); errcode = 'CL_INVALID_MIP_LEVEL '
1627 case (cl_invalid_global_work_size); errcode = 'CL_INVALID_GLOBAL_WORK_SIZE '
1628 case (cl_platform_not_found_khr); errcode = 'CL_PLATFORM_NOT_FOUND_KHR'
1629 case default
1630 write(errcode, '(i10)') ierr
1631 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1632 end select
1633
1634 message(1) = 'OpenCL '//trim(name)//' '//trim(errcode)
1635 call messages_fatal(1)
1636
1637 pop_sub(opencl_print_error)
1638 end subroutine opencl_print_error
1639#endif
1640
1641 ! ----------------------------------------------------
1642
1643 subroutine clblas_print_error(ierr, name)
1644 integer, intent(in) :: ierr
1645 character(len=*), intent(in) :: name
1646
1647 character(len=40) :: errcode
1648
1649 push_sub(clblas_print_error)
1650#if defined(HAVE_CLBLAS) || defined(HAVE_CLBLAST)
1651 select case (ierr)
1652 case (clblassuccess); errcode = 'clblasSuccess'
1653 case (clblasinvalidvalue); errcode = 'clblasInvalidValue'
1654 case (clblasinvalidcommandqueue); errcode = 'clblasInvalidCommandQueue'
1655 case (clblasinvalidcontext); errcode = 'clblasInvalidContext'
1656 case (clblasinvalidmemobject); errcode = 'clblasInvalidMemObject'
1657 case (clblasinvaliddevice); errcode = 'clblasInvalidDevice'
1658 case (clblasinvalideventwaitlist); errcode = 'clblasInvalidEventWaitList'
1659 case (clblasoutofresources); errcode = 'clblasOutOfResources'
1660 case (clblasoutofhostmemory); errcode = 'clblasOutOfHostMemory'
1661 case (clblasinvalidoperation); errcode = 'clblasInvalidOperation'
1662 case (clblascompilernotavailable); errcode = 'clblasCompilerNotAvailable'
1663 case (clblasbuildprogramfailure); errcode = 'clblasBuildProgramFailure'
1664 case (clblasnotimplemented); errcode = 'clblasNotImplemented'
1665 case (clblasnotinitialized); errcode = 'clblasNotInitialized'
1666 case (clblasinvalidmata); errcode = 'clblasInvalidMatA'
1667 case (clblasinvalidmatb); errcode = 'clblasInvalidMatB'
1668 case (clblasinvalidmatc); errcode = 'clblasInvalidMatC'
1669 case (clblasinvalidvecx); errcode = 'clblasInvalidVecX'
1670 case (clblasinvalidvecy); errcode = 'clblasInvalidVecY'
1671 case (clblasinvaliddim); errcode = 'clblasInvalidDim'
1672 case (clblasinvalidleaddima); errcode = 'clblasInvalidLeadDimA'
1673 case (clblasinvalidleaddimb); errcode = 'clblasInvalidLeadDimB'
1674 case (clblasinvalidleaddimc); errcode = 'clblasInvalidLeadDimC'
1675 case (clblasinvalidincx); errcode = 'clblasInvalidIncX'
1676 case (clblasinvalidincy); errcode = 'clblasInvalidIncY'
1677 case (clblasinsufficientmemmata); errcode = 'clblasInsufficientMemMatA'
1678 case (clblasinsufficientmemmatb); errcode = 'clblasInsufficientMemMatB'
1679 case (clblasinsufficientmemmatc); errcode = 'clblasInsufficientMemMatC'
1680 case (clblasinsufficientmemvecx); errcode = 'clblasInsufficientMemVecX'
1681 case (clblasinsufficientmemvecy); errcode = 'clblasInsufficientMemVecY'
1682#ifdef HAVE_CLBLAST
1683 case (clblastinsufficientmemorytemp); errcode = 'clblastInsufficientMemoryTemp'
1684 case (clblastinvalidbatchcount); errcode = 'clblastInvalidBatchCount'
1685 case (clblastinvalidoverridekernel); errcode = 'clblastInvalidOverrideKernel'
1686 case (clblastmissingoverrideparameter); errcode = 'clblastMissingOverrideParameter'
1687 case (clblastinvalidlocalmemusage); errcode = 'clblastInvalidLocalMemUsage'
1688 case (clblastnohalfprecision); errcode = 'clblastNoHalfPrecision'
1689 case (clblastnodoubleprecision); errcode = 'clblastNoDoublePrecision'
1690 case (clblastinvalidvectorscalar); errcode = 'clblastInvalidVectorScalar'
1691 case (clblastinsufficientmemoryscalar); errcode = 'clblastInsufficientMemoryScalar'
1692 case (clblastdatabaseerror); errcode = 'clblastDatabaseError'
1693 case (clblastunknownerror); errcode = 'clblastUnknownError'
1694 case (clblastunexpectederror); errcode = 'clblastUnexpectedError'
1695#endif
1696
1697 case default
1698 write(errcode, '(i10)') ierr
1699 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1700 end select
1701#endif
1702
1703 message(1) = 'Error in calling clblas routine '//trim(name)//' : '//trim(errcode)
1704 call messages_fatal(1)
1705
1706 pop_sub(clblas_print_error)
1707 end subroutine clblas_print_error
1708
1709 ! ----------------------------------------------------
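!> Descriptive comment added for clarity: translates a clFFT status code into its symbolic
!! name and aborts with a fatal error.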
1710 subroutine clfft_print_error(ierr, name)
1711 integer, intent(in) :: ierr
1712 character(len=*), intent(in) :: name
1713
1714 character(len=40) :: errcode
1715
1716 push_sub(clfft_print_error)
1717#ifdef HAVE_CLFFT
1718 select case (ierr)
1719 case (clfft_invalid_global_work_size); errcode = 'CLFFT_INVALID_GLOBAL_WORK_SIZE'
1720 case (clfft_invalid_mip_level); errcode = 'CLFFT_INVALID_MIP_LEVEL'
1721 case (clfft_invalid_buffer_size); errcode = 'CLFFT_INVALID_BUFFER_SIZE'
1722 case (clfft_invalid_gl_object); errcode = 'CLFFT_INVALID_GL_OBJECT'
1723 case (clfft_invalid_operation); errcode = 'CLFFT_INVALID_OPERATION'
1724 case (clfft_invalid_event); errcode = 'CLFFT_INVALID_EVENT'
1725 case (clfft_invalid_event_wait_list); errcode = 'CLFFT_INVALID_EVENT_WAIT_LIST'
1726 case (clfft_invalid_global_offset); errcode = 'CLFFT_INVALID_GLOBAL_OFFSET'
1727 case (clfft_invalid_work_item_size); errcode = 'CLFFT_INVALID_WORK_ITEM_SIZE'
1728 case (clfft_invalid_work_group_size); errcode = 'CLFFT_INVALID_WORK_GROUP_SIZE'
1729 case (clfft_invalid_work_dimension); errcode = 'CLFFT_INVALID_WORK_DIMENSION'
1730 case (clfft_invalid_kernel_args); errcode = 'CLFFT_INVALID_KERNEL_ARGS'
1731 case (clfft_invalid_arg_size); errcode = 'CLFFT_INVALID_ARG_SIZE'
1732 case (clfft_invalid_arg_value); errcode = 'CLFFT_INVALID_ARG_VALUE'
1733 case (clfft_invalid_arg_index); errcode = 'CLFFT_INVALID_ARG_INDEX'
1734 case (clfft_invalid_kernel); errcode = 'CLFFT_INVALID_KERNEL'
1735 case (clfft_invalid_kernel_definition); errcode = 'CLFFT_INVALID_KERNEL_DEFINITION'
1736 case (clfft_invalid_kernel_name); errcode = 'CLFFT_INVALID_KERNEL_NAME'
1737 case (clfft_invalid_program_executable); errcode = 'CLFFT_INVALID_PROGRAM_EXECUTABLE'
1738 case (clfft_invalid_program); errcode = 'CLFFT_INVALID_PROGRAM'
1739 case (clfft_invalid_build_options); errcode = 'CLFFT_INVALID_BUILD_OPTIONS'
1740 case (clfft_invalid_binary); errcode = 'CLFFT_INVALID_BINARY'
1741 case (clfft_invalid_sampler); errcode = 'CLFFT_INVALID_SAMPLER'
1742 case (clfft_invalid_image_size); errcode = 'CLFFT_INVALID_IMAGE_SIZE'
1743 case (clfft_invalid_image_format_descriptor); errcode = 'CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR'
1744 case (clfft_invalid_mem_object); errcode = 'CLFFT_INVALID_MEM_OBJECT'
1745 case (clfft_invalid_host_ptr); errcode = 'CLFFT_INVALID_HOST_PTR'
1746 case (clfft_invalid_command_queue); errcode = 'CLFFT_INVALID_COMMAND_QUEUE'
1747 case (clfft_invalid_queue_properties); errcode = 'CLFFT_INVALID_QUEUE_PROPERTIES'
1748 case (clfft_invalid_context); errcode = 'CLFFT_INVALID_CONTEXT'
1749 case (clfft_invalid_device); errcode = 'CLFFT_INVALID_DEVICE'
1750 case (clfft_invalid_platform); errcode = 'CLFFT_INVALID_PLATFORM'
1751 case (clfft_invalid_device_type); errcode = 'CLFFT_INVALID_DEVICE_TYPE'
1752 case (clfft_invalid_value); errcode = 'CLFFT_INVALID_VALUE'
1753 case (clfft_map_failure); errcode = 'CLFFT_MAP_FAILURE'
1754 case (clfft_build_program_failure); errcode = 'CLFFT_BUILD_PROGRAM_FAILURE'
1755 case (clfft_image_format_not_supported); errcode = 'CLFFT_IMAGE_FORMAT_NOT_SUPPORTED'
1756 case (clfft_image_format_mismatch); errcode = 'CLFFT_IMAGE_FORMAT_MISMATCH'
1757 case (clfft_mem_copy_overlap); errcode = 'CLFFT_MEM_COPY_OVERLAP'
1758 case (clfft_profiling_info_not_available); errcode = 'CLFFT_PROFILING_INFO_NOT_AVAILABLE'
1759 case (clfft_out_of_host_memory); errcode = 'CLFFT_OUT_OF_HOST_MEMORY'
1760 case (clfft_out_of_resources); errcode = 'CLFFT_OUT_OF_RESOURCES'
1761 case (clfft_mem_object_allocation_failure); errcode = 'CLFFT_MEM_OBJECT_ALLOCATION_FAILURE'
1762 case (clfft_compiler_not_available); errcode = 'CLFFT_COMPILER_NOT_AVAILABLE'
1763 case (clfft_device_not_available); errcode = 'CLFFT_DEVICE_NOT_AVAILABLE'
1764 case (clfft_device_not_found); errcode = 'CLFFT_DEVICE_NOT_FOUND'
1765 case (clfft_success); errcode = 'CLFFT_SUCCESS'
1766 case (clfft_bugcheck); errcode = 'CLFFT_BUGCHECK'
1767 case (clfft_notimplemented); errcode = 'CLFFT_NOTIMPLEMENTED'
1768 case (clfft_file_not_found); errcode = 'CLFFT_FILE_NOT_FOUND'
1769 case (clfft_file_create_failure); errcode = 'CLFFT_FILE_CREATE_FAILURE'
1770 case (clfft_version_mismatch); errcode = 'CLFFT_VERSION_MISMATCH'
1771 case (clfft_invalid_plan); errcode = 'CLFFT_INVALID_PLAN'
1772 case (clfft_device_no_double); errcode = 'CLFFT_DEVICE_NO_DOUBLE'
1773 case (clfft_endstatus); errcode = 'CLFFT_ENDSTATUS'
1774 case default
1775 write(errcode, '(i10)') ierr
1776 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1777 end select
1778#endif
1779
1780 message(1) = 'clfft '//trim(name)//' '//trim(errcode)
1781 call messages_fatal(1)
1782
1783 pop_sub(clfft_print_error)
1784 end subroutine clfft_print_error
1785
1786 ! ----------------------------------------------------
1787
1788#ifdef HAVE_OPENCL
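!> Descriptive comment added for clarity: checks whether the given OpenCL device advertises
!! the requested extension.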
1789 logical function f90_cl_device_has_extension(device, extension) result(has)
1790 type(cl_device_id), intent(inout) :: device
1791 character(len=*), intent(in) :: extension
1792
1793 integer :: cl_status
1794 character(len=2048) :: all_extensions
1795
1796#ifdef HAVE_OPENCL
1797 call clgetdeviceinfo(device, cl_device_extensions, all_extensions, cl_status)
1798#endif
1799
1800 has = index(all_extensions, extension) /= 0
1801
1802 end function f90_cl_device_has_extension
1803#endif
1804
1805 ! ----------------------------------------------------
1806
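!> Descriptive comment added for clarity: sets nval elements of a device buffer to zero,
!! optionally starting at an offset, by launching the set_zero or set_zero_int kernel;
!! complex values are treated as two reals.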
1807 subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
1808 type(accel_mem_t), intent(inout) :: buffer
1809 type(type_t), intent(in) :: type
1810 integer(int64), intent(in) :: nval
1811 integer(int64), optional, intent(in) :: offset
1812 logical, optional, intent(in) :: async
1813
1814 integer :: bsize
1815 integer(int64) :: nval_real, offset_real
1816 type(accel_kernel_t), pointer :: kernel
1817
1818 if (nval == 0) return
1819
1821
1822 nval_real = nval
1823 if (type == type_cmplx) nval_real = nval_real * 2
1824 if (present(offset)) then
1825 offset_real = offset
1826 if (type == type_cmplx) offset_real = offset_real * 2
1827 else
1828 offset_real = 0_int64
1829 end if
1830
1831 assert(nval_real > 0)
1832
1833 if (type == type_integer) then
1834 kernel => set_zero_int
1835 else
1836 kernel => set_zero
1837 end if
1838 call accel_set_kernel_arg(kernel, 0, nval_real)
1839 call accel_set_kernel_arg(kernel, 1, offset_real)
1840 call accel_set_kernel_arg(kernel, 2, buffer)
1841
1842 bsize = accel_kernel_workgroup_size(kernel)
1843
1844 call accel_kernel_run(kernel, (/ nval_real /), (/ 1_int64 /))
1845
1846 if (.not. optional_default(async, .false.)) call accel_finish()
1847
1849 end subroutine accel_set_buffer_to_zero_i8
1850
1851 ! ----------------------------------------------------
1852
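!> Descriptive comment added for clarity: 32-bit integer interface that forwards sizes and
!! offsets to the 64-bit version above.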
1853 subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
1854 type(accel_mem_t), intent(inout) :: buffer
1855 type(type_t), intent(in) :: type
1856 integer(int32), intent(in) :: nval
1857 integer(int32), optional, intent(in) :: offset
1858 logical, optional, intent(in) :: async
1859
1861
1862 if (present(offset)) then
1863 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), int(offset, int64), async=async)
1864 else
1865 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), async=async)
1866 end if
1869 end subroutine accel_set_buffer_to_zero_i4
1870
1871 ! ----------------------------------------------------
1872
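!> Descriptive comment added for clarity: benchmarks the bandwidth between main memory and
!! device memory by repeatedly writing and reading buffers of increasing size.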
1873 subroutine opencl_check_bandwidth()
1874 integer :: itime
1875 integer, parameter :: times = 10
1876 integer :: size
1877 real(real64) :: time, stime
1878 real(real64) :: read_bw, write_bw
1879 type(accel_mem_t) :: buff
1880 real(real64), allocatable :: data(:)
1881
1882 call messages_new_line()
1883 call messages_write('Info: Benchmarking the bandwidth between main memory and device memory')
1884 call messages_new_line()
1885 call messages_info()
1886
1887 call messages_write(' Buffer size Read bw Write bw')
1888 call messages_new_line()
1889 call messages_write(' [MiB] [MiB/s] [MiB/s]')
1890 call messages_info()
1891
1892 size = 15000
1893 do
1894 safe_allocate(data(1:size))
1895 call accel_create_buffer(buff, accel_mem_read_write, type_float, size)
1896
1897 stime = loct_clock()
1898 do itime = 1, times
1899 call accel_write_buffer(buff, size, data)
1900 call accel_finish()
1901 end do
1902 time = (loct_clock() - stime)/real(times, real64)
1903
1904 write_bw = real(size, real64) *8.0_real64/time
1905
1906 stime = loct_clock()
1907 do itime = 1, times
1908 call accel_read_buffer(buff, size, data)
1909 end do
1910 call accel_finish()
1911
1912 time = (loct_clock() - stime)/real(times, real64)
1913 read_bw = real(size, real64) *8.0_real64/time
1914
1915 call messages_write(size*8.0_real64/1024.0_real64**2)
1916 call messages_write(write_bw/1024.0_real64**2, fmt = '(f10.1)')
1917 call messages_write(read_bw/1024.0_real64**2, fmt = '(f10.1)')
1918 call messages_info()
1919
1920 call accel_release_buffer(buff)
1921
1922 safe_deallocate_a(data)
1923
1924 size = int(size*2.0)
1925
1926 if (size > 50000000) exit
1927 end do
1928 end subroutine opencl_check_bandwidth
1929
1930 ! ----------------------------------------------------
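!> Descriptive comment added for clarity: whether kernels should make use of shared (local)
!! device memory.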
1932 logical pure function accel_use_shared_mem() result(use_shared_mem)
1933
1934 use_shared_mem = accel%shared_mem
1935
1936 end function accel_use_shared_mem
1937
1938 !------------------------------------------------------------
1939
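!> Descriptive comment added for clarity: initializes the global list of cached kernels and
!! the CUDA module map.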
1940 subroutine accel_kernel_global_init()
1941
1942 push_sub(accel_kernel_global_init)
1943
1944 nullify(head)
1945
1946 call cuda_module_map_init(accel%module_map)
1947
1949 end subroutine accel_kernel_global_init
1950
1951 !------------------------------------------------------------
1952
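!> Descriptive comment added for clarity: finalizes every kernel registered in the global
!! list and releases the CUDA module map.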
1953 subroutine accel_kernel_global_end()
1954 type(accel_kernel_t), pointer :: next_head
1955
1956 push_sub(accel_kernel_global_end)
1957
1958 do
1959 if (.not. associated(head)) exit
1960 next_head => head%next
1961 call accel_kernel_end(head)
1962 head => next_head
1963 end do
1964
1965 if (accel_is_enabled()) then
1966 call cuda_module_map_end(accel%module_map)
1967 end if
1968
1970 end subroutine accel_kernel_global_end
1971
1972 !------------------------------------------------------------
1973
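!> Descriptive comment added for clarity: compiles a kernel from a source file in the
!! opencl/ share directory. With CUDA the file is built through the module map and a kernel
!! handle is created; with OpenCL the program is built, the kernel extracted, and the
!! program released.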
1974 subroutine accel_kernel_build(this, file_name, kernel_name, flags)
1975 type(accel_kernel_t), intent(inout) :: this
1976 character(len=*), intent(in) :: file_name
1977 character(len=*), intent(in) :: kernel_name
1978 character(len=*), optional, intent(in) :: flags
1979
1980#ifdef HAVE_OPENCL
1981 type(cl_program) :: prog
1982#endif
1983#ifdef HAVE_CUDA
1984 character(len=1000) :: all_flags
1985#endif
1986
1987 push_sub(accel_kernel_build)
1988
1989 call profiling_in("ACCEL_COMPILE", exclude = .true.)
1990
1991#ifdef HAVE_CUDA
1992 all_flags = '-I'//trim(conf%share)//'/opencl/'//" "//trim(accel%debug_flag)
1993
1994 if (accel_use_shared_mem()) then
1995 all_flags = trim(all_flags)//' -DSHARED_MEM'
1996 end if
1997
1998 if (present(flags)) then
1999 all_flags = trim(all_flags)//' '//trim(flags)
2000 end if
2001
2002 call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, trim(file_name), trim(all_flags))
2003
2004 call cuda_create_kernel(this%cuda_kernel, this%cuda_module, trim(kernel_name))
2005 call cuda_alloc_arg_array(this%arguments)
2006
2007 this%cuda_shared_mem = 0
2008#endif
2009
2010#ifdef HAVE_OPENCL
2011 call opencl_build_program(prog, trim(conf%share)//'/opencl/'//trim(file_name), flags = flags)
2012 call opencl_create_kernel(this%kernel, prog, trim(kernel_name))
2013 call opencl_release_program(prog)
2014#endif
2015
2016 this%initialized = .true.
2017 this%kernel_name = trim(kernel_name)
2019 call profiling_out("ACCEL_COMPILE")
2020
2021 pop_sub(accel_kernel_build)
2022 end subroutine accel_kernel_build
2023
2024 !------------------------------------------------------------
2025
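!> Descriptive comment added for clarity: releases the resources held by a single kernel
!! (argument array and kernel with CUDA, kernel object with OpenCL).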
2026 subroutine accel_kernel_end(this)
2027 type(accel_kernel_t), intent(inout) :: this
2028#ifdef HAVE_OPENCL
2029 integer :: ierr
2030#endif
2032 push_sub(accel_kernel_end)
2033
2034#ifdef HAVE_CUDA
2035 call cuda_free_arg_array(this%arguments)
2036 call cuda_release_kernel(this%cuda_kernel)
2037 ! modules are not released here, since they are not tied to a single kernel
2038#endif
2039
2040#ifdef HAVE_OPENCL
2041 call clreleasekernel(this%kernel, ierr)
2042 if (ierr /= cl_success) call opencl_print_error(ierr, "release_kernel")
2043#endif
2044 this%initialized = .false.
2045
2046 pop_sub(accel_kernel_end)
2047 end subroutine accel_kernel_end
2048
2049 !------------------------------------------------------------
2050
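!> Descriptive comment added for clarity: builds the kernel on its first use and prepends it
!! to the global kernel list.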
2051 subroutine accel_kernel_start_call(this, file_name, kernel_name, flags)
2052 type(accel_kernel_t), target, intent(inout) :: this
2053 character(len=*), intent(in) :: file_name
2054 character(len=*), intent(in) :: kernel_name
2055 character(len=*), optional, intent(in) :: flags
2056
2057 push_sub(accel_kernel_start_call)
2058
2059 if (.not. this%initialized) then
2060 call accel_kernel_build(this, file_name, kernel_name, flags)
2061 this%next => head
2062 head => this
2063 end if
2064
2066 end subroutine accel_kernel_start_call
2067
2068 !--------------------------------------------------------------
2069
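!> Descriptive comment added for clarity: global memory available on the accelerator device.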
2070 integer(int64) pure function accel_global_memory_size() result(size)
2071
2072 size = accel%global_memory_size
2073
2074 end function accel_global_memory_size
2075
2076 !--------------------------------------------------------------
2077
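!> Descriptive comment added for clarity: local (shared) memory available on the accelerator device.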
2078 integer(int64) pure function accel_local_memory_size() result(size)
2079
2080 size = accel%local_memory_size
2081
2082 end function accel_local_memory_size
2083
2084 !--------------------------------------------------------------
2085
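!> Descriptive comment added for clarity: maximum extent a kernel grid may have along the
!! given dimension.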
2086 integer pure function accel_max_size_per_dim(dim) result(size)
2087 integer, intent(in) :: dim
2088
2089 size = 0
2090#ifdef HAVE_OPENCL
2091 size = 32768 ! Dimensions other than the first are arbitrarily limited to 32768, since 2**30
2092 ! overflows to zero when multiplied by 2048 and converted to a 4-byte integer.
2093 if (dim == 1) size = 2**30
2094#endif
2095#ifdef HAVE_CUDA
2096 size = 32768
2097 if (dim == 1) size = 2**30
2098#endif
2099 end function accel_max_size_per_dim
2100
2101 ! ------------------------------------------------------
2102
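!> Descriptive comment added for clarity: selects the CUDA stream used for subsequent
!! operations and attaches it to the cuBLAS handle.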
2103 subroutine accel_set_stream(stream_number)
2104 integer, intent(in) :: stream_number
2105
2106 push_sub(accel_set_stream)
2107
2108 if (accel_is_enabled()) then
2109#ifdef HAVE_CUDA
2110 call cuda_set_stream(accel%cuda_stream, stream_number)
2111 call cublas_set_stream(accel%cublas_handle, accel%cuda_stream)
2112#endif
2113 end if
2114
2115 pop_sub(accel_set_stream)
2116 end subroutine accel_set_stream
2117
2118 ! ------------------------------------------------------
2119
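!> Descriptive comment added for clarity: returns the index of the CUDA stream currently in use.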
2120 subroutine accel_get_stream(stream_number)
2121 integer, intent(inout) :: stream_number
2122
2123 push_sub(accel_get_stream)
2124
2125 if (accel_is_enabled()) then
2126#ifdef HAVE_CUDA
2127 call cuda_get_stream(stream_number)
2128#endif
2129 end if
2130
2131 pop_sub(accel_get_stream)
2132 end subroutine accel_get_stream
2133
2134 ! ------------------------------------------------------
2135
2136 subroutine accel_synchronize_all_streams()
2138
2139 if (accel_is_enabled()) then
2140#ifdef HAVE_CUDA
2141 call cuda_synchronize_all_streams()
2142#endif
2143 end if
2144
2146 end subroutine accel_synchronize_all_streams
2147
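!> Descriptive comment added for clarity: returns a pointer into a device buffer shifted by
!! the given element offset; the complex version below doubles the offset to account for the
!! two reals per complex value.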
2148 function daccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
2149 type(c_ptr), intent(in) :: buffer
2150 integer(int64), intent(in) :: offset
2151 type(c_ptr) :: buffer_offset
2152
2154#ifdef HAVE_CUDA
2155 call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)
2156#else
2157 ! this is needed to make the compiler happy for non-GPU compilations
2158 buffer_offset = buffer
2159#endif
2161 end function daccel_get_pointer_with_offset
2162
2163 function zaccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
2164 type(c_ptr), intent(in) :: buffer
2165 integer(int64), intent(in) :: offset
2166 type(c_ptr) :: buffer_offset
2167
2169#ifdef HAVE_CUDA
2170 call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)
2171#else
2172 ! this is needed to make the compiler happy for non-GPU compilations
2173 buffer_offset = buffer
2174#endif
2176 end function zaccel_get_pointer_with_offset
2177
2178 subroutine accel_clean_pointer(buffer)
2179 type(c_ptr), intent(in) :: buffer
2180
2182#ifdef HAVE_CUDA
2183 call cuda_clean_pointer(buffer)
2184#endif
2185 pop_sub(accel_clean_pointer)
2186 end subroutine accel_clean_pointer
2187
2188 !> Get unfolded size: some kernels (e.g. projectors) unfold the array across warps
2189 !! as an optimization; the grid size becomes size*warp_size and the thread-block size
2190 !! is the warp size (except on AMD, where unfolding is not used).
2191 subroutine accel_get_unfolded_size(size, grid_size, thread_block_size)
2192 integer(int64), intent(in) :: size
2193 integer(int64), intent(out) :: grid_size
2194 integer(int64), intent(out) :: thread_block_size
2195
2196 push_sub(accel_get_unfolded_size)
2197#ifdef __HIP_PLATFORM_AMD__
2198 ! not beneficial for AMD chips
2199 grid_size = size
2200 thread_block_size = size
2201#else
2202 grid_size = size * accel%warp_size
2203 thread_block_size = accel%warp_size
2204#endif
2206 end subroutine accel_get_unfolded_size
2207
2208#include "undef.F90"
2209#include "real.F90"
2210#include "accel_inc.F90"
2211
2212#include "undef.F90"
2213#include "complex.F90"
2214#include "accel_inc.F90"
2215
2216#include "undef.F90"
2217#include "integer.F90"
2218#include "accel_inc.F90"
2219
2220#include "undef.F90"
2221#include "integer8.F90"
2222#include "accel_inc.F90"
2223
2224end module accel_oct_m
2225
2226!! Local Variables:
2227!! mode: f90
2228!! coding: utf-8
2229!! End: