Octopus
accel.F90
1!! Copyright (C) 2010-2016 X. Andrade
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17!!
18
19#include "global.h"
20
21#if defined(HAVE_OPENCL) && defined(HAVE_CUDA)
22#error "Cannot compile with OpenCL and Cuda support at the same time"
23#endif
24
25#if defined(HAVE_OPENCL) || defined(HAVE_CUDA)
26#define HAVE_ACCEL 1
27#endif
28
29module accel_oct_m
31#ifdef HAVE_OPENCL
32 use cl
33#endif
34#if defined(HAVE_CLBLAS) || defined(HAVE_CLBLAST)
35 use clblas_oct_m
36#endif
37 use cuda_oct_m
38#ifdef HAVE_CLFFT
39 use clfft
40#endif
41 use debug_oct_m
42 use global_oct_m
43 use iso_c_binding, only: c_size_t
44 use, intrinsic :: iso_fortran_env
45 use loct_oct_m
47 use mpi_oct_m
49 use types_oct_m
50 use parser_oct_m
53
54 implicit none
55
56 private
57
58 public :: &
63 accel_t, &
66 accel_init, &
67 accel_end, &
96
97#ifdef HAVE_OPENCL
98 integer, public, parameter :: &
99 ACCEL_MEM_READ_ONLY = cl_mem_read_only, &
100 accel_mem_read_write = cl_mem_read_write, &
101 accel_mem_write_only = cl_mem_write_only
102#else
103 integer, public, parameter :: &
104 ACCEL_MEM_READ_ONLY = 0, &
107#endif
108
109 type accel_context_t
110 ! Components are public by default
111#ifdef HAVE_OPENCL
112 type(cl_context) :: cl_context
113#elif defined(HAVE_CUDA)
114 type(c_ptr) :: cuda_context
115#else
116 integer :: dummy
117#endif
118 end type accel_context_t
119
120 type accel_device_t
121 ! Components are public by default
122#ifdef HAVE_OPENCL
123 type(cl_device_id) :: cl_device
124#elif defined(HAVE_CUDA)
125 type(c_ptr) :: cuda_device
126#else
127 integer :: dummy
128#endif
129 end type accel_device_t
130
131 type accel_t
132 ! Components are public by default
133 type(accel_context_t) :: context
134 type(accel_device_t) :: device
135#ifdef HAVE_OPENCL
136 type(cl_command_queue) :: command_queue
137#endif
138 type(c_ptr) :: cublas_handle
139 type(c_ptr) :: cuda_stream
140 type(c_ptr) :: module_map
141 integer :: max_workgroup_size
142 integer(int64) :: local_memory_size
143 integer(int64) :: global_memory_size
144 logical :: enabled
145 logical :: allow_CPU_only
146 logical :: shared_mem
147 logical :: cuda_mpi
148 integer :: warp_size
149 logical :: initialize_buffers
150 character(len=32) :: debug_flag
151 integer(int64) :: max_block_dim(3)
152 integer(int64) :: max_grid_dim(3)
153 end type accel_t
154
155 type accel_mem_t
156 ! Components are public by default
157#ifdef HAVE_OPENCL
158 type(cl_mem) :: mem
159#else
160 type(c_ptr) :: mem
161#endif
162 integer(c_size_t) :: size = 0
163 type(type_t) :: type
164 integer :: flags = 0
165 logical :: allocated = .false.
166 end type accel_mem_t
167
168 type accel_kernel_t
169 ! Components are public by default
170#ifdef HAVE_OPENCL
171 type(cl_kernel) :: kernel
172#endif
173#ifdef HAVE_CUDA
174 type(c_ptr) :: cuda_kernel
175 type(c_ptr) :: cuda_module
176 type(c_ptr) :: arguments
177#endif
178 integer(int64) :: cuda_shared_mem
179 logical :: initialized = .false.
180 type(accel_kernel_t), pointer :: next
181 integer :: arg_count
182 character(len=128) :: kernel_name
184
185 type(accel_t), public :: accel
186
187 ! the kernels
188 type(accel_kernel_t), public, target, save :: kernel_vpsi
189 type(accel_kernel_t), public, target, save :: kernel_vpsi_complex
190 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
191 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors_complex
192 type(accel_kernel_t), public, target, save :: kernel_daxpy
193 type(accel_kernel_t), public, target, save :: kernel_zaxpy
194 type(accel_kernel_t), public, target, save :: kernel_copy
195 type(accel_kernel_t), public, target, save :: dpack
196 type(accel_kernel_t), public, target, save :: zpack
197 type(accel_kernel_t), public, target, save :: dunpack
198 type(accel_kernel_t), public, target, save :: zunpack
199 type(accel_kernel_t), public, target, save :: kernel_ghost_reorder
200 type(accel_kernel_t), public, target, save :: kernel_density_real
201 type(accel_kernel_t), public, target, save :: kernel_density_complex
202 type(accel_kernel_t), public, target, save :: kernel_density_spinors
203 type(accel_kernel_t), public, target, save :: kernel_phase
204 type(accel_kernel_t), public, target, save :: kernel_phase_spiral
205 type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
206 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
207 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors
208 type(accel_kernel_t), public, target, save :: dkernel_batch_axpy
209 type(accel_kernel_t), public, target, save :: zkernel_batch_axpy
210 type(accel_kernel_t), public, target, save :: dkernel_ax_function_py
211 type(accel_kernel_t), public, target, save :: zkernel_ax_function_py
212 type(accel_kernel_t), public, target, save :: dkernel_batch_dotp
213 type(accel_kernel_t), public, target, save :: zkernel_batch_dotp
214 type(accel_kernel_t), public, target, save :: dzmul
215 type(accel_kernel_t), public, target, save :: zzmul
216 type(accel_kernel_t), public, target, save :: set_one
217
218 ! kernels used locally
220
223 end interface accel_padded_size
231 end interface accel_kernel_run
251 end interface accel_write_buffer
252
262 end interface accel_read_buffer
265 module procedure &
272 end interface accel_set_kernel_arg
290 integer, parameter :: &
297 integer, parameter :: &
302 cl_plat_intel = -5
303
304 ! a "convenience" public variable
305 integer, public :: cl_status
307 integer :: buffer_alloc_count
308 integer(int64) :: allocated_mem
309 type(accel_kernel_t), pointer :: head
310 type(alloc_cache_t) :: memcache
311
312contains
313
314 pure logical function accel_is_enabled() result(enabled)
315#ifdef HAVE_ACCEL
316 enabled = accel%enabled
317#else
318 enabled = .false.
319#endif
320 end function accel_is_enabled
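 ! Usage sketch (editorial illustration, not part of this module): callers are
 ! expected to guard device-specific code paths with accel_is_enabled() and keep
 ! a host fallback; the routine names in the branches below are hypothetical.
 !
 !   if (accel_is_enabled()) then
 !     call compute_on_device()   ! device path, e.g. through accel_kernel_run
 !   else
 !     call compute_on_host()     ! plain CPU path
 !   end if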
321
322 ! ------------------------------------------
323
324 pure logical function accel_allow_cpu_only() result(allow)
325#ifdef HAVE_ACCEL
326 allow = accel%allow_CPU_only
327#else
328 allow = .true.
329#endif
330 end function accel_allow_cpu_only
331
332 ! ------------------------------------------
333
334 subroutine accel_init(base_grp, namespace)
335 type(mpi_grp_t), intent(inout) :: base_grp
336 type(namespace_t), intent(in) :: namespace
337
338 logical :: disable, default, run_benchmark
339 integer :: idevice, iplatform
340#ifdef HAVE_OPENCL
341 integer :: device_type
342 integer :: cl_status, idev
343 integer :: ndevices, ret_devices, nplatforms, iplat
344 character(len=256) :: device_name
345 type(cl_platform_id) :: platform_id
346 type(cl_program) :: prog
347 type(cl_platform_id), allocatable :: allplatforms(:)
348 type(cl_device_id), allocatable :: alldevices(:)
349#endif
350#ifdef HAVE_CUDA
351 integer :: dim
352#ifdef HAVE_MPI
353 character(len=256) :: sys_name
354#endif
355#endif
356
357 push_sub(accel_init)
358
359 buffer_alloc_count = 0
360
361 !%Variable DisableAccel
362 !%Type logical
363 !%Default yes
364 !%Section Execution::Accel
365 !%Description
366 !% If Octopus was compiled with OpenCL or CUDA support, it will
367 !% try to initialize and use an accelerator device. By setting this
368 !% variable to <tt>yes</tt> you force Octopus not to use an accelerator even if it is available.
369 !%End
370 call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel')
371#ifdef HAVE_ACCEL
372 default = .false.
373#else
374 default = .true.
375#endif
376 call parse_variable(namespace, 'DisableAccel', default, disable)
377 accel%enabled = .not. disable
378
379#ifndef HAVE_ACCEL
380 if (accel%enabled) then
381 message(1) = 'Octopus was compiled without OpenCL or CUDA support.'
383 end if
384#endif
385
386 if (.not. accel_is_enabled()) then
387 pop_sub(accel_init)
388 return
389 end if
391 !%Variable AccelPlatform
392 !%Type integer
393 !%Default 0
394 !%Section Execution::Accel
395 !%Description
396 !% This variable selects the OpenCL platform that Octopus will
397 !% use. You can give an explicit platform number or use one of
398 !% the options that select a particular vendor
399 !% implementation. Platform 0 is used by default.
400 !%
401 !% This variable has no effect for CUDA.
402 !%Option amd -2
403 !% Use the AMD OpenCL platform.
404 !%Option nvidia -3
405 !% Use the Nvidia OpenCL platform.
406 !%Option ati -4
407 !% Use the ATI (old AMD) OpenCL platform.
408 !%Option intel -5
409 !% Use the Intel OpenCL platform.
410 !%End
411 call parse_variable(namespace, 'AccelPlatform', 0, iplatform)
412
413 call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform')
414
415 !%Variable AccelDevice
416 !%Type integer
417 !%Default gpu
418 !%Section Execution::Accel
419 !%Description
420 !% This variable selects the OpenCL or CUDA accelerator device
421 !% that Octopus will use. You can specify one of the options below
422 !% or a numerical id to select a specific device.
423 !%
424 !% Values >= 0 select the device to be used. In the case of MPI-enabled runs,
425 !% devices are distributed in a round-robin fashion, starting at this value.
426 !%Option gpu -1
427 !% If available, Octopus will use a GPU.
428 !%Option cpu -2
429 !% If available, Octopus will use a CPU (only for OpenCL).
430 !%Option accelerator -3
431 !% If available, Octopus will use an accelerator (only for OpenCL).
432 !%Option accel_default -4
433 !% Octopus will use the default device specified by the implementation.
435 !%End
436 call parse_variable(namespace, 'AccelDevice', opencl_gpu, idevice)
437
438 call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice')
439
440 if (idevice < opencl_default) then
441 call messages_write('Invalid AccelDevice')
442 call messages_fatal()
443 end if
444
445 call messages_print_with_emphasis(msg="GPU acceleration", namespace=namespace)
446
447#ifdef HAVE_CUDA
448 if (idevice<0) idevice = 0
449 call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
450 idevice, base_grp%rank)
451#ifdef HAVE_MPI
452 call loct_sysname(sys_name)
453 write(message(1), '(A,I5,A,I5,2A)') "Rank ", base_grp%rank, " uses device number ", idevice, &
454 " on ", trim(sys_name)
455 call messages_info(1, all_nodes = .true.)
456#endif
457
458 ! shared memory is supported in our CUDA interface (requested per kernel via kernel%cuda_shared_mem)
459 accel%shared_mem = .true.
460
461 call cublas_init(accel%cublas_handle, accel%cuda_stream)
462#endif
463
464#ifdef HAVE_OPENCL
465 call profiling_in('CL_INIT')
466
467 call clgetplatformids(nplatforms, cl_status)
468 if (cl_status /= cl_success) call opencl_print_error(cl_status, "GetPlatformIDs")
469
470 safe_allocate(allplatforms(1:nplatforms))
471
472 call clgetplatformids(allplatforms, iplat, cl_status)
473 if (cl_status /= cl_success) call opencl_print_error(cl_status, "GetPlatformIDs")
474
475 call messages_write('Info: Available CL platforms: ')
476 call messages_write(nplatforms)
477 call messages_info()
478
479 do iplat = 1, nplatforms
480
481 call clgetplatforminfo(allplatforms(iplat), cl_platform_name, device_name, cl_status)
482
483 if (iplatform < 0) then
484 if (iplatform == get_platform_id(device_name)) iplatform = iplat - 1
485 end if
486
487 if (iplatform == iplat - 1) then
488 call messages_write(' * Platform ')
489 else
490 call messages_write(' Platform ')
491 end if
492
493 call messages_write(iplat - 1)
494 call messages_write(' : '//device_name)
495 call clgetplatforminfo(allplatforms(iplat), cl_platform_version, device_name, cl_status)
496 call messages_write(' ('//trim(device_name)//')')
497 call messages_info()
498 end do
499
500 call messages_info()
501
502 if (iplatform >= nplatforms .or. iplatform < 0) then
503 call messages_write('Requested CL platform does not exist')
504 if (iplatform > 0) then
505 call messages_write('(platform = ')
506 call messages_write(iplatform)
507 call messages_write(').')
508 end if
509 call messages_fatal()
510 end if
511
512 platform_id = allplatforms(iplatform + 1)
513
514 safe_deallocate_a(allplatforms)
515
516 call clgetdeviceids(platform_id, cl_device_type_all, ndevices, cl_status)
517
518 call messages_write('Info: Available CL devices: ')
519 call messages_write(ndevices)
520 call messages_info()
521
522 safe_allocate(alldevices(1:ndevices))
523
524 ! list all devices
525
526 call clgetdeviceids(platform_id, cl_device_type_all, alldevices, ret_devices, cl_status)
527
528 do idev = 1, ndevices
529 call messages_write(' Device ')
530 call messages_write(idev - 1)
531 call clgetdeviceinfo(alldevices(idev), cl_device_name, device_name, cl_status)
532 call messages_write(' : '//device_name)
533 call messages_info()
534 end do
535
536 select case (idevice)
537 case (opencl_gpu)
538 device_type = cl_device_type_gpu
539 case (opencl_cpu)
540 device_type = cl_device_type_cpu
541 case (opencl_accelerator)
542 device_type = cl_device_type_accelerator
543 case (opencl_default)
544 device_type = cl_device_type_default
545 case default
546 device_type = cl_device_type_all
547 end select
548
549 ! now get a list of the selected type
550 call clgetdeviceids(platform_id, device_type, alldevices, ret_devices, cl_status)
551
552 if (ret_devices < 1) then
553 ! we didn't find a device of the selected type, so we ask for the default device
554 call clgetdeviceids(platform_id, cl_device_type_default, alldevices, ret_devices, cl_status)
555
556 if (ret_devices < 1) then
557 ! if this does not work, we ask for all devices
558 call clgetdeviceids(platform_id, cl_device_type_all, alldevices, ret_devices, cl_status)
559 end if
560
561 if (ret_devices < 1) then
562 call messages_write('Cannot find an OpenCL device')
563 call messages_fatal()
564 end if
565 end if
566
567 ! the number of devices of the requested type can be smaller than the total
568 ndevices = ret_devices
569
570 if (idevice < 0) then
571 if (base_grp%size > 1) then
572 ! with MPI we have to select the device so multiple GPUs in one
573 ! node are correctly distributed
574 call select_device(idevice)
575 else
576 idevice = 0
577 end if
578 end if
579
580 if (idevice >= ndevices) then
581 call messages_write('Requested CL device does not exist (device = ')
582 call messages_write(idevice)
583 call messages_write(', platform = ')
584 call messages_write(iplatform)
585 call messages_write(').')
586 call messages_fatal()
587 end if
588
589 accel%device%cl_device = alldevices(idevice + 1)
590
591 ! create the context
592 accel%context%cl_context = clcreatecontext(platform_id, accel%device%cl_device, cl_status)
593 if (cl_status /= cl_success) call opencl_print_error(cl_status, "CreateContext")
594
595 safe_deallocate_a(alldevices)
596
597 accel%command_queue = clcreatecommandqueue(accel%context%cl_context, accel%device%cl_device, &
598 cl_queue_profiling_enable, cl_status)
599 if (cl_status /= cl_success) call opencl_print_error(cl_status, "CreateCommandQueue")
600
601 call clgetdeviceinfo(accel%device%cl_device, cl_device_type, device_type, cl_status)
602
603 select case (device_type)
604 case (cl_device_type_gpu)
605 accel%shared_mem = .true.
606 case (cl_device_type_cpu, cl_device_type_accelerator)
607 accel%shared_mem = .false.
608 case default
609 accel%shared_mem = .false.
610 end select
611
612#ifdef HAVE_CLBLAS
613 call clblassetup(cl_status)
614 if (cl_status /= clblassuccess) call clblas_print_error(cl_status, 'clblasSetup')
615#endif
616
617#ifdef HAVE_CLFFT
618 call clfftsetup(cl_status)
619 if (cl_status /= clfft_success) call clfft_print_error(cl_status, 'clfftSetup')
620#endif
621
622 call profiling_out('CL_INIT')
623#endif
624
625 ! Get some device information that we will need later
626
627 ! total memory
628#ifdef HAVE_OPENCL
629 call clgetdeviceinfo(accel%device%cl_device, cl_device_global_mem_size, accel%global_memory_size, cl_status)
630 call clgetdeviceinfo(accel%device%cl_device, cl_device_local_mem_size, accel%local_memory_size, cl_status)
631 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_work_group_size, accel%max_workgroup_size, cl_status)
632 accel%warp_size = 1
633#endif
634#ifdef HAVE_CUDA
635 call cuda_device_total_memory(accel%device%cuda_device, accel%global_memory_size)
636 call cuda_device_shared_memory(accel%device%cuda_device, accel%local_memory_size)
637 call cuda_device_max_threads_per_block(accel%device%cuda_device, accel%max_workgroup_size)
638 call cuda_device_get_warpsize(accel%device%cuda_device, accel%warp_size)
639 call cuda_device_max_block_dim_x(accel%device%cuda_device, dim)
640 accel%max_block_dim(1) = int(dim, int64)
641 call cuda_device_max_block_dim_y(accel%device%cuda_device, dim)
642 accel%max_block_dim(2) = int(dim, int64)
643 call cuda_device_max_block_dim_z(accel%device%cuda_device, dim)
644 accel%max_block_dim(3) = int(dim, int64)
645 call cuda_device_max_grid_dim_x(accel%device%cuda_device, dim)
646 accel%max_grid_dim(1) = int(dim, int64)
647 call cuda_device_max_grid_dim_y(accel%device%cuda_device, dim)
648 accel%max_grid_dim(2) = int(dim, int64)
649 call cuda_device_max_grid_dim_z(accel%device%cuda_device, dim)
650 accel%max_grid_dim(3) = int(dim, int64)
651#endif
652
653 if (mpi_grp_is_root(base_grp)) call device_info()
654
655 ! initialize the cache used to speed up allocations
656 call alloc_cache_init(memcache, nint(0.25_real64*accel%global_memory_size, int64))
657
658 ! now initialize the kernels
660
661#if defined(HAVE_HIP)
662 accel%debug_flag = "-g"
663#elif defined(HAVE_CUDA)
664 accel%debug_flag = "-lineinfo"
665#elif defined(HAVE_OPENCL)
666 accel%debug_flag = "-g"
667#endif
668
669 call accel_kernel_start_call(set_zero, 'set_zero.cl', "set_zero")
670 call accel_kernel_start_call(set_one, 'set_one.cl', "set_one")
671 call accel_kernel_start_call(kernel_vpsi, 'vpsi.cl', "vpsi")
672 call accel_kernel_start_call(kernel_vpsi_complex, 'vpsi.cl', "vpsi_complex")
673 call accel_kernel_start_call(kernel_vpsi_spinors, 'vpsi.cl', "vpsi_spinors")
674 call accel_kernel_start_call(kernel_vpsi_spinors_complex, 'vpsi.cl', "vpsi_spinors_complex")
675 call accel_kernel_start_call(kernel_daxpy, 'axpy.cl', "daxpy", flags = '-DRTYPE_DOUBLE')
676 call accel_kernel_start_call(kernel_zaxpy, 'axpy.cl', "zaxpy", flags = '-DRTYPE_COMPLEX')
677 call accel_kernel_start_call(dkernel_batch_axpy, 'axpy.cl', "dbatch_axpy_function", &
678 flags = ' -DRTYPE_DOUBLE')
679 call accel_kernel_start_call(zkernel_batch_axpy, 'axpy.cl', "zbatch_axpy_function", &
680 flags = '-DRTYPE_COMPLEX')
681 call accel_kernel_start_call(dkernel_ax_function_py, 'axpy.cl', "dbatch_ax_function_py", &
682 flags = '-DRTYPE_DOUBLE')
683 call accel_kernel_start_call(zkernel_ax_function_py, 'axpy.cl', "zbatch_ax_function_py", &
684 flags = '-DRTYPE_COMPLEX')
685 call accel_kernel_start_call(dkernel_batch_dotp, 'mesh_batch_single.cl', "dbatch_mf_dotp")
686 call accel_kernel_start_call(zkernel_batch_dotp, 'mesh_batch_single.cl', "zbatch_mf_dotp")
687 call accel_kernel_start_call(dpack, 'pack.cl', "dpack")
688 call accel_kernel_start_call(zpack, 'pack.cl', "zpack")
689 call accel_kernel_start_call(dunpack, 'pack.cl', "dunpack")
690 call accel_kernel_start_call(zunpack, 'pack.cl', "zunpack")
691 call accel_kernel_start_call(kernel_copy, 'copy.cl', "copy")
692 call accel_kernel_start_call(kernel_ghost_reorder, 'ghost.cl', "ghost_reorder")
693 call accel_kernel_start_call(kernel_density_real, 'density.cl', "density_real")
694 call accel_kernel_start_call(kernel_density_complex, 'density.cl', "density_complex")
695 call accel_kernel_start_call(kernel_density_spinors, 'density.cl', "density_spinors")
696 call accel_kernel_start_call(kernel_phase, 'phase.cl', "phase")
697 call accel_kernel_start_call(dkernel_dot_matrix, 'mesh_batch.cl', "ddot_matrix")
698 call accel_kernel_start_call(zkernel_dot_matrix, 'mesh_batch.cl', "zdot_matrix")
699 call accel_kernel_start_call(zkernel_dot_matrix_spinors, 'mesh_batch.cl', "zdot_matrix_spinors")
700
701
702 call accel_kernel_start_call(dzmul, 'mul.cl', "dzmul", flags = '-DRTYPE_DOUBLE')
703 call accel_kernel_start_call(zzmul, 'mul.cl', "zzmul", flags = '-DRTYPE_COMPLEX')
704
705 !%Variable AccelBenchmark
706 !%Type logical
707 !%Default no
708 !%Section Execution::Accel
709 !%Description
710 !% If this variable is set to yes, Octopus will run some
711 !% routines to benchmark the performance of the accelerator device.
712 !%End
713 call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark)
714
715 call messages_obsolete_variable(namespace, 'OpenCLBenchmark', 'AccelBenchmark')
716
717 if (run_benchmark) then
719 end if
720
721 !%Variable GPUAwareMPI
722 !%Type logical
723 !%Section Execution::Accel
724 !%Description
725 !% If Octopus was compiled with GPU support and MPI support and if the MPI
726 !% implementation is GPU-aware (i.e., it supports communication using device pointers),
727 !% this switch can be set to true to use the GPU-aware MPI features. The advantage
728 !% of this approach is that it can do, e.g., peer-to-peer copies between devices without
729 !% going through the host memory.
730 !% The default is false, except when the configure switch --enable-cudampi is set, in which
731 !% case this variable is set to true.
732 !%End
733#ifdef HAVE_CUDA_MPI
734 default = .true.
735#else
736 default = .false.
737#endif
738 call parse_variable(namespace, 'GPUAwareMPI', default, accel%cuda_mpi)
739 if (accel%cuda_mpi) then
740#ifndef HAVE_CUDA_MPI
741 call messages_write("Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")
742 call messages_warning()
743#endif
744 call messages_write("Using GPU-aware MPI.")
745 call messages_info()
746 end if
747
748
749 !%Variable AllowCPUonly
750 !%Type logical
751 !%Section Execution::Accel
752 !%Description
753 !% In order to prevent waste of resources, the code will normally stop when the GPU is disabled due to
754 !% incomplete implementations or incompatibilities. AllowCPUonly = yes overrides this and allows the
755 !% code to continue running (on the CPU only) in these cases.
756 !%End
757#if defined (HAVE_ACCEL)
758 default = .false.
759#else
760 default = .true.
761#endif
762 call parse_variable(namespace, 'AllowCPUonly', default, accel%allow_CPU_only)
763
764
765 !%Variable InitializeGPUBuffers
766 !%Type logical
767 !%Section Execution::Accel
768 !%Description
769 !% Initialize new GPU buffers to zero on creation (use only for debugging, as it has a performance impact!).
770 !%End
771 call parse_variable(namespace, 'InitializeGPUBuffers', .false., accel%initialize_buffers)
772
773
774 call messages_print_with_emphasis(namespace=namespace)
775
776 pop_sub(accel_init)
777
778 contains
779
780#if defined(HAVE_OPENCL)
781 subroutine select_device(idevice)
782 integer, intent(inout) :: idevice
783 integer :: irank
784 character(len=256) :: device_name
785
786 push_sub(accel_init.select_device)
787
788 idevice = mod(base_grp%rank, ndevices)
789
790 call base_grp%barrier()
791 call messages_write('Info: CL device distribution:')
792 call messages_info()
793 do irank = 0, base_grp%size - 1
794 if (irank == base_grp%rank) then
795 call clgetdeviceinfo(alldevices(idevice + 1), cl_device_name, device_name, cl_status)
796 call messages_write(' MPI node ')
797 call messages_write(base_grp%rank)
798 call messages_write(' -> CL device ')
799 call messages_write(idevice)
800 call messages_write(' : '//device_name)
801 call messages_info(all_nodes = .true.)
802 end if
803 call base_grp%barrier()
804 end do
805
807 end subroutine select_device
808#endif
809
810 subroutine device_info()
811#ifdef HAVE_OPENCL
812 integer(int64) :: val
813#endif
814#ifdef HAVE_CUDA
815 integer :: version
816#endif
817 integer :: major, minor
818 character(len=256) :: val_str
819
820 push_sub(accel_init.device_info)
821
822 call messages_new_line()
823 call messages_write('Selected device:')
824 call messages_new_line()
825
826#ifdef HAVE_OPENCL
827 call messages_write(' Framework : OpenCL')
828#endif
829#ifdef HAVE_CUDA
830#ifdef __HIP_PLATFORM_AMD__
831 call messages_write(' Framework : ROCm')
832#else
833 call messages_write(' Framework : CUDA')
834#endif
835#endif
836 call messages_info()
837
838#ifdef HAVE_CUDA
839 call messages_write(' Device type : GPU', new_line = .true.)
840#ifdef __HIP_PLATFORM_AMD__
841 call messages_write(' Device vendor : AMD Corporation', new_line = .true.)
842#else
843 call messages_write(' Device vendor : NVIDIA Corporation', new_line = .true.)
844#endif
845#endif
846
847#ifdef HAVE_OPENCL
848 call clgetdeviceinfo(accel%device%cl_device, cl_device_type, val, cl_status)
849 call messages_write(' Device type :')
850 select case (int(val, int32))
851 case (cl_device_type_gpu)
852 call messages_write(' GPU')
853 case (cl_device_type_cpu)
854 call messages_write(' CPU')
855 case (cl_device_type_accelerator)
856 call messages_write(' accelerator')
857 end select
858 call messages_new_line()
860 call clgetdeviceinfo(accel%device%cl_device, cl_device_vendor, val_str, cl_status)
861 call messages_write(' Device vendor : '//trim(val_str))
862 call messages_new_line()
863#endif
864
865#ifdef HAVE_OPENCL
866 call clgetdeviceinfo(accel%device%cl_device, cl_device_name, val_str, cl_status)
867#endif
868#ifdef HAVE_CUDA
869 call cuda_device_name(accel%device%cuda_device, val_str)
870#endif
871 call messages_write(' Device name : '//trim(val_str))
872 call messages_new_line()
873
874#ifdef HAVE_CUDA
875 call cuda_device_capability(accel%device%cuda_device, major, minor)
876#endif
877 call messages_write(' Cuda capabilities :')
878 call messages_write(major, fmt = '(i2)')
879 call messages_write('.')
880 call messages_write(minor, fmt = '(i1)')
881 call messages_new_line()
882
883 ! VERSION
884#ifdef HAVE_OPENCL
885 call clgetdeviceinfo(accel%device%cl_device, cl_driver_version, val_str, cl_status)
886 call messages_write(' Driver version : '//trim(val_str))
887#endif
888#ifdef HAVE_CUDA
889 call cuda_driver_version(version)
890 call messages_write(' Driver version : ')
891 call messages_write(version)
892#endif
893 call messages_new_line()
894
895
896#ifdef HAVE_OPENCL
897 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_compute_units, val, cl_status)
898 call messages_write(' Compute units :')
899 call messages_write(val)
900 call messages_new_line()
901
902 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_clock_frequency, val, cl_status)
903 call messages_write(' Clock frequency :')
904 call messages_write(val)
905 call messages_write(' MHz')
906 call messages_new_line()
907#endif
908
909 call messages_write(' Device memory :')
910 call messages_write(accel%global_memory_size, units=unit_megabytes)
911 call messages_new_line()
912
913 call messages_write(' Local/shared memory :')
914 call messages_write(accel%local_memory_size, units=unit_kilobytes)
915 call messages_new_line()
916
917
918#ifdef HAVE_OPENCL
919 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_mem_alloc_size, val, cl_status)
920 call messages_write(' Max alloc size :')
921 call messages_write(val, units = unit_megabytes)
922 call messages_new_line()
923
924 call clgetdeviceinfo(accel%device%cl_device, cl_device_global_mem_cache_size, val, cl_status)
925 call messages_write(' Device cache :')
926 call messages_write(val, units = unit_kilobytes)
927 call messages_new_line()
928
929 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_constant_buffer_size, val, cl_status)
930 call messages_write(' Constant memory :')
931 call messages_write(val, units = unit_kilobytes)
932 call messages_new_line()
933#endif
934
935 call messages_write(' Max. group/block size :')
936 call messages_write(accel%max_workgroup_size)
937 call messages_new_line()
938
939
940#ifdef HAVE_OPENCL
941 call messages_write(' Extension cl_khr_fp64 :')
942 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64"))
943 call messages_new_line()
944
945 call messages_write(' Extension cl_amd_fp64 :')
946 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64"))
947 call messages_new_line()
948
949 call messages_write(' Extension cl_khr_int64_base_atomics :')
950 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_int64_base_atomics"))
951 call messages_new_line()
952
953#endif
954
955 call messages_info()
956
957
958 pop_sub(accel_init.device_info)
959 end subroutine device_info
960
961 end subroutine accel_init
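 ! Lifecycle sketch (editorial illustration, not part of this file): accel_init
 ! is called once after MPI start-up and accel_end once at shutdown; everything
 ! in between goes through the buffers and kernels managed by this module.
 ! mpi_world and global_namespace are assumed to be the usual Octopus globals.
 !
 !   call accel_init(mpi_world, global_namespace)
 !   ! ... create buffers, launch kernels, read results back ...
 !   call accel_end(global_namespace)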
962
963 ! ------------------------------------------
964#ifdef HAVE_OPENCL
965 integer function get_platform_id(platform_name) result(platform_id)
966 character(len=*), intent(in) :: platform_name
967
968 platform_id = cl_plat_invalid
969 if (index(platform_name, 'AMD') > 0) platform_id = cl_plat_amd
970 if (index(platform_name, 'ATI') > 0) platform_id = cl_plat_ati
971 if (index(platform_name, 'NVIDIA') > 0) platform_id = cl_plat_nvidia
972 if (index(platform_name, 'Intel') > 0) platform_id = cl_plat_intel
973 end function get_platform_id
974#endif
975 ! ------------------------------------------
976
977 subroutine accel_end(namespace)
978 type(namespace_t), intent(in) :: namespace
979
980#ifdef HAVE_OPENCL
981 integer :: ierr
982#endif
983 integer(int64) :: hits, misses
984 real(real64) :: volume_hits, volume_misses
985 logical :: found
986 type(accel_mem_t) :: tmp
987
988 push_sub(accel_end)
989
990 if (accel_is_enabled()) then
991
992 do
993 call alloc_cache_get(memcache, alloc_cache_any_size, found, tmp%mem)
994 if (.not. found) exit
995
996#ifdef HAVE_OPENCL
997 call clreleasememobject(tmp%mem, ierr)
998 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseMemObject")
999#endif
1000#ifdef HAVE_CUDA
1001 call cuda_mem_free(tmp%mem)
1002#endif
1003 end do
1004
1005 call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)
1006
1007 call messages_print_with_emphasis(msg="Acceleration-device allocation cache", namespace=namespace)
1008
1009 call messages_new_line()
1010 call messages_write(' Number of allocations =')
1011 call messages_write(hits + misses, new_line = .true.)
1012 call messages_write(' Volume of allocations =')
1013 call messages_write(volume_hits + volume_misses, fmt = 'f18.1', units = unit_gigabytes, align_left = .true., &
1014 new_line = .true.)
1015 call messages_write(' Hit ratio =')
1016 if (hits + misses > 0) then
1017 call messages_write(hits/real(hits + misses, real64)*100, fmt='(f6.1)', align_left = .true.)
1018 else
1019 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
1020 end if
1021 call messages_write('%', new_line = .true.)
1022 call messages_write(' Volume hit ratio =')
1023 if (volume_hits + volume_misses > 0) then
1024 call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt='(f6.1)', align_left = .true.)
1025 else
1026 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
1027 end if
1028 call messages_write('%')
1029 call messages_new_line()
1030 call messages_info()
1031
1032 call messages_print_with_emphasis(namespace=namespace)
1033 end if
1034
1036
1037#ifdef HAVE_CLBLAS
1038 call clblasteardown()
1039#endif
1040
1041#ifdef HAVE_CLFFT
1042 call clfftteardown()
1043#endif
1044
1045 if (accel_is_enabled()) then
1046#ifdef HAVE_CUDA
1047 call cublas_end(accel%cublas_handle)
1048 if (.not. accel%cuda_mpi) then ! CUDA aware MPI finalize will do the cleanup
1049 call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
1050 end if
1051#endif
1052
1053#ifdef HAVE_OPENCL
1054 call clreleasecommandqueue(accel%command_queue, ierr)
1056 if (ierr /= cl_success) call opencl_print_error(ierr, "ReleaseCommandQueue")
1057 call clreleasecontext(accel%context%cl_context, cl_status)
1058#endif
1059
1060 if (buffer_alloc_count /= 0) then
1061 call messages_write('Accel:')
1062 call messages_write(real(allocated_mem, real64) , fmt = 'f12.1', units = unit_megabytes, align_left = .true.)
1063 call messages_write(' in ')
1064 call messages_write(buffer_alloc_count)
1065 call messages_write(' buffers were not deallocated.')
1066 call messages_fatal()
1067 end if
1068
1069 end if
1070
1071 pop_sub(accel_end)
1072 end subroutine accel_end
1073
1074 ! ------------------------------------------
1075
1076 integer(int64) function accel_padded_size_i8(nn) result(psize)
1077 integer(int64), intent(in) :: nn
1078
1079 integer(int64) :: modnn, bsize
1080
1081 psize = nn
1082
1083 if (accel_is_enabled()) then
1084
1085 bsize = accel_max_workgroup_size()
1086
1087 psize = nn
1088 modnn = mod(nn, bsize)
1089 if (modnn /= 0) psize = psize + bsize - modnn
1090
1091 end if
1092
1093 end function accel_padded_size_i8
1094
1095 ! ------------------------------------------
1096
1097 integer(int32) function accel_padded_size_i4(nn) result(psize)
1098 integer(int32), intent(in) :: nn
1099
1100 psize = int(accel_padded_size_i8(int(nn, int64)), int32)
1101
1102 end function accel_padded_size_i4
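 ! Worked example (editorial, assuming accel_max_workgroup_size() returns 256):
 ! accel_padded_size rounds up to the next multiple of the workgroup size, so
 !
 !   np_padded = accel_padded_size(1000)   ! -> 1024
 !   np_padded = accel_padded_size(1024)   ! -> 1024 (already a multiple)
 !
 ! and with acceleration disabled the input is returned unchanged.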
1103
1104 ! ------------------------------------------
1105
1106 subroutine accel_create_buffer_4(this, flags, type, size, set_zero)
1107 type(accel_mem_t), intent(inout) :: this
1108 integer, intent(in) :: flags
1109 type(type_t), intent(in) :: type
1110 integer, intent(in) :: size
1111 logical, optional, intent(in) :: set_zero
1112
1113 call accel_create_buffer_8(this, flags, type, int(size, int64), set_zero)
1114 end subroutine accel_create_buffer_4
1115
1116 ! ------------------------------------------
1117
1118 subroutine accel_create_buffer_8(this, flags, type, size, set_zero)
1119 type(accel_mem_t), intent(inout) :: this
1120 integer, intent(in) :: flags
1121 type(type_t), intent(in) :: type
1122 integer(int64), intent(in) :: size
1123 logical, optional, intent(in) :: set_zero
1124
1125 integer(int64) :: fsize
1126 logical :: found
1127#ifdef HAVE_OPENCL
1128 integer :: ierr
1129#endif
1130
1131 push_sub(accel_create_buffer_8)
1132
1133 this%type = type
1134 this%size = size
1135 this%flags = flags
1136 fsize = int(size, int64)*types_get_size(type)
1137 this%allocated = .true.
1138
1139 if (fsize > 0) then
1140
1141 call alloc_cache_get(memcache, fsize, found, this%mem)
1142
1143 if (.not. found) then
1144#ifdef HAVE_OPENCL
1145 this%mem = clcreatebuffer(accel%context%cl_context, flags, fsize, ierr)
1146 if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateBuffer")
1147#endif
1148#ifdef HAVE_CUDA
1149 call cuda_mem_alloc(this%mem, fsize)
1150#endif
1151 end if
1152
1153 buffer_alloc_count = buffer_alloc_count + 1
1154 allocated_mem = allocated_mem + fsize
1155
1156 end if
1157
1158 if(optional_default(set_zero, accel%initialize_buffers)) then
1159 call accel_set_buffer_to_zero_i8(this, type, size)
1160 endif
1161
1162 pop_sub(accel_create_buffer_8)
1163 end subroutine accel_create_buffer_8
1164
1165 ! ------------------------------------------
1166
1167 subroutine accel_release_buffer(this)
1168 type(accel_mem_t), intent(inout) :: this
1169
1170#ifdef HAVE_OPENCL
1171 integer :: ierr
1172#endif
1173 logical :: put
1174 integer(int64) :: fsize
1176 push_sub(accel_release_buffer)
1177
1178 if (this%size > 0) then
1179
1180 fsize = int(this%size, int64)*types_get_size(this%type)
1181
1182 call alloc_cache_put(memcache, fsize, this%mem, put)
1183
1184 if (.not. put) then
1185#ifdef HAVE_OPENCL
1186 call clreleasememobject(this%mem, ierr)
1187 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseMemObject")
1188#endif
1189#ifdef HAVE_CUDA
1190 call cuda_mem_free(this%mem)
1191#endif
1192 end if
1193
1194 buffer_alloc_count = buffer_alloc_count - 1
1195 allocated_mem = allocated_mem - fsize
1197 end if
1198
1199 this%size = 0
1200 this%flags = 0
1201
1202 this%allocated = .false.
1203
1204 pop_sub(accel_release_buffer)
1205 end subroutine accel_release_buffer
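 ! Buffer lifecycle sketch (editorial illustration): a device buffer is created,
 ! filled, used and released; thanks to the allocation cache above, repeated
 ! create/release cycles with equal sizes reuse the same device memory. The
 ! generic names accel_create_buffer/accel_write_buffer and the constant
 ! TYPE_FLOAT are assumed from the rest of the code base.
 !
 !   type(accel_mem_t) :: buf
 !   call accel_create_buffer(buf, ACCEL_MEM_READ_WRITE, TYPE_FLOAT, np)
 !   call accel_write_buffer(buf, np, psi)   ! argument order illustrative
 !   ! ... run kernels on buf ...
 !   call accel_release_buffer(buf)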
1206
1207 ! ------------------------------------------
1208
1209 logical pure function accel_buffer_is_allocated(this) result(allocated)
1210 type(accel_mem_t), intent(in) :: this
1211
1212 allocated = this%allocated
1213 end function accel_buffer_is_allocated
1214
1215 ! -----------------------------------------
1216
1217 subroutine accel_finish()
1218#ifdef HAVE_OPENCL
1219 integer :: ierr
1220#endif
1221
1222 ! no push_sub, called too frequently
1223
1224 if (accel_is_enabled()) then
1225#ifdef HAVE_OPENCL
1226 call clfinish(accel%command_queue, ierr)
1227 if (ierr /= cl_success) call opencl_print_error(ierr, 'clFinish')
1228#endif
1229#ifdef HAVE_CUDA
1231#endif
1232 end if
1233 end subroutine accel_finish
1234
1235 ! ------------------------------------------
1236
1237 subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
1238 type(accel_kernel_t), intent(inout) :: kernel
1239 integer, intent(in) :: narg
1240 type(accel_mem_t), intent(in) :: buffer
1241
1242#ifdef HAVE_OPENCL
1243 integer :: ierr
1244#endif
1246 assert(accel_buffer_is_allocated(buffer))
1247
1248 ! no push_sub, called too frequently
1249#ifdef HAVE_OPENCL
1250 call clsetkernelarg(kernel%kernel, narg, buffer%mem, ierr)
1251 if (ierr /= cl_success) call opencl_print_error(ierr, "clSetKernelArg_buf")
1252#endif
1253
1254#ifdef HAVE_CUDA
1255 call cuda_kernel_set_arg_buffer(kernel%arguments, buffer%mem, narg)
1256#endif
1257
1258 end subroutine accel_set_kernel_arg_buffer
1259
1260 ! ------------------------------------------
1261
1262 subroutine accel_set_kernel_arg_local(kernel, narg, type, size)
1263 type(accel_kernel_t), intent(inout) :: kernel
1264 integer, intent(in) :: narg
1265 type(type_t), intent(in) :: type
1266 integer, intent(in) :: size
1267
1268#ifdef HAVE_OPENCL
1269 integer :: ierr
1270#endif
1271 integer(int64) :: size_in_bytes
1272
1274
1275
1276 size_in_bytes = int(size, int64)*types_get_size(type)
1277
1278 if (size_in_bytes > accel%local_memory_size) then
1279 write(message(1), '(a,f12.6,a)') "CL Error: requested local memory: ", real(size_in_bytes, real64) /1024.0, " KiB"
1280 write(message(2), '(a,f12.6,a)') " available local memory: ", real(accel%local_memory_size, real64) /1024.0, " KiB"
1281 call messages_fatal(2)
1282 else if (size_in_bytes <= 0) then
1283 write(message(1), '(a,i10)') "CL Error: invalid local memory size: ", size_in_bytes
1284 call messages_fatal(1)
1285 end if
1286
1287#ifdef HAVE_CUDA
1288 kernel%cuda_shared_mem = size_in_bytes
1289#endif
1290
1291#ifdef HAVE_OPENCL
1292 call clsetkernelarglocal(kernel%kernel, narg, size_in_bytes, ierr)
1293 if (ierr /= cl_success) call opencl_print_error(ierr, "set_kernel_arg_local")
1294#endif
1297 end subroutine accel_set_kernel_arg_local
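 ! Local-memory sketch (editorial illustration): a reduction-style kernel can
 ! request one workgroup-sized scratch array in local/shared memory right before
 ! the launch; the argument index and the element type are assumptions here.
 !
 !   wgsize = accel_kernel_workgroup_size(my_kernel)
 !   call accel_set_kernel_arg_local(my_kernel, 2, TYPE_FLOAT, wgsize)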
1298
1299 ! ------------------------------------------
1300
1301 subroutine accel_kernel_run_8(kernel, globalsizes, localsizes)
1302 type(accel_kernel_t), intent(inout) :: kernel
1303 integer(int64), intent(in) :: globalsizes(:)
1304 integer(int64), intent(in) :: localsizes(:)
1305
1306 integer :: dim
1307#ifdef HAVE_OPENCL
1308 integer :: ierr
1309#endif
1310 integer(int64) :: gsizes(1:3)
1311 integer(int64) :: lsizes(1:3)
1312
1313 ! no push_sub, called too frequently
1314
1315 ! cuda needs all dimensions
1316 gsizes = 1
1317 lsizes = 1
1318
1319 dim = ubound(globalsizes, dim=1)
1320
1321 assert(dim == ubound(localsizes, dim=1))
1322
1323 ! if one size is zero, there is nothing to do
1324 if (any(globalsizes == 0)) return
1325
1326 assert(all(localsizes > 0))
1327 assert(all(localsizes <= accel_max_workgroup_size()))
1328 assert(all(mod(globalsizes, localsizes) == 0))
1329
1330 gsizes(1:dim) = globalsizes(1:dim)
1331 lsizes(1:dim) = localsizes(1:dim)
1332
1333#ifdef HAVE_OPENCL
1334 call clenqueuendrangekernel(accel%command_queue, kernel%kernel, gsizes(1:dim), lsizes(1:dim), ierr)
1335 if (ierr /= cl_success) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
1336#endif
1337
1338#ifdef HAVE_CUDA
1339 ! Maximum dimension of a block
1340 if (any(lsizes(1:3) > accel%max_block_dim(1:3))) then
1341 message(1) = "Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
1342 message(2) = "The following conditions should be fulfilled:"
1343 write(message(3), "(A, I8, A, I8)") "Dim 1: ", lsizes(1), " <= ", accel%max_block_dim(1)
1344 write(message(4), "(A, I8, A, I8)") "Dim 2: ", lsizes(2), " <= ", accel%max_block_dim(2)
1345 write(message(5), "(A, I8, A, I8)") "Dim 3: ", lsizes(3), " <= ", accel%max_block_dim(3)
1346 message(6) = "This is an internal error, please contact the developers."
1347 call messages_fatal(6)
1348 end if
1349
1350
1351 ! Maximum number of threads per block
1352 if (product(lsizes) > accel_max_workgroup_size()) then
1353 message(1) = "Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
1354 message(2) = "The following condition should be fulfilled:"
1355 write(message(3), "(I8, A, I8)") product(lsizes), " <= ", accel_max_workgroup_size()
1356 message(4) = "This is an internal error, please contact the developers."
1357 call messages_fatal(4)
1358 end if
1359
1360 gsizes(1:3) = gsizes(1:3)/lsizes(1:3)
1361
1362 ! Maximum dimensions of the grid of thread block
1363 if (any(gsizes(1:3) > accel%max_grid_dim(1:3))) then
1364 message(1) = "Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
1365 message(2) = "The following conditions should be fulfilled:"
1366 write(message(3), "(A, I8, A, I10)") "Dim 1: ", gsizes(1), " <= ", accel%max_grid_dim(1)
1367 write(message(4), "(A, I8, A, I10)") "Dim 2: ", gsizes(2), " <= ", accel%max_grid_dim(2)
1368 write(message(5), "(A, I8, A, I10)") "Dim 3: ", gsizes(3), " <= ", accel%max_grid_dim(3)
1369 message(6) = "This is an internal error, please contact the developers."
1370 call messages_fatal(6)
1371 end if
1372
1373 call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), lsizes(1), kernel%cuda_shared_mem, kernel%arguments)
1374
1375 kernel%cuda_shared_mem = 0
1376#endif
1377
1378 end subroutine accel_kernel_run_8
1380 ! -----------------------------------------------
1381
1382 subroutine accel_kernel_run_4(kernel, globalsizes, localsizes)
1383 type(accel_kernel_t), intent(inout) :: kernel
1384 integer, intent(in) :: globalsizes(:)
1385 integer, intent(in) :: localsizes(:)
1386
1387 call accel_kernel_run_8(kernel, int(globalsizes, int64), int(localsizes, int64))
1388
1389 end subroutine accel_kernel_run_4
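 ! Launch sketch (editorial illustration): set the arguments, choose a local
 ! size from accel_kernel_workgroup_size, pad the global size and run; the
 ! argument list of kernel_copy shown here is an assumption.
 !
 !   wgsize = accel_kernel_workgroup_size(kernel_copy)
 !   call accel_set_kernel_arg(kernel_copy, 0, buf_in)
 !   call accel_set_kernel_arg(kernel_copy, 1, buf_out)
 !   call accel_kernel_run(kernel_copy, (/accel_padded_size(np)/), (/wgsize/))
 !   call accel_finish()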
1390
1391 ! -----------------------------------------------
1392
1393 integer pure function accel_max_workgroup_size() result(max_workgroup_size)
1394 max_workgroup_size = accel%max_workgroup_size
1395 end function accel_max_workgroup_size
1396
1397 ! -----------------------------------------------
1398
1399 integer function accel_kernel_workgroup_size(kernel) result(workgroup_size)
1400 type(accel_kernel_t), intent(inout) :: kernel
1401
1402#ifdef HAVE_OPENCL
1403 integer(int64) :: workgroup_size8
1404 integer :: ierr
1405#endif
1406#ifdef HAVE_CUDA
1407 integer :: max_workgroup_size
1408#endif
1409
1410 workgroup_size = 0
1411
1412#ifdef HAVE_OPENCL
1413 call clgetkernelworkgroupinfo(kernel%kernel, accel%device%cl_device, cl_kernel_work_group_size, workgroup_size8, ierr)
1414 if (ierr /= cl_success) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
1415 workgroup_size = workgroup_size8
1416#endif
1417
1418#ifdef HAVE_CUDA
1419 call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_workgroup_size)
1420 if (debug%info .and. max_workgroup_size /= accel%max_workgroup_size) then
1421 write(message(1), "(A, I5, A, I5, A)") "This kernel can use fewer threads per block (", max_workgroup_size, &
1422 ") than are available on the device (", accel%max_workgroup_size, ")"
1423 call messages_info(1)
1424 end if
1425 ! recommended number of threads per block is 256 according to the CUDA best practice guide
1426 ! see https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#thread-and-block-heuristics
1427 workgroup_size = 256
1428 ! make sure we do not use more threads per block than available for this kernel
1429 workgroup_size = min(workgroup_size, max_workgroup_size)
1430#endif
1431
1432 end function accel_kernel_workgroup_size
1433
1434 ! -----------------------------------------------
1435
1436#ifdef HAVE_OPENCL
1437 subroutine opencl_build_program(prog, filename, flags)
1438 type(cl_program), intent(inout) :: prog
1439 character(len=*), intent(in) :: filename
1440 character(len=*), optional, intent(in) :: flags
1441
1442 character(len = 1000) :: string
1443 character(len = 256) :: share_string
1444 integer :: ierr, ierrlog, iunit, irec, newlen
1445
1446 push_sub(opencl_build_program)
1447
1448 string = '#include "'//trim(filename)//'"'
1449
1450 if (debug%info) then
1451 call messages_write("Building CL program '"//trim(filename)//"'.")
1452 call messages_info()
1453 end if
1454
1455 prog = clcreateprogramwithsource(accel%context%cl_context, trim(string), ierr)
1456 if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateProgramWithSource")
1457
1458 ! build the compilation flags
1459 string='-w'
1460 ! full optimization
1461 string=trim(string)//' -cl-denorms-are-zero'
1462 ! The following flag gives an error with the Xeon Phi
1463 ! string=trim(string)//' -cl-strict-aliasing'
1464 string=trim(string)//' -cl-mad-enable'
1465 string=trim(string)//' -cl-unsafe-math-optimizations'
1466 string=trim(string)//' -cl-finite-math-only'
1467 string=trim(string)//' -cl-fast-relaxed-math'
1468
1469 share_string='-I'//trim(conf%share)//'/opencl/'
1470
1471 if (f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64")) then
1472 string = trim(string)//' -DEXT_KHR_FP64'
1473 else if (f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64")) then
1474 string = trim(string)//' -DEXT_AMD_FP64'
1475 else
1476 call messages_write('Octopus requires an OpenCL device with double-precision support.')
1477 call messages_fatal()
1478 end if
1479
1480 if (accel_use_shared_mem()) then
1481 string = trim(string)//' -DSHARED_MEM'
1482 end if
1483
1484 if (present(flags)) then
1485 string = trim(string)//' '//trim(flags)
1486 end if
1487
1488 if (debug%info) then
1489 call messages_write("Debug info: compilation flags '"//trim(string), new_line = .true.)
1490 call messages_write(' '//trim(share_string)//"'.")
1491 call messages_info()
1492 end if
1493
1494 string = trim(string)//' '//trim(share_string)
1495
1496 call clbuildprogram(prog, trim(string), ierr)
1497
1498 if(ierr /= cl_success) then
1499 call clgetprogrambuildinfo(prog, accel%device%cl_device, cl_program_build_log, string, ierrlog)
1500 if (ierrlog /= cl_success) call opencl_print_error(ierrlog, "clGetProgramBuildInfo")
1501
1502 ! CL_PROGRAM_BUILD_LOG seems to have a useless '\n' in it
1503 newlen = scan(string, achar(010), back = .true.) - 1
1504 if (newlen >= 0) string = string(1:newlen)
1505
1506 if (len(trim(string)) > 0) write(stderr, '(a)') trim(string)
1507
1508 call opencl_print_error(ierr, "clBuildProgram")
1509 end if
1510
1511 pop_sub(opencl_build_program)
1512 end subroutine opencl_build_program
1513#endif
1514
1515 ! -----------------------------------------------
1516#ifdef HAVE_OPENCL
1517 subroutine opencl_release_program(prog)
1518 type(cl_program), intent(inout) :: prog
1519
1520 integer :: ierr
1521
1522 push_sub(opencl_release_program)
1523
1524 call clreleaseprogram(prog, ierr)
1525 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseProgram")
1526
1527 pop_sub(opencl_release_program)
1528 end subroutine opencl_release_program
1529#endif
1530
1531 ! -----------------------------------------------
1532
1533#ifdef HAVE_OPENCL
1534 subroutine opencl_release_kernel(prog)
1535 type(cl_kernel), intent(inout) :: prog
1536
1537 integer :: ierr
1538
1539 push_sub(opencl_release_kernel)
1540
1541#ifdef HAVE_OPENCL
1542 call clreleasekernel(prog, ierr)
1543 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseKernel")
1544#endif
1545
1546 pop_sub(opencl_release_kernel)
1547 end subroutine opencl_release_kernel
1548#endif
1549
1550#ifdef HAVE_OPENCL
1551 ! -----------------------------------------------
1552 subroutine opencl_create_kernel(kernel, prog, name)
1553 type(cl_kernel), intent(inout) :: kernel
1554 type(cl_program), intent(inout) :: prog
1555 character(len=*), intent(in) :: name
1556
1557 integer :: ierr
1558
1559 push_sub(opencl_create_kernel)
1560 call profiling_in("CL_BUILD_KERNEL", exclude = .true.)
1561
1562#ifdef HAVE_OPENCL
1563 kernel = clcreatekernel(prog, name, ierr)
1564 if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateKernel")
1565#endif
1566
1567 call profiling_out("CL_BUILD_KERNEL")
1568 pop_sub(opencl_create_kernel)
1569 end subroutine opencl_create_kernel
1570#endif
1571
1572 ! ------------------------------------------------
1573#ifdef HAVE_OPENCL
1574 subroutine opencl_print_error(ierr, name)
1575 integer, intent(in) :: ierr
1576 character(len=*), intent(in) :: name
1577
1578 character(len=40) :: errcode
1579
1580 push_sub(opencl_print_error)
1581
1582 select case (ierr)
1583 case (cl_success); errcode = 'CL_SUCCESS '
1584 case (cl_device_not_found); errcode = 'CL_DEVICE_NOT_FOUND '
1585 case (cl_device_not_available); errcode = 'CL_DEVICE_NOT_AVAILABLE '
1586 case (cl_compiler_not_available); errcode = 'CL_COMPILER_NOT_AVAILABLE '
1587 case (cl_mem_object_allocation_failure); errcode = 'CL_MEM_OBJECT_ALLOCATION_FAILURE '
1588 case (cl_out_of_resources); errcode = 'CL_OUT_OF_RESOURCES '
1589 case (cl_out_of_host_memory); errcode = 'CL_OUT_OF_HOST_MEMORY '
1590 case (cl_profiling_info_not_available); errcode = 'CL_PROFILING_INFO_NOT_AVAILABLE '
1591 case (cl_mem_copy_overlap); errcode = 'CL_MEM_COPY_OVERLAP '
1592 case (cl_image_format_mismatch); errcode = 'CL_IMAGE_FORMAT_MISMATCH '
1593 case (cl_image_format_not_supported); errcode = 'CL_IMAGE_FORMAT_NOT_SUPPORTED '
1594 case (cl_build_program_failure); errcode = 'CL_BUILD_PROGRAM_FAILURE '
1595 case (cl_map_failure); errcode = 'CL_MAP_FAILURE '
1596 case (cl_invalid_value); errcode = 'CL_INVALID_VALUE '
1597 case (cl_invalid_device_type); errcode = 'CL_INVALID_DEVICE_TYPE '
1598 case (cl_invalid_platform); errcode = 'CL_INVALID_PLATFORM '
1599 case (cl_invalid_device); errcode = 'CL_INVALID_DEVICE '
1600 case (cl_invalid_context); errcode = 'CL_INVALID_CONTEXT '
1601 case (cl_invalid_queue_properties); errcode = 'CL_INVALID_QUEUE_PROPERTIES '
1602 case (cl_invalid_command_queue); errcode = 'CL_INVALID_COMMAND_QUEUE '
1603 case (cl_invalid_host_ptr); errcode = 'CL_INVALID_HOST_PTR '
1604 case (cl_invalid_mem_object); errcode = 'CL_INVALID_MEM_OBJECT '
1605 case (cl_invalid_image_format_descriptor); errcode = 'CL_INVALID_IMAGE_FORMAT_DESCRIPTOR '
1606 case (cl_invalid_image_size); errcode = 'CL_INVALID_IMAGE_SIZE '
1607 case (cl_invalid_sampler); errcode = 'CL_INVALID_SAMPLER '
1608 case (cl_invalid_binary); errcode = 'CL_INVALID_BINARY '
1609 case (cl_invalid_build_options); errcode = 'CL_INVALID_BUILD_OPTIONS '
1610 case (cl_invalid_program); errcode = 'CL_INVALID_PROGRAM '
1611 case (cl_invalid_program_executable); errcode = 'CL_INVALID_PROGRAM_EXECUTABLE '
1612 case (cl_invalid_kernel_name); errcode = 'CL_INVALID_KERNEL_NAME '
1613 case (cl_invalid_kernel_definition); errcode = 'CL_INVALID_KERNEL_DEFINITION '
1614 case (cl_invalid_kernel); errcode = 'CL_INVALID_KERNEL '
1615 case (cl_invalid_arg_index); errcode = 'CL_INVALID_ARG_INDEX '
1616 case (cl_invalid_arg_value); errcode = 'CL_INVALID_ARG_VALUE '
1617 case (cl_invalid_arg_size); errcode = 'CL_INVALID_ARG_SIZE '
1618 case (cl_invalid_kernel_args); errcode = 'CL_INVALID_KERNEL_ARGS '
1619 case (cl_invalid_work_dimension); errcode = 'CL_INVALID_WORK_DIMENSION '
1620 case (cl_invalid_work_group_size); errcode = 'CL_INVALID_WORK_GROUP_SIZE '
1621 case (cl_invalid_work_item_size); errcode = 'CL_INVALID_WORK_ITEM_SIZE '
1622 case (cl_invalid_global_offset); errcode = 'CL_INVALID_GLOBAL_OFFSET '
1623 case (cl_invalid_event_wait_list); errcode = 'CL_INVALID_EVENT_WAIT_LIST '
1624 case (cl_invalid_event); errcode = 'CL_INVALID_EVENT '
1625 case (cl_invalid_operation); errcode = 'CL_INVALID_OPERATION '
1626 case (cl_invalid_gl_object); errcode = 'CL_INVALID_GL_OBJECT '
1627 case (cl_invalid_buffer_size); errcode = 'CL_INVALID_BUFFER_SIZE '
1628 case (cl_invalid_mip_level); errcode = 'CL_INVALID_MIP_LEVEL '
1629 case (cl_invalid_global_work_size); errcode = 'CL_INVALID_GLOBAL_WORK_SIZE '
1630 case (cl_platform_not_found_khr); errcode = 'CL_PLATFORM_NOT_FOUND_KHR'
1631 case default
1632 write(errcode, '(i10)') ierr
1633 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1634 end select
1635
1636 message(1) = 'OpenCL '//trim(name)//' '//trim(errcode)
1637 call messages_fatal(1)
1638
1639 pop_sub(opencl_print_error)
1640 end subroutine opencl_print_error
1641#endif
1642
1643 ! ----------------------------------------------------
1644
1645 subroutine clblas_print_error(ierr, name)
1646 integer, intent(in) :: ierr
1647 character(len=*), intent(in) :: name
1648
1649 character(len=40) :: errcode
1650
1651 push_sub(clblas_print_error)
1652#if defined(HAVE_CLBLAS) || defined(HAVE_CLBLAST)
1653 select case (ierr)
1654 case (clblassuccess); errcode = 'clblasSuccess'
1655 case (clblasinvalidvalue); errcode = 'clblasInvalidValue'
1656 case (clblasinvalidcommandqueue); errcode = 'clblasInvalidCommandQueue'
1657 case (clblasinvalidcontext); errcode = 'clblasInvalidContext'
1658 case (clblasinvalidmemobject); errcode = 'clblasInvalidMemObject'
1659 case (clblasinvaliddevice); errcode = 'clblasInvalidDevice'
1660 case (clblasinvalideventwaitlist); errcode = 'clblasInvalidEventWaitList'
1661 case (clblasoutofresources); errcode = 'clblasOutOfResources'
1662 case (clblasoutofhostmemory); errcode = 'clblasOutOfHostMemory'
1663 case (clblasinvalidoperation); errcode = 'clblasInvalidOperation'
1664 case (clblascompilernotavailable); errcode = 'clblasCompilerNotAvailable'
1665 case (clblasbuildprogramfailure); errcode = 'clblasBuildProgramFailure'
1666 case (clblasnotimplemented); errcode = 'clblasNotImplemented'
1667 case (clblasnotinitialized); errcode = 'clblasNotInitialized'
1668 case (clblasinvalidmata); errcode = 'clblasInvalidMatA'
1669 case (clblasinvalidmatb); errcode = 'clblasInvalidMatB'
1670 case (clblasinvalidmatc); errcode = 'clblasInvalidMatC'
1671 case (clblasinvalidvecx); errcode = 'clblasInvalidVecX'
1672 case (clblasinvalidvecy); errcode = 'clblasInvalidVecY'
1673 case (clblasinvaliddim); errcode = 'clblasInvalidDim'
1674 case (clblasinvalidleaddima); errcode = 'clblasInvalidLeadDimA'
1675 case (clblasinvalidleaddimb); errcode = 'clblasInvalidLeadDimB'
1676 case (clblasinvalidleaddimc); errcode = 'clblasInvalidLeadDimC'
1677 case (clblasinvalidincx); errcode = 'clblasInvalidIncX'
1678 case (clblasinvalidincy); errcode = 'clblasInvalidIncY'
1679 case (clblasinsufficientmemmata); errcode = 'clblasInsufficientMemMatA'
1680 case (clblasinsufficientmemmatb); errcode = 'clblasInsufficientMemMatB'
1681 case (clblasinsufficientmemmatc); errcode = 'clblasInsufficientMemMatC'
1682 case (clblasinsufficientmemvecx); errcode = 'clblasInsufficientMemVecX'
1683 case (clblasinsufficientmemvecy); errcode = 'clblasInsufficientMemVecY'
1684#ifdef HAVE_CLBLAST
1685 case (clblastinsufficientmemorytemp); errcode = 'clblastInsufficientMemoryTemp'
1686 case (clblastinvalidbatchcount); errcode = 'clblastInvalidBatchCount'
1687 case (clblastinvalidoverridekernel); errcode = 'clblastInvalidOverrideKernel'
1688 case (clblastmissingoverrideparameter); errcode = 'clblastMissingOverrideParameter'
1689 case (clblastinvalidlocalmemusage); errcode = 'clblastInvalidLocalMemUsage'
1690 case (clblastnohalfprecision); errcode = 'clblastNoHalfPrecision'
1691 case (clblastnodoubleprecision); errcode = 'clblastNoDoublePrecision'
1692 case (clblastinvalidvectorscalar); errcode = 'clblastInvalidVectorScalar'
1693 case (clblastinsufficientmemoryscalar); errcode = 'clblastInsufficientMemoryScalar'
1694 case (clblastdatabaseerror); errcode = 'clblastDatabaseError'
1695 case (clblastunknownerror); errcode = 'clblastUnknownError'
1696 case (clblastunexpectederror); errcode = 'clblastUnexpectedError'
1697#endif
1698
1699 case default
1700 write(errcode, '(i10)') ierr
1701 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1702 end select
1703#endif
1704
1705 message(1) = 'Error in calling clblas routine '//trim(name)//' : '//trim(errcode)
1706 call messages_fatal(1)
1707
1708 pop_sub(clblas_print_error)
1709 end subroutine clblas_print_error
1710
1711 ! ----------------------------------------------------
1712 subroutine clfft_print_error(ierr, name)
1713 integer, intent(in) :: ierr
1714 character(len=*), intent(in) :: name
1715
1716 character(len=40) :: errcode
1717
1718 push_sub(clfft_print_error)
1719#ifdef HAVE_CLFFT
1720 select case (ierr)
1721 case (clfft_invalid_global_work_size); errcode = 'CLFFT_INVALID_GLOBAL_WORK_SIZE'
1722 case (clfft_invalid_mip_level); errcode = 'CLFFT_INVALID_MIP_LEVEL'
1723 case (clfft_invalid_buffer_size); errcode = 'CLFFT_INVALID_BUFFER_SIZE'
1724 case (clfft_invalid_gl_object); errcode = 'CLFFT_INVALID_GL_OBJECT'
1725 case (clfft_invalid_operation); errcode = 'CLFFT_INVALID_OPERATION'
1726 case (clfft_invalid_event); errcode = 'CLFFT_INVALID_EVENT'
1727 case (clfft_invalid_event_wait_list); errcode = 'CLFFT_INVALID_EVENT_WAIT_LIST'
1728 case (clfft_invalid_global_offset); errcode = 'CLFFT_INVALID_GLOBAL_OFFSET'
1729 case (clfft_invalid_work_item_size); errcode = 'CLFFT_INVALID_WORK_ITEM_SIZE'
1730 case (clfft_invalid_work_group_size); errcode = 'CLFFT_INVALID_WORK_GROUP_SIZE'
1731 case (clfft_invalid_work_dimension); errcode = 'CLFFT_INVALID_WORK_DIMENSION'
1732 case (clfft_invalid_kernel_args); errcode = 'CLFFT_INVALID_KERNEL_ARGS'
1733 case (clfft_invalid_arg_size); errcode = 'CLFFT_INVALID_ARG_SIZE'
1734 case (clfft_invalid_arg_value); errcode = 'CLFFT_INVALID_ARG_VALUE'
1735 case (clfft_invalid_arg_index); errcode = 'CLFFT_INVALID_ARG_INDEX'
1736 case (clfft_invalid_kernel); errcode = 'CLFFT_INVALID_KERNEL'
1737 case (clfft_invalid_kernel_definition); errcode = 'CLFFT_INVALID_KERNEL_DEFINITION'
1738 case (clfft_invalid_kernel_name); errcode = 'CLFFT_INVALID_KERNEL_NAME'
1739 case (clfft_invalid_program_executable); errcode = 'CLFFT_INVALID_PROGRAM_EXECUTABLE'
1740 case (clfft_invalid_program); errcode = 'CLFFT_INVALID_PROGRAM'
1741 case (clfft_invalid_build_options); errcode = 'CLFFT_INVALID_BUILD_OPTIONS'
1742 case (clfft_invalid_binary); errcode = 'CLFFT_INVALID_BINARY'
1743 case (clfft_invalid_sampler); errcode = 'CLFFT_INVALID_SAMPLER'
1744 case (clfft_invalid_image_size); errcode = 'CLFFT_INVALID_IMAGE_SIZE'
1745 case (clfft_invalid_image_format_descriptor); errcode = 'CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR'
1746 case (clfft_invalid_mem_object); errcode = 'CLFFT_INVALID_MEM_OBJECT'
1747 case (clfft_invalid_host_ptr); errcode = 'CLFFT_INVALID_HOST_PTR'
1748 case (clfft_invalid_command_queue); errcode = 'CLFFT_INVALID_COMMAND_QUEUE'
1749 case (clfft_invalid_queue_properties); errcode = 'CLFFT_INVALID_QUEUE_PROPERTIES'
1750 case (clfft_invalid_context); errcode = 'CLFFT_INVALID_CONTEXT'
1751 case (clfft_invalid_device); errcode = 'CLFFT_INVALID_DEVICE'
1752 case (clfft_invalid_platform); errcode = 'CLFFT_INVALID_PLATFORM'
1753 case (clfft_invalid_device_type); errcode = 'CLFFT_INVALID_DEVICE_TYPE'
1754 case (clfft_invalid_value); errcode = 'CLFFT_INVALID_VALUE'
1755 case (clfft_map_failure); errcode = 'CLFFT_MAP_FAILURE'
1756 case (clfft_build_program_failure); errcode = 'CLFFT_BUILD_PROGRAM_FAILURE'
1757 case (clfft_image_format_not_supported); errcode = 'CLFFT_IMAGE_FORMAT_NOT_SUPPORTED'
1758 case (clfft_image_format_mismatch); errcode = 'CLFFT_IMAGE_FORMAT_MISMATCH'
1759 case (clfft_mem_copy_overlap); errcode = 'CLFFT_MEM_COPY_OVERLAP'
1760 case (clfft_profiling_info_not_available); errcode = 'CLFFT_PROFILING_INFO_NOT_AVAILABLE'
1761 case (clfft_out_of_host_memory); errcode = 'CLFFT_OUT_OF_HOST_MEMORY'
1762 case (clfft_out_of_resources); errcode = 'CLFFT_OUT_OF_RESOURCES'
1763 case (clfft_mem_object_allocation_failure); errcode = 'CLFFT_MEM_OBJECT_ALLOCATION_FAILURE'
1764 case (clfft_compiler_not_available); errcode = 'CLFFT_COMPILER_NOT_AVAILABLE'
1765 case (clfft_device_not_available); errcode = 'CLFFT_DEVICE_NOT_AVAILABLE'
1766 case (clfft_device_not_found); errcode = 'CLFFT_DEVICE_NOT_FOUND'
1767 case (clfft_success); errcode = 'CLFFT_SUCCESS'
1768 case (clfft_bugcheck); errcode = 'CLFFT_BUGCHECK'
1769 case (clfft_notimplemented); errcode = 'CLFFT_NOTIMPLEMENTED'
1770 case (clfft_file_not_found); errcode = 'CLFFT_FILE_NOT_FOUND'
1771 case (clfft_file_create_failure); errcode = 'CLFFT_FILE_CREATE_FAILURE'
1772 case (clfft_version_mismatch); errcode = 'CLFFT_VERSION_MISMATCH'
1773 case (clfft_invalid_plan); errcode = 'CLFFT_INVALID_PLAN'
1774 case (clfft_device_no_double); errcode = 'CLFFT_DEVICE_NO_DOUBLE'
1775 case (clfft_endstatus); errcode = 'CLFFT_ENDSTATUS'
1776 case default
1777 write(errcode, '(i10)') ierr
1778 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1779 end select
1780#endif
1781
1782 message(1) = 'Error in calling clfft routine '//trim(name)//' : '//trim(errcode)
1783 call messages_fatal(1)
1784
1785 pop_sub(clfft_print_error)
1786 end subroutine clfft_print_error
1787
1788 ! ----------------------------------------------------
1789
1790#ifdef HAVE_OPENCL
1791 logical function f90_cl_device_has_extension(device, extension) result(has)
1792 type(cl_device_id), intent(inout) :: device
1793 character(len=*), intent(in) :: extension
1794
1795 integer :: cl_status
1796 character(len=2048) :: all_extensions
1797
1798#ifdef HAVE_OPENCL
1799 call clgetdeviceinfo(device, cl_device_extensions, all_extensions, cl_status)
1800#endif
1801
1802 has = index(all_extensions, extension) /= 0
1803
1804 end function f90_cl_device_has_extension
1805#endif
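! Usage sketch (OpenCL builds only; 'cl_khr_fp64' is the standard string for the
! double-precision extension):
!
!   if (.not. f90_cl_device_has_extension(accel%device%cl_device, 'cl_khr_fp64')) then
!     message(1) = 'Selected OpenCL device does not support double precision.'
!     call messages_fatal(1)
!   end if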
1806
1807 ! ---------------------------------------------------------
1808
1809 integer(int64) pure function opencl_pad(size, blk) result(pad)
1810 integer(int64), intent(in) :: size
1811 integer, intent(in) :: blk
1812
1813 integer(int64) :: mm
1814
1815 mm = mod(size, blk)
1816 if (mm == 0) then
1817 pad = size
1818 else
1819 pad = size + blk - mm
1820 end if
1821 end function opencl_pad
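! Worked example: opencl_pad rounds a size up to the next multiple of the block
! size, so opencl_pad(1000_int64, 32) = 1024 (the remainder is 8, hence 32 - 8 is
! added), while opencl_pad(1024_int64, 32) = 1024 is returned unchanged.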
1822
1823 ! ----------------------------------------------------
1824
1825 subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
1826 type(accel_mem_t), intent(inout) :: buffer
1827 type(type_t), intent(in) :: type
1828 integer(int64), intent(in) :: nval
1829 integer(int64), optional, intent(in) :: offset
1830 logical, optional, intent(in) :: async
1831
1832 integer :: bsize
1833 integer(int64) :: nval_real, offset_real
1834
1836
1837 if (nval > 0) then
1838
1839 nval_real = nval
1840 if (type == type_cmplx) nval_real = nval_real * 2
1841 if (present(offset)) then
1842 offset_real = offset
1843 if (type == type_cmplx) offset_real = offset_real * 2
1844 else
1845 offset_real = 0_int64
1846 end if
1847
1848 assert(nval_real > 0)
1849
1850 call accel_set_kernel_arg(set_zero, 0, nval_real)
1851 call accel_set_kernel_arg(set_zero, 1, offset_real)
1852 call accel_set_kernel_arg(set_zero, 2, buffer)
1853
1855
1856 call accel_kernel_run(set_zero, (/ opencl_pad(nval_real, bsize) /), (/ int(bsize, int64) /))
1857
1858 if(.not. optional_default(async, .false.)) call accel_finish()
1859
1860 end if
1861
1863 end subroutine accel_set_buffer_to_zero_i8
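! Minimal usage sketch (hypothetical buffer 'buf' holding at least 1000 complex
! numbers, created elsewhere with accel_create_buffer):
!
!   call accel_set_buffer_to_zero_i8(buf, type_cmplx, 1000_int64)
!
! For complex data both the count and the offset are doubled internally, since
! the set_zero kernel operates on the underlying real components.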
1864
1865 ! ----------------------------------------------------
1866
1867 subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
1868 type(accel_mem_t), intent(inout) :: buffer
1869 type(type_t), intent(in) :: type
1870 integer(int32), intent(in) :: nval
1871 integer(int32), optional, intent(in) :: offset
1872 logical, optional, intent(in) :: async
1873
1875
1876 if (present(offset)) then
1877 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), int(offset, int64), async=async)
1878 else
1879 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), async=async)
1880 end if
1881
1883 end subroutine accel_set_buffer_to_zero_i4
1884
1885 ! ----------------------------------------------------
1886
1887 subroutine opencl_check_bandwidth()
1888 integer :: itime
1889 integer, parameter :: times = 10
1890 integer :: size
1891 real(real64) :: time, stime
1892 real(real64) :: read_bw, write_bw
1893 type(accel_mem_t) :: buff
1894 real(real64), allocatable :: data(:)
1895
1896 call messages_new_line()
1897 call messages_write('Info: Benchmarking the bandwidth between main memory and device memory')
1898 call messages_new_line()
1899 call messages_info()
1900
1901 call messages_write(' Buffer size Write bw Read bw')
1902 call messages_new_line()
1903 call messages_write(' [MiB] [MiB/s] [MiB/s]')
1904 call messages_info()
1905
1906 size = 15000
1907 do
1908 safe_allocate(data(1:size))
1909 call accel_create_buffer(buff, accel_mem_read_write, type_float, size)
1910
1911 stime = loct_clock()
1912 do itime = 1, times
1913 call accel_write_buffer(buff, size, data)
1914 call accel_finish()
1915 end do
1916 time = (loct_clock() - stime)/real(times, real64)
1917
1918 write_bw = real(size, real64) *8.0_real64/time
1919
1920 stime = loct_clock()
1921 do itime = 1, times
1922 call accel_read_buffer(buff, size, data)
1923 end do
1924 call accel_finish()
1925
1926 time = (loct_clock() - stime)/real(times, real64)
1927 read_bw = real(size, real64) *8.0_real64/time
1928
1929 call messages_write(size*8.0_real64/1024.0_real64**2)
1930 call messages_write(write_bw/1024.0_real64**2, fmt = '(f10.1)')
1931 call messages_write(read_bw/1024.0_real64**2, fmt = '(f10.1)')
1932 call messages_info()
1933
1934 call accel_release_buffer(buff)
1935
1936 safe_deallocate_a(data)
1937
1938 size = int(size*2.0)
1939
1940 if (size > 50000000) exit
1941 end do
1942 end subroutine opencl_check_bandwidth
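! Note on the reported numbers: each element is a real64 (8 bytes), so one
! transfer moves size*8 bytes; for the initial size of 15000 that is 120000
! bytes, roughly 0.11 MiB, and both bandwidth columns are obtained by dividing
! bytes per second by 1024**2.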
1943
1944 ! ----------------------------------------------------
1946 logical pure function accel_use_shared_mem() result(use_shared_mem)
1947
1948 use_shared_mem = accel%shared_mem
1949
1950 end function accel_use_shared_mem
1951
1952 !------------------------------------------------------------
1953
1954 subroutine accel_kernel_global_init()
1955
1956 push_sub(accel_kernel_global_init)
1957
1958 nullify(head)
1959
1960 call cuda_module_map_init(accel%module_map)
1961
1962 pop_sub(accel_kernel_global_init)
1963 end subroutine accel_kernel_global_init
1964
1965 !------------------------------------------------------------
1966
1967 subroutine accel_kernel_global_end()
1968 type(accel_kernel_t), pointer :: next_head
1969
1970 push_sub(accel_kernel_global_end)
1971
1972 do
1973 if (.not. associated(head)) exit
1974 next_head => head%next
1975 call accel_kernel_end(head)
1976 head => next_head
1977 end do
1978
1979 if (accel_is_enabled()) then
1980 call cuda_module_map_end(accel%module_map)
1981 end if
1982
1983 pop_sub(accel_kernel_global_end)
1984 end subroutine accel_kernel_global_end
1985
1986 !------------------------------------------------------------
1987
1988 subroutine accel_kernel_build(this, file_name, kernel_name, flags)
1989 type(accel_kernel_t), intent(inout) :: this
1990 character(len=*), intent(in) :: file_name
1991 character(len=*), intent(in) :: kernel_name
1992 character(len=*), optional, intent(in) :: flags
1993
1994#ifdef HAVE_OPENCL
1995 type(cl_program) :: prog
1996#endif
1997#ifdef HAVE_CUDA
1998 character(len=1000) :: all_flags
1999#endif
2000
2001 push_sub(accel_kernel_build)
2002
2003 call profiling_in("ACCEL_COMPILE", exclude = .true.)
2004
2005#ifdef HAVE_CUDA
2006 all_flags = '-I'//trim(conf%share)//'/opencl/'//" "//trim(accel%debug_flag)
2007
2008 if (accel_use_shared_mem()) then
2009 all_flags = trim(all_flags)//' -DSHARED_MEM'
2010 end if
2011
2012 if (present(flags)) then
2013 all_flags = trim(all_flags)//' '//trim(flags)
2014 end if
2015
2016 call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, trim(file_name), trim(all_flags))
2017
2018 call cuda_create_kernel(this%cuda_kernel, this%cuda_module, trim(kernel_name))
2019 call cuda_alloc_arg_array(this%arguments)
2020
2021 this%cuda_shared_mem = 0
2022#endif
2023
2024#ifdef HAVE_OPENCL
2025 call opencl_build_program(prog, trim(conf%share)//'/opencl/'//trim(file_name), flags = flags)
2026 call opencl_create_kernel(this%kernel, prog, trim(kernel_name))
2027 call opencl_release_program(prog)
2028#endif
2029
2030 this%initialized = .true.
2031 this%kernel_name = trim(kernel_name)
2033 call profiling_out("ACCEL_COMPILE")
2034
2035 pop_sub(accel_kernel_build)
2036 end subroutine accel_kernel_build
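! Example of the assembled CUDA compilation flags (illustrative values only,
! assuming conf%share = '/usr/share/octopus', a debug flag of '-lineinfo',
! shared memory enabled, and flags = '-DBLOCK_SIZE=64'):
!
!   -I/usr/share/octopus/opencl/ -lineinfo -DSHARED_MEM -DBLOCK_SIZE=64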
2037
2038 !------------------------------------------------------------
2039
2040 subroutine accel_kernel_end(this)
2041 type(accel_kernel_t), intent(inout) :: this
2042#ifdef HAVE_OPENCL
2043 integer :: ierr
2044#endif
2046 push_sub(accel_kernel_end)
2047
2048#ifdef HAVE_CUDA
2049 call cuda_free_arg_array(this%arguments)
2050 call cuda_release_kernel(this%cuda_kernel)
2051 ! modules are not released here, since they are not tied to a single kernel
2052#endif
2053
2054#ifdef HAVE_OPENCL
2055 call clreleasekernel(this%kernel, ierr)
2056 if (ierr /= cl_success) call opencl_print_error(ierr, "release_kernel")
2057#endif
2058 this%initialized = .false.
2059
2060 pop_sub(accel_kernel_end)
2061 end subroutine accel_kernel_end
2062
2063 !------------------------------------------------------------
2064
2065 subroutine accel_kernel_start_call(this, file_name, kernel_name, flags)
2066 type(accel_kernel_t), target, intent(inout) :: this
2067 character(len=*), intent(in) :: file_name
2068 character(len=*), intent(in) :: kernel_name
2069 character(len=*), optional, intent(in) :: flags
2070
2071 push_sub(accel_kernel_start_call)
2072
2073 if (.not. this%initialized) then
2074 call accel_kernel_build(this, file_name, kernel_name, flags)
2075 this%next => head
2076 head => this
2077 end if
2078
2079 pop_sub(accel_kernel_start_call)
2080 end subroutine accel_kernel_start_call
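! Typical usage sketch (hypothetical names: 'kernel_example', 'example.cl' and
! 'example_kernel' are illustrative only):
!
!   type(accel_kernel_t), save :: kernel_example
!   ...
!   call accel_kernel_start_call(kernel_example, 'example.cl', 'example_kernel')
!
! The kernel is built on the first call only and is prepended to the global list
! rooted at 'head', so that accel_kernel_global_end can release it at shutdown.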
2081
2082 !--------------------------------------------------------------
2083
2084 integer(int64) pure function accel_global_memory_size() result(size)
2085
2086 size = accel%global_memory_size
2087
2088 end function accel_global_memory_size
2089
2090 !--------------------------------------------------------------
2091
2092 integer(int64) pure function accel_local_memory_size() result(size)
2093
2094 size = accel%local_memory_size
2095
2096 end function accel_local_memory_size
2097
2098 !--------------------------------------------------------------
2099
2100 integer pure function accel_max_size_per_dim(dim) result(size)
2101 integer, intent(in) :: dim
2102
2103 size = 0
2104#ifdef HAVE_OPENCL
2105 size = 32768 ! Arbitrarily cap the higher dimensions at 32768: 2**30 would yield
2106 ! zero when multiplied by 2048 and converted to a 4-byte integer.
2107 if (dim == 1) size = 2**30
2108#endif
2109#ifdef HAVE_CUDA
2110 size = 32768
2111 if (dim == 1) size = 2**30
2112#endif
2113 end function accel_max_size_per_dim
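! Why 32768 for the higher dimensions: a size of 2**30 multiplied by the 2048
! mentioned in the comment above gives 2**41, whose lower 32 bits are all zero,
! so converting that product to a 4-byte integer yields 0.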
2114
2115 ! ------------------------------------------------------
2116
2117 subroutine accel_set_stream(stream_number)
2118 integer, intent(in) :: stream_number
2119
2120 push_sub(accel_set_stream)
2121
2122 if (accel_is_enabled()) then
2123#ifdef HAVE_CUDA
2124 call cuda_set_stream(accel%cuda_stream, stream_number)
2125 call cublas_set_stream(accel%cublas_handle, accel%cuda_stream)
2126#endif
2127 end if
2128
2129 pop_sub(accel_set_stream)
2130 end subroutine accel_set_stream
2131
2132 ! ------------------------------------------------------
2133
2134 subroutine accel_get_stream(stream_number)
2135 integer, intent(inout) :: stream_number
2136
2137 push_sub(accel_get_stream)
2138
2139 if (accel_is_enabled()) then
2140#ifdef HAVE_CUDA
2141 call cuda_get_stream(stream_number)
2142#endif
2143 end if
2144
2145 pop_sub(accel_get_stream)
2146 end subroutine accel_get_stream
2147
2148 ! ------------------------------------------------------
2149
2150 subroutine accel_synchronize_all_streams()
2152
2153 if (accel_is_enabled()) then
2154#ifdef HAVE_CUDA
2155 call cuda_synchronize_all_streams()
2156#endif
2157 end if
2158
2160 end subroutine accel_synchronize_all_streams
2161
2162 function daccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
2163 type(c_ptr), intent(in) :: buffer
2164 integer(int64), intent(in) :: offset
2165 type(c_ptr) :: buffer_offset
2166
2168#ifdef HAVE_CUDA
2169 call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)
2170#else
2171 ! this is needed to make the compiler happy for non-GPU compilations
2172 buffer_offset = buffer
2173#endif
2175 end function daccel_get_pointer_with_offset
2176
2177 function zaccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
2178 type(c_ptr), intent(in) :: buffer
2179 integer(int64), intent(in) :: offset
2180 type(c_ptr) :: buffer_offset
2181
2183#ifdef HAVE_CUDA
2184 call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)
2185#else
2186 ! this is needed to make the compiler happy for non-GPU compilations
2187 buffer_offset = buffer
2188#endif
2190 end function zaccel_get_pointer_with_offset
2191
2192 subroutine accel_clean_pointer(buffer)
2193 type(c_ptr), intent(in) :: buffer
2194
2195 push_sub(accel_clean_pointer)
2196#ifdef HAVE_CUDA
2197 call cuda_clean_pointer(buffer)
2198#endif
2199 pop_sub(accel_clean_pointer)
2200 end subroutine accel_clean_pointer
2201
2203 !> Get unfolded size: some kernels (e.g. projectors) unfold the array across warps as an optimization.
2205 subroutine accel_get_unfolded_size(size, grid_size, thread_block_size)
2206 integer(int64), intent(in) :: size
2207 integer(int64), intent(out) :: grid_size
2208 integer(int64), intent(out) :: thread_block_size
2209
2210 push_sub(accel_get_unfolded_size)
2211#ifdef __HIP_PLATFORM_AMD__
2212 ! unfolding is not beneficial on AMD chips
2213 grid_size = size
2214 thread_block_size = size
2215#else
2216 grid_size = size * accel%warp_size
2217 thread_block_size = accel%warp_size
2218#endif
2219 pop_sub(accel_get_unfolded_size)
2220 end subroutine accel_get_unfolded_size
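! Worked example (assuming a CUDA device with warp_size = 32): for size = 1000
! the caller gets grid_size = 32000 and thread_block_size = 32, i.e. one warp
! per original element; on AMD (HIP) both values stay at 1000.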
2221
2222#include "undef.F90"
2223#include "real.F90"
2224#include "accel_inc.F90"
2225
2226#include "undef.F90"
2227#include "complex.F90"
2228#include "accel_inc.F90"
2229
2230#include "undef.F90"
2231#include "integer.F90"
2232#include "accel_inc.F90"
2233
2234#include "undef.F90"
2235#include "integer8.F90"
2236#include "accel_inc.F90"
2237
2238end module accel_oct_m
2239
2240!! Local Variables:
2241!! mode: f90
2242!! coding: utf-8
2243!! End: