Octopus
accel.F90
Go to the documentation of this file.
1!! Copyright (C) 2010-2016 X. Andrade
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17!!
18
19#include "global.h"
20
21#if defined(HAVE_OPENCL) && defined(HAVE_CUDA)
22#error "Cannot compile with OpenCL and Cuda support at the same time"
23#endif
24
25#if defined(HAVE_OPENCL) || defined(HAVE_CUDA)
26#define HAVE_ACCEL 1
27#endif
28
29module accel_oct_m
31#ifdef HAVE_OPENCL
32 use cl
33#endif
34#if defined(HAVE_CLBLAS) || defined(HAVE_CLBLAST)
35 use clblas_oct_m
36#endif
37 use cuda_oct_m
38#ifdef HAVE_CLFFT
39 use clfft
40#endif
41 use debug_oct_m
42 use global_oct_m
43 use iso_c_binding, only: c_size_t
44 use, intrinsic :: iso_fortran_env
45 use loct_oct_m
47 use mpi_oct_m
49 use types_oct_m
50 use parser_oct_m
53
54 implicit none
55
56 private
57
58 public :: &
63 accel_t, &
66 accel_init, &
67 accel_end, &
96
97#ifdef HAVE_OPENCL
98 integer, public, parameter :: &
99 ACCEL_MEM_READ_ONLY = cl_mem_read_only, &
100 accel_mem_read_write = cl_mem_read_write, &
101 accel_mem_write_only = cl_mem_write_only
102#else
103 integer, public, parameter :: &
104 ACCEL_MEM_READ_ONLY = 0, &
107#endif
108
109 type accel_context_t
110 ! Components are public by default
111#ifdef HAVE_OPENCL
112 type(cl_context) :: cl_context
113#elif defined(HAVE_CUDA)
114 type(c_ptr) :: cuda_context
115#else
116 integer :: dummy
117#endif
118 end type accel_context_t
119
120 type accel_device_t
121 ! Components are public by default
122#ifdef HAVE_OPENCL
123 type(cl_device_id) :: cl_device
124#elif defined(HAVE_CUDA)
125 type(c_ptr) :: cuda_device
126#else
127 integer :: dummy
128#endif
129 end type accel_device_t
130
131 type accel_t
132 ! Components are public by default
133 type(accel_context_t) :: context
134 type(accel_device_t) :: device
135#ifdef HAVE_OPENCL
136 type(cl_command_queue) :: command_queue
137#endif
138 type(c_ptr) :: cublas_handle
139 type(c_ptr) :: cuda_stream
140 type(c_ptr) :: module_map
141 integer :: max_workgroup_size
142 integer(int64) :: local_memory_size
143 integer(int64) :: global_memory_size
144 logical :: enabled
145 logical :: allow_CPU_only
146 logical :: shared_mem
147 logical :: cuda_mpi
148 integer :: warp_size
149 logical :: initialize_buffers
150 character(len=32) :: debug_flag
151 integer(int64) :: max_block_dim(3)
152 integer(int64) :: max_grid_dim(3)
153 end type accel_t
154
155 type accel_mem_t
156 ! Components are public by default
157#ifdef HAVE_OPENCL
158 type(cl_mem) :: mem
159#else
160 type(c_ptr) :: mem
161#endif
162 integer(c_size_t) :: size = 0
163 type(type_t) :: type
164 integer :: flags = 0
165 logical :: allocated = .false.
166 end type accel_mem_t
167
168 type accel_kernel_t
169 ! Components are public by default
170#ifdef HAVE_OPENCL
171 type(cl_kernel) :: kernel
172#endif
173#ifdef HAVE_CUDA
174 type(c_ptr) :: cuda_kernel
175 type(c_ptr) :: cuda_module
176 type(c_ptr) :: arguments
177#endif
178 integer(int64) :: cuda_shared_mem
179 logical :: initialized = .false.
180 type(accel_kernel_t), pointer :: next
181 integer :: arg_count
182 character(len=128) :: kernel_name
184
185 type(accel_t), public :: accel
186
187 ! the kernels
188 type(accel_kernel_t), public, target, save :: kernel_vpsi
189 type(accel_kernel_t), public, target, save :: kernel_vpsi_complex
190 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
191 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors_complex
192 type(accel_kernel_t), public, target, save :: kernel_daxpy
193 type(accel_kernel_t), public, target, save :: kernel_zaxpy
194 type(accel_kernel_t), public, target, save :: kernel_copy
195 type(accel_kernel_t), public, target, save :: kernel_copy_complex_to_real
196 type(accel_kernel_t), public, target, save :: kernel_copy_real_to_complex
197 type(accel_kernel_t), public, target, save :: dpack
198 type(accel_kernel_t), public, target, save :: zpack
199 type(accel_kernel_t), public, target, save :: dunpack
200 type(accel_kernel_t), public, target, save :: zunpack
201 type(accel_kernel_t), public, target, save :: kernel_ghost_reorder
202 type(accel_kernel_t), public, target, save :: kernel_density_real
203 type(accel_kernel_t), public, target, save :: kernel_density_complex
204 type(accel_kernel_t), public, target, save :: kernel_density_spinors
205 type(accel_kernel_t), public, target, save :: kernel_phase
206 type(accel_kernel_t), public, target, save :: kernel_phase_spiral
207 type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
208 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
209 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors
210 type(accel_kernel_t), public, target, save :: dkernel_batch_axpy
211 type(accel_kernel_t), public, target, save :: zkernel_batch_axpy
212 type(accel_kernel_t), public, target, save :: dkernel_ax_function_py
213 type(accel_kernel_t), public, target, save :: zkernel_ax_function_py
214 type(accel_kernel_t), public, target, save :: dkernel_batch_dotp
215 type(accel_kernel_t), public, target, save :: zkernel_batch_dotp
216 type(accel_kernel_t), public, target, save :: dzmul
217 type(accel_kernel_t), public, target, save :: zzmul
218 type(accel_kernel_t), public, target, save :: set_one
220 ! kernels used locally
221 type(accel_kernel_t), target, save :: set_zero
222 type(accel_kernel_t), target, save :: set_zero_int
226 end interface accel_padded_size
234 end interface accel_kernel_run
238 end interface accel_set_buffer_to_zero
239
254 end interface accel_write_buffer
255
265 end interface accel_read_buffer
268 module procedure &
293 integer, parameter :: &
300 integer, parameter :: &
304 cl_plat_ati = -4, &
305 cl_plat_intel = -5
307 ! a "convenience" public variable
308 integer, public :: cl_status
310 integer :: buffer_alloc_count
311 integer(int64) :: allocated_mem
312 type(accel_kernel_t), pointer :: head
313 type(alloc_cache_t) :: memcache
314
315contains
316
317 pure logical function accel_is_enabled() result(enabled)
318#ifdef HAVE_ACCEL
319 enabled = accel%enabled
320#else
321 enabled = .false.
322#endif
323 end function accel_is_enabled
324
325 ! ------------------------------------------
326
327 pure logical function accel_allow_cpu_only() result(allow)
328#ifdef HAVE_ACCEL
329 allow = accel%allow_CPU_only
330#else
331 allow = .true.
332#endif
333 end function accel_allow_cpu_only
334
335 ! ------------------------------------------
336
337 subroutine accel_init(base_grp, namespace)
338 type(mpi_grp_t), intent(inout) :: base_grp
339 type(namespace_t), intent(in) :: namespace
340
341 logical :: disable, default, run_benchmark
342 integer :: idevice, iplatform
343#ifdef HAVE_OPENCL
344 integer :: device_type
345 integer :: cl_status, idev
346 integer :: ndevices, ret_devices, nplatforms, iplat
347 character(len=256) :: device_name
348 type(cl_platform_id) :: platform_id
349 type(cl_program) :: prog
350 type(cl_platform_id), allocatable :: allplatforms(:)
351 type(cl_device_id), allocatable :: alldevices(:)
352#endif
353#ifdef HAVE_CUDA
354 integer :: dim
355#ifdef HAVE_MPI
356 character(len=256) :: sys_name
357#endif
358#endif
359
360 push_sub(accel_init)
361
362 buffer_alloc_count = 0
363
364 !%Variable DisableAccel
365 !%Type logical
366 !%Default yes
367 !%Section Execution::Accel
368 !%Description
369 !% If Octopus was compiled with OpenCL or CUDA support, it will
370 !% try to initialize and use an accelerator device. By setting this
371 !% variable to <tt>yes</tt> you force Octopus not to use an accelerator even it is available.
372 !%End
373 call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel')
374#ifdef HAVE_ACCEL
375 default = .false.
376#else
377 default = .true.
378#endif
379 call parse_variable(namespace, 'DisableAccel', default, disable)
380 accel%enabled = .not. disable
381
382#ifndef HAVE_ACCEL
383 if (accel%enabled) then
384 message(1) = 'Octopus was compiled without OpenCL or Cuda support.'
386 end if
387#endif
388
389 if (.not. accel_is_enabled()) then
390 pop_sub(accel_init)
391 return
392 end if
394 !%Variable AccelPlatform
395 !%Type integer
396 !%Default 0
397 !%Section Execution::Accel
398 !%Description
399 !% This variable selects the OpenCL platform that Octopus will
400 !% use. You can give an explicit platform number or use one of
401 !% the options that select a particular vendor
402 !% implementation. Platform 0 is used by default.
403 !%
404 !% This variable has no effect for CUDA.
405 !%Option amd -2
406 !% Use the AMD OpenCL platform.
407 !%Option nvidia -3
408 !% Use the Nvidia OpenCL platform.
409 !%Option ati -4
410 !% Use the ATI (old AMD) OpenCL platform.
411 !%Option intel -5
412 !% Use the Intel OpenCL platform.
413 !%End
414 call parse_variable(namespace, 'AccelPlatform', 0, iplatform)
415
416 call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform')
417
418 !%Variable AccelDevice
419 !%Type integer
420 !%Default gpu
421 !%Section Execution::Accel
422 !%Description
423 !% This variable selects the OpenCL or CUDA accelerator device
424 !% that Octopus will use. You can specify one of the options below
425 !% or a numerical id to select a specific device.
426 !%
427 !% Values >= 0 select the device to be used. In case of MPI enabled runs
428 !% devices are distributed in a round robin fashion, starting at this value.
429 !%Option gpu -1
430 !% If available, Octopus will use a GPU.
431 !%Option cpu -2
432 !% If available, Octopus will use a CPU (only for OpenCL).
433 !%Option accelerator -3
434 !% If available, Octopus will use an accelerator (only for OpenCL).
435 !%Option accel_default -4
436 !% Octopus will use the default device specified by the implementation.
437 !% implementation.
438 !%End
439 call parse_variable(namespace, 'AccelDevice', opencl_gpu, idevice)
440
441 call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice')
442
443 if (idevice < opencl_default) then
444 call messages_write('Invalid AccelDevice')
445 call messages_fatal()
446 end if
447
448 call messages_print_with_emphasis(msg="GPU acceleration", namespace=namespace)
449
450#ifdef HAVE_CUDA
451 if (idevice<0) idevice = 0
452 call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
453 idevice, base_grp%rank)
454#ifdef HAVE_MPI
455 call loct_sysname(sys_name)
456 write(message(1), '(A,I5,A,I5,2A)') "Rank ", base_grp%rank, " uses device number ", idevice, &
457 " on ", trim(sys_name)
458 call messages_info(1, all_nodes = .true.)
459#endif
460
461 ! no shared mem support in our cuda interface (for the moment)
462 accel%shared_mem = .true.
463
464 call cublas_init(accel%cublas_handle, accel%cuda_stream)
465#endif
466
467#ifdef HAVE_OPENCL
468 call profiling_in('CL_INIT')
469
470 call clgetplatformids(nplatforms, cl_status)
471 if (cl_status /= cl_success) call opencl_print_error(cl_status, "GetPlatformIDs")
472
473 safe_allocate(allplatforms(1:nplatforms))
474
475 call clgetplatformids(allplatforms, iplat, cl_status)
476 if (cl_status /= cl_success) call opencl_print_error(cl_status, "GetPlatformIDs")
477
478 call messages_write('Info: Available CL platforms: ')
479 call messages_write(nplatforms)
480 call messages_info()
481
482 do iplat = 1, nplatforms
483
484 call clgetplatforminfo(allplatforms(iplat), cl_platform_name, device_name, cl_status)
485
486 if (iplatform < 0) then
487 if (iplatform == get_platform_id(device_name)) iplatform = iplat - 1
488 end if
489
490 if (iplatform == iplat - 1) then
491 call messages_write(' * Platform ')
492 else
493 call messages_write(' Platform ')
494 end if
495
496 call messages_write(iplat - 1)
497 call messages_write(' : '//device_name)
498 call clgetplatforminfo(allplatforms(iplat), cl_platform_version, device_name, cl_status)
499 call messages_write(' ('//trim(device_name)//')')
500 call messages_info()
501 end do
502
503 call messages_info()
504
505 if (iplatform >= nplatforms .or. iplatform < 0) then
506 call messages_write('Requested CL platform does not exist')
507 if (iplatform > 0) then
508 call messages_write('(platform = ')
509 call messages_write(iplatform)
510 call messages_write(').')
511 end if
512 call messages_fatal()
513 end if
514
515 platform_id = allplatforms(iplatform + 1)
516
517 safe_deallocate_a(allplatforms)
518
519 call clgetdeviceids(platform_id, cl_device_type_all, ndevices, cl_status)
520
521 call messages_write('Info: Available CL devices: ')
522 call messages_write(ndevices)
523 call messages_info()
524
525 safe_allocate(alldevices(1:ndevices))
526
527 ! list all devices
528
529 call clgetdeviceids(platform_id, cl_device_type_all, alldevices, ret_devices, cl_status)
530
531 do idev = 1, ndevices
532 call messages_write(' Device ')
533 call messages_write(idev - 1)
534 call clgetdeviceinfo(alldevices(idev), cl_device_name, device_name, cl_status)
535 call messages_write(' : '//device_name)
536 call messages_info()
537 end do
538
539 select case (idevice)
540 case (opencl_gpu)
541 device_type = cl_device_type_gpu
542 case (opencl_cpu)
543 device_type = cl_device_type_cpu
544 case (opencl_accelerator)
545 device_type = cl_device_type_accelerator
546 case (opencl_default)
547 device_type = cl_device_type_default
548 case default
549 device_type = cl_device_type_all
550 end select
551
552 ! now get a list of the selected type
553 call clgetdeviceids(platform_id, device_type, alldevices, ret_devices, cl_status)
554
555 if (ret_devices < 1) then
556 ! we didnt find a device of the selected type, we ask for the default device
557 call clgetdeviceids(platform_id, cl_device_type_default, alldevices, ret_devices, cl_status)
558
559 if (ret_devices < 1) then
560 ! if this does not work, we ask for all devices
561 call clgetdeviceids(platform_id, cl_device_type_all, alldevices, ret_devices, cl_status)
562 end if
563
564 if (ret_devices < 1) then
565 call messages_write('Cannot find an OpenCL device')
566 call messages_fatal()
567 end if
568 end if
569
570 ! the number of devices can be smaller
571 ndevices = ret_devices
572
573 if (idevice < 0) then
574 if (base_grp%size > 1) then
575 ! with MPI we have to select the device so multiple GPUs in one
576 ! node are correctly distributed
577 call select_device(idevice)
578 else
579 idevice = 0
580 end if
581 end if
582
583 if (idevice >= ndevices) then
584 call messages_write('Requested CL device does not exist (device = ')
585 call messages_write(idevice)
586 call messages_write(', platform = ')
587 call messages_write(iplatform)
588 call messages_write(').')
589 call messages_fatal()
590 end if
591
592 accel%device%cl_device = alldevices(idevice + 1)
593
594 ! create the context
595 accel%context%cl_context = clcreatecontext(platform_id, accel%device%cl_device, cl_status)
596 if (cl_status /= cl_success) call opencl_print_error(cl_status, "CreateContext")
597
598 safe_deallocate_a(alldevices)
599
600 accel%command_queue = clcreatecommandqueue(accel%context%cl_context, accel%device%cl_device, &
601 cl_queue_profiling_enable, cl_status)
602 if (cl_status /= cl_success) call opencl_print_error(cl_status, "CreateCommandQueue")
603
604 call clgetdeviceinfo(accel%device%cl_device, cl_device_type, device_type, cl_status)
605
606 select case (device_type)
607 case (cl_device_type_gpu)
608 accel%shared_mem = .true.
609 case (cl_device_type_cpu, cl_device_type_accelerator)
610 accel%shared_mem = .false.
611 case default
612 accel%shared_mem = .false.
613 end select
614
615#ifdef HAVE_CLBLAS
616 call clblassetup(cl_status)
617 if (cl_status /= clblassuccess) call clblas_print_error(cl_status, 'clblasSetup')
618#endif
619
620#ifdef HAVE_CLFFT
621 call clfftsetup(cl_status)
622 if (cl_status /= clfft_success) call clfft_print_error(cl_status, 'clfftSetup')
623#endif
624
625 call profiling_out('CL_INIT')
626#endif
627
628 ! Get some device information that we will need later
629
630 ! total memory
631#ifdef HAVE_OPENCL
632 call clgetdeviceinfo(accel%device%cl_device, cl_device_global_mem_size, accel%global_memory_size, cl_status)
633 call clgetdeviceinfo(accel%device%cl_device, cl_device_local_mem_size, accel%local_memory_size, cl_status)
634 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_work_group_size, accel%max_workgroup_size, cl_status)
635 accel%warp_size = 1
636#endif
637#ifdef HAVE_CUDA
638 call cuda_device_total_memory(accel%device%cuda_device, accel%global_memory_size)
639 call cuda_device_shared_memory(accel%device%cuda_device, accel%local_memory_size)
640 call cuda_device_max_threads_per_block(accel%device%cuda_device, accel%max_workgroup_size)
641 call cuda_device_get_warpsize(accel%device%cuda_device, accel%warp_size)
642 call cuda_device_max_block_dim_x(accel%device%cuda_device, dim)
643 accel%max_block_dim(1) = int(dim, int64)
644 call cuda_device_max_block_dim_y(accel%device%cuda_device, dim)
645 accel%max_block_dim(2) = int(dim, int64)
646 call cuda_device_max_block_dim_z(accel%device%cuda_device, dim)
647 accel%max_block_dim(3) = int(dim, int64)
648 call cuda_device_max_grid_dim_x(accel%device%cuda_device, dim)
649 accel%max_grid_dim(1) = int(dim, int64)
650 call cuda_device_max_grid_dim_y(accel%device%cuda_device, dim)
651 accel%max_grid_dim(2) = int(dim, int64)
652 call cuda_device_max_grid_dim_z(accel%device%cuda_device, dim)
653 accel%max_grid_dim(3) = int(dim, int64)
654#endif
655
656 if (mpi_grp_is_root(base_grp)) call device_info()
657
658 ! initialize the cache used to speed up allocations
659 call alloc_cache_init(memcache, nint(0.25_real64*accel%global_memory_size, int64))
660
661 ! now initialize the kernels
663
664#if defined(HAVE_HIP)
665 accel%debug_flag = "-g"
666#elif defined(HAVE_CUDA)
667 accel%debug_flag = "-lineinfo"
668#elif defined(HAVE_OPENCL)
669 accel%debug_flag = "-g"
670#endif
671
672 call accel_kernel_start_call(set_zero, 'set_zero.cl', "set_zero")
673 call accel_kernel_start_call(set_zero_int, 'set_zero.cl', "set_zero_int")
674 call accel_kernel_start_call(set_one, 'set_one.cl', "set_one")
675 call accel_kernel_start_call(kernel_vpsi, 'vpsi.cl', "vpsi")
676 call accel_kernel_start_call(kernel_vpsi_complex, 'vpsi.cl', "vpsi_complex")
677 call accel_kernel_start_call(kernel_vpsi_spinors, 'vpsi.cl', "vpsi_spinors")
678 call accel_kernel_start_call(kernel_vpsi_spinors_complex, 'vpsi.cl', "vpsi_spinors_complex")
679 call accel_kernel_start_call(kernel_daxpy, 'axpy.cl', "daxpy", flags = '-DRTYPE_DOUBLE')
680 call accel_kernel_start_call(kernel_zaxpy, 'axpy.cl', "zaxpy", flags = '-DRTYPE_COMPLEX')
681 call accel_kernel_start_call(dkernel_batch_axpy, 'axpy.cl', "dbatch_axpy_function", &
682 flags = ' -DRTYPE_DOUBLE')
683 call accel_kernel_start_call(zkernel_batch_axpy, 'axpy.cl', "zbatch_axpy_function", &
684 flags = '-DRTYPE_COMPLEX')
685 call accel_kernel_start_call(dkernel_ax_function_py, 'axpy.cl', "dbatch_ax_function_py", &
686 flags = '-DRTYPE_DOUBLE')
687 call accel_kernel_start_call(zkernel_ax_function_py, 'axpy.cl', "zbatch_ax_function_py", &
688 flags = '-DRTYPE_COMPLEX')
689 call accel_kernel_start_call(dkernel_batch_dotp, 'mesh_batch_single.cl', "dbatch_mf_dotp")
690 call accel_kernel_start_call(zkernel_batch_dotp, 'mesh_batch_single.cl', "zbatch_mf_dotp")
691 call accel_kernel_start_call(dpack, 'pack.cl', "dpack")
692 call accel_kernel_start_call(zpack, 'pack.cl', "zpack")
693 call accel_kernel_start_call(dunpack, 'pack.cl', "dunpack")
694 call accel_kernel_start_call(zunpack, 'pack.cl', "zunpack")
695 call accel_kernel_start_call(kernel_copy, 'copy.cl', "copy")
696 call accel_kernel_start_call(kernel_copy_complex_to_real, 'copy.cl', "copy_complex_to_real")
697 call accel_kernel_start_call(kernel_copy_real_to_complex, 'copy.cl', "copy_real_to_complex")
698 call accel_kernel_start_call(kernel_ghost_reorder, 'ghost.cl', "ghost_reorder")
699 call accel_kernel_start_call(kernel_density_real, 'density.cl', "density_real")
700 call accel_kernel_start_call(kernel_density_complex, 'density.cl', "density_complex")
701 call accel_kernel_start_call(kernel_density_spinors, 'density.cl', "density_spinors")
702 call accel_kernel_start_call(kernel_phase, 'phase.cl', "phase")
703 call accel_kernel_start_call(dkernel_dot_matrix, 'mesh_batch.cl', "ddot_matrix")
704 call accel_kernel_start_call(zkernel_dot_matrix, 'mesh_batch.cl', "zdot_matrix")
705 call accel_kernel_start_call(zkernel_dot_matrix_spinors, 'mesh_batch.cl', "zdot_matrix_spinors")
706
707
708 call accel_kernel_start_call(dzmul, 'mul.cl', "dzmul", flags = '-DRTYPE_DOUBLE')
709 call accel_kernel_start_call(zzmul, 'mul.cl', "zzmul", flags = '-DRTYPE_COMPLEX')
710
711 !%Variable AccelBenchmark
712 !%Type logical
713 !%Default no
714 !%Section Execution::Accel
715 !%Description
716 !% If this variable is set to yes, Octopus will run some
717 !% routines to benchmark the performance of the accelerator device.
718 !%End
719 call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark)
720
721 call messages_obsolete_variable(namespace, 'OpenCLBenchmark', 'AccelBenchmark')
722
723 if (run_benchmark) then
725 end if
726
727 !%Variable GPUAwareMPI
728 !%Type logical
729 !%Section Execution::Accel
730 !%Description
731 !% If Octopus was compiled with GPU support and MPI support and if the MPI
732 !% implementation is GPU-aware (i.e., it supports communication using device pointers),
733 !% this switch can be set to true to use the GPU-aware MPI features. The advantage
734 !% of this approach is that it can do, e.g., peer-to-peer copies between devices without
735 !% going through the host memory.
736 !% The default is false, except when the configure switch --enable-cudampi is set, in which
737 !% case this variable is set to true.
738 !%End
739#ifdef HAVE_CUDA_MPI
740 default = .true.
741#else
742 default = .false.
743#endif
744 call parse_variable(namespace, 'GPUAwareMPI', default, accel%cuda_mpi)
745 if (accel%cuda_mpi) then
746#ifndef HAVE_CUDA_MPI
747 call messages_write("Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")
748 call messages_warning()
749#endif
750 call messages_write("Using GPU-aware MPI.")
751 call messages_info()
752 end if
753
754
755 !%Variable AllowCPUonly
756 !%Type logical
757 !%Section Execution::Accel
758 !%Description
759 !% In order to prevent waste of resources, the code will normally stop when the GPU is disabled due to
760 !% incomplete implementations or incompatibilities. AllowCPUonly = yes overrides this and allows the
761 !% code execution also in these cases.
762 !%End
763#if defined (HAVE_ACCEL)
764 default = .false.
765#else
766 default = .true.
767#endif
768 call parse_variable(namespace, 'AllowCPUonly', default, accel%allow_CPU_only)
769
770
771 !%Variable InitializeGPUBuffers
772 !%Type logical
773 !%Section Execution::Accel
774 !%Description
775 !% Initialize new GPU buffers to zero on creation (use only for debugging, as it has a performance impact!).
776 !%End
777 call parse_variable(namespace, 'InitializeGPUBuffers', .false., accel%initialize_buffers)
778
779
780 call messages_print_with_emphasis(namespace=namespace)
781
782 pop_sub(accel_init)
783
784 contains
785
786#if defined(HAVE_OPENCL)
787 subroutine select_device(idevice)
788 integer, intent(inout) :: idevice
789 integer :: irank
790 character(len=256) :: device_name
791
792 push_sub(accel_init.select_device)
793
794 idevice = mod(base_grp%rank, ndevices)
795
796 call base_grp%barrier()
797 call messages_write('Info: CL device distribution:')
798 call messages_info()
799 do irank = 0, base_grp%size - 1
800 if (irank == base_grp%rank) then
801 call clgetdeviceinfo(alldevices(idevice + 1), cl_device_name, device_name, cl_status)
802 call messages_write(' MPI node ')
803 call messages_write(base_grp%rank)
804 call messages_write(' -> CL device ')
805 call messages_write(idevice)
806 call messages_write(' : '//device_name)
807 call messages_info(all_nodes = .true.)
808 end if
809 call base_grp%barrier()
810 end do
811
813 end subroutine select_device
814#endif
815
816 subroutine device_info()
817#ifdef HAVE_OPENCL
818 integer(int64) :: val
819#endif
820#ifdef HAVE_CUDA
821 integer :: version
822#endif
823 integer :: major, minor
824 character(len=256) :: val_str
825
826 push_sub(accel_init.device_info)
827
828 call messages_new_line()
829 call messages_write('Selected device:')
830 call messages_new_line()
831
832#ifdef HAVE_OPENCL
833 call messages_write(' Framework : OpenCL')
834#endif
835#ifdef HAVE_CUDA
836#ifdef __HIP_PLATFORM_AMD__
837 call messages_write(' Framework : ROCm')
838#else
839 call messages_write(' Framework : CUDA')
840#endif
841#endif
842 call messages_info()
843
844#ifdef HAVE_CUDA
845 call messages_write(' Device type : GPU', new_line = .true.)
846#ifdef __HIP_PLATFORM_AMD__
847 call messages_write(' Device vendor : AMD Corporation', new_line = .true.)
848#else
849 call messages_write(' Device vendor : NVIDIA Corporation', new_line = .true.)
850#endif
851#endif
852
853#ifdef HAVE_OPENCL
854 call clgetdeviceinfo(accel%device%cl_device, cl_device_type, val, cl_status)
855 call messages_write(' Device type :')
856 select case (int(val, int32))
857 case (cl_device_type_gpu)
858 call messages_write(' GPU')
859 case (cl_device_type_cpu)
860 call messages_write(' CPU')
861 case (cl_device_type_accelerator)
862 call messages_write(' accelerator')
863 end select
864 call messages_new_line()
866 call clgetdeviceinfo(accel%device%cl_device, cl_device_vendor, val_str, cl_status)
867 call messages_write(' Device vendor : '//trim(val_str))
868 call messages_new_line()
869#endif
870
871#ifdef HAVE_OPENCL
872 call clgetdeviceinfo(accel%device%cl_device, cl_device_name, val_str, cl_status)
873#endif
874#ifdef HAVE_CUDA
875 call cuda_device_name(accel%device%cuda_device, val_str)
876#endif
877 call messages_write(' Device name : '//trim(val_str))
878 call messages_new_line()
879
880#ifdef HAVE_CUDA
881 call cuda_device_capability(accel%device%cuda_device, major, minor)
882#endif
883 call messages_write(' Cuda capabilities :')
884 call messages_write(major, fmt = '(i2)')
885 call messages_write('.')
886 call messages_write(minor, fmt = '(i1)')
887 call messages_new_line()
888
889 ! VERSION
890#ifdef HAVE_OPENCL
891 call clgetdeviceinfo(accel%device%cl_device, cl_driver_version, val_str, cl_status)
892 call messages_write(' Driver version : '//trim(val_str))
893#endif
894#ifdef HAVE_CUDA
895 call cuda_driver_version(version)
896 call messages_write(' Driver version : ')
897 call messages_write(version)
898#endif
899 call messages_new_line()
900
901
902#ifdef HAVE_OPENCL
903 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_compute_units, val, cl_status)
904 call messages_write(' Compute units :')
905 call messages_write(val)
906 call messages_new_line()
907
908 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_clock_frequency, val, cl_status)
909 call messages_write(' Clock frequency :')
910 call messages_write(val)
911 call messages_write(' GHz')
912 call messages_new_line()
913#endif
914
915 call messages_write(' Device memory :')
916 call messages_write(accel%global_memory_size, units=unit_megabytes)
917 call messages_new_line()
918
919 call messages_write(' Local/shared memory :')
920 call messages_write(accel%local_memory_size, units=unit_kilobytes)
921 call messages_new_line()
922
923
924#ifdef HAVE_OPENCL
925 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_mem_alloc_size, val, cl_status)
926 call messages_write(' Max alloc size :')
927 call messages_write(val, units = unit_megabytes)
928 call messages_new_line()
929
930 call clgetdeviceinfo(accel%device%cl_device, cl_device_global_mem_cache_size, val, cl_status)
931 call messages_write(' Device cache :')
932 call messages_write(val, units = unit_kilobytes)
933 call messages_new_line()
934
935 call clgetdeviceinfo(accel%device%cl_device, cl_device_max_constant_buffer_size, val, cl_status)
936 call messages_write(' Constant memory :')
937 call messages_write(val, units = unit_kilobytes)
938 call messages_new_line()
939#endif
940
941 call messages_write(' Max. group/block size :')
942 call messages_write(accel%max_workgroup_size)
943 call messages_new_line()
944
945
946#ifdef HAVE_OPENCL
947 call messages_write(' Extension cl_khr_fp64 :')
948 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64"))
949 call messages_new_line()
950
951 call messages_write(' Extension cl_amd_fp64 :')
952 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64"))
953 call messages_new_line()
954
955 call messages_write(' Extension cl_khr_int64_base_atomics :')
956 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_int64_base_atomics"))
957 call messages_new_line()
958
959#endif
960
961 call messages_info()
962
963
964 pop_sub(accel_init.device_info)
965 end subroutine device_info
966
967 end subroutine accel_init
968
969 ! ------------------------------------------
970#ifdef HAVE_OPENCL
971 integer function get_platform_id(platform_name) result(platform_id)
972 character(len=*), intent(in) :: platform_name
973
974 platform_id = cl_plat_invalid
975 if (index(platform_name, 'AMD') > 0) platform_id = cl_plat_amd
976 if (index(platform_name, 'ATI') > 0) platform_id = cl_plat_ati
977 if (index(platform_name, 'NVIDIA') > 0) platform_id = cl_plat_nvidia
978 if (index(platform_name, 'Intel') > 0) platform_id = cl_plat_intel
979 end function get_platform_id
980#endif
981 ! ------------------------------------------
982
983 subroutine accel_end(namespace)
984 type(namespace_t), intent(in) :: namespace
985
986#ifdef HAVE_OPENCL
987 integer :: ierr
988#endif
989 integer(int64) :: hits, misses
990 real(real64) :: volume_hits, volume_misses
991 logical :: found
992 type(accel_mem_t) :: tmp
993
994 push_sub(accel_end)
995
996 if (accel_is_enabled()) then
997
998 do
999 call alloc_cache_get(memcache, alloc_cache_any_size, found, tmp%mem)
1000 if (.not. found) exit
1001
1002#ifdef HAVE_OPENCL
1003 call clreleasememobject(tmp%mem, ierr)
1004 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseMemObject")
1005#endif
1006#ifdef HAVE_CUDA
1007 call cuda_mem_free(tmp%mem)
1008#endif
1009 end do
1010
1011 call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)
1012
1013 call messages_print_with_emphasis(msg="Acceleration-device allocation cache", namespace=namespace)
1014
1015 call messages_new_line()
1016 call messages_write(' Number of allocations =')
1017 call messages_write(hits + misses, new_line = .true.)
1018 call messages_write(' Volume of allocations =')
1019 call messages_write(volume_hits + volume_misses, fmt = 'f18.1', units = unit_gigabytes, align_left = .true., &
1020 new_line = .true.)
1021 call messages_write(' Hit ratio =')
1022 if (hits + misses > 0) then
1023 call messages_write(hits/real(hits + misses, real64)*100, fmt='(f6.1)', align_left = .true.)
1024 else
1025 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
1026 end if
1027 call messages_write('%', new_line = .true.)
1028 call messages_write(' Volume hit ratio =')
1029 if (volume_hits + volume_misses > 0) then
1030 call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt='(f6.1)', align_left = .true.)
1031 else
1032 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
1033 end if
1034 call messages_write('%')
1035 call messages_new_line()
1036 call messages_info()
1037
1038 call messages_print_with_emphasis(namespace=namespace)
1039 end if
1040
1042
1043#ifdef HAVE_CLBLAS
1044 call clblasteardown()
1045#endif
1046
1047#ifdef HAVE_CLFFT
1048 call clfftteardown()
1049#endif
1050
1051 if (accel_is_enabled()) then
1052#ifdef HAVE_CUDA
1053 call cublas_end(accel%cublas_handle)
1054 if (.not. accel%cuda_mpi) then ! CUDA aware MPI finalize will do the cleanup
1055 call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
1056 end if
1057#endif
1058
1059#ifdef HAVE_OPENCL
1060 call clreleasecommandqueue(accel%command_queue, ierr)
1062 if (ierr /= cl_success) call opencl_print_error(ierr, "ReleaseCommandQueue")
1063 call clreleasecontext(accel%context%cl_context, cl_status)
1064#endif
1065
1066 if (buffer_alloc_count /= 0) then
1067 call messages_write('Accel:')
1068 call messages_write(real(allocated_mem, real64) , fmt = 'f12.1', units = unit_megabytes, align_left = .true.)
1069 call messages_write(' in ')
1070 call messages_write(buffer_alloc_count)
1071 call messages_write(' buffers were not deallocated.')
1072 call messages_fatal()
1073 end if
1074
1075 end if
1076
1077 pop_sub(accel_end)
1078 end subroutine accel_end
1079
1080 ! ------------------------------------------
1081
1082 integer(int64) function accel_padded_size_i8(nn) result(psize)
1083 integer(int64), intent(in) :: nn
1084
1085 integer(int64) :: modnn, bsize
1086
1087 psize = nn
1088
1089 if (accel_is_enabled()) then
1090
1091 bsize = accel_max_workgroup_size()
1092
1093 psize = nn
1094 modnn = mod(nn, bsize)
1095 if (modnn /= 0) psize = psize + bsize - modnn
1096
1097 end if
1098
1099 end function accel_padded_size_i8
1100
1101 ! ------------------------------------------
1102
1103 integer(int32) function accel_padded_size_i4(nn) result(psize)
1104 integer(int32), intent(in) :: nn
1105
1106 psize = int(accel_padded_size_i8(int(nn, int64)), int32)
1107
1108 end function accel_padded_size_i4
1109
1110 ! ------------------------------------------
1111
1112 subroutine accel_create_buffer_4(this, flags, type, size, set_zero)
1113 type(accel_mem_t), intent(inout) :: this
1114 integer, intent(in) :: flags
1115 type(type_t), intent(in) :: type
1116 integer, intent(in) :: size
1117 logical, optional, intent(in) :: set_zero
1118
1119 call accel_create_buffer_8(this, flags, type, int(size, int64), set_zero)
1120 end subroutine accel_create_buffer_4
1121
1122 ! ------------------------------------------
1123
1124 subroutine accel_create_buffer_8(this, flags, type, size, set_zero)
1125 type(accel_mem_t), intent(inout) :: this
1126 integer, intent(in) :: flags
1127 type(type_t), intent(in) :: type
1128 integer(int64), intent(in) :: size
1129 logical, optional, intent(in) :: set_zero
1130
1131 integer(int64) :: fsize
1132 logical :: found
1133#ifdef HAVE_OPENCL
1134 integer :: ierr
1135#endif
1136
1137 push_sub(accel_create_buffer_8)
1138
1139 this%type = type
1140 this%size = size
1141 this%flags = flags
1142 fsize = int(size, int64)*types_get_size(type)
1143 this%allocated = .true.
1144
1145 if (fsize > 0) then
1146
1147 call alloc_cache_get(memcache, fsize, found, this%mem)
1148
1149 if (.not. found) then
1150#ifdef HAVE_OPENCL
1151 this%mem = clcreatebuffer(accel%context%cl_context, flags, fsize, ierr)
1152 if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateBuffer")
1153#endif
1154#ifdef HAVE_CUDA
1155 call cuda_mem_alloc(this%mem, fsize)
1156#endif
1157 end if
1158
1159 buffer_alloc_count = buffer_alloc_count + 1
1160 allocated_mem = allocated_mem + fsize
1161
1162 end if
1163
1164 if(optional_default(set_zero, accel%initialize_buffers)) then
1165 call accel_set_buffer_to_zero_i8(this, type, size)
1166 endif
1167
1168 pop_sub(accel_create_buffer_8)
1169 end subroutine accel_create_buffer_8
1170
1171 ! ------------------------------------------
1172
1173 subroutine accel_release_buffer(this)
1174 type(accel_mem_t), intent(inout) :: this
1175
1176#ifdef HAVE_OPENCL
1177 integer :: ierr
1178#endif
1179 logical :: put
1180 integer(int64) :: fsize
1182 push_sub(accel_release_buffer)
1183
1184 if (this%size > 0) then
1185
1186 fsize = int(this%size, int64)*types_get_size(this%type)
1187
1188 call alloc_cache_put(memcache, fsize, this%mem, put)
1189
1190 if (.not. put) then
1191#ifdef HAVE_OPENCL
1192 call clreleasememobject(this%mem, ierr)
1193 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseMemObject")
1194#endif
1195#ifdef HAVE_CUDA
1196 call cuda_mem_free(this%mem)
1197#endif
1198 end if
1199
1200 buffer_alloc_count = buffer_alloc_count - 1
1201 allocated_mem = allocated_mem + fsize
1203 end if
1204
1205 this%size = 0
1206 this%flags = 0
1207
1208 this%allocated = .false.
1209
1210 pop_sub(accel_release_buffer)
1211 end subroutine accel_release_buffer
1212
1213 ! ------------------------------------------
1214
1215 logical pure function accel_buffer_is_allocated(this) result(allocated)
1216 type(accel_mem_t), intent(in) :: this
1217
1218 allocated = this%allocated
1219 end function accel_buffer_is_allocated
1220
1221 ! -----------------------------------------
1222
1223 subroutine accel_finish()
1224#ifdef HAVE_OPENCL
1225 integer :: ierr
1226#endif
1227
1228 ! no push_sub, called too frequently
1229
1230 if (accel_is_enabled()) then
1231#ifdef HAVE_OPENCL
1232 call clfinish(accel%command_queue, ierr)
1233 if (ierr /= cl_success) call opencl_print_error(ierr, 'clFinish')
1234#endif
1235#ifdef HAVE_CUDA
1237#endif
1238 end if
1239 end subroutine accel_finish
1240
1241 ! ------------------------------------------
1242
1243 subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
1244 type(accel_kernel_t), intent(inout) :: kernel
1245 integer, intent(in) :: narg
1246 type(accel_mem_t), intent(in) :: buffer
1247
1248#ifdef HAVE_OPENCL
1249 integer :: ierr
1250#endif
1252 assert(accel_buffer_is_allocated(buffer))
1253
1254 ! no push_sub, called too frequently
1255#ifdef HAVE_OPENCL
1256 call clsetkernelarg(kernel%kernel, narg, buffer%mem, ierr)
1257 if (ierr /= cl_success) call opencl_print_error(ierr, "clSetKernelArg_buf")
1258#endif
1259
1260#ifdef HAVE_CUDA
1261 call cuda_kernel_set_arg_buffer(kernel%arguments, buffer%mem, narg)
1262#endif
1263
1264 end subroutine accel_set_kernel_arg_buffer
1265
1266 ! ------------------------------------------
1267
1268 subroutine accel_set_kernel_arg_local(kernel, narg, type, size)
1269 type(accel_kernel_t), intent(inout) :: kernel
1270 integer, intent(in) :: narg
1271 type(type_t), intent(in) :: type
1272 integer, intent(in) :: size
1273
1274#ifdef HAVE_OPENCL
1275 integer :: ierr
1276#endif
1277 integer(int64) :: size_in_bytes
1278
1280
1281
1282 size_in_bytes = int(size, int64)*types_get_size(type)
1283
1284 if (size_in_bytes > accel%local_memory_size) then
1285 write(message(1), '(a,f12.6,a)') "CL Error: requested local memory: ", real(size_in_bytes, real64) /1024.0, " Kb"
1286 write(message(2), '(a,f12.6,a)') " available local memory: ", real(accel%local_memory_size, real64) /1024.0, " Kb"
1287 call messages_fatal(2)
1288 else if (size_in_bytes <= 0) then
1289 write(message(1), '(a,i10)') "CL Error: invalid local memory size: ", size_in_bytes
1290 call messages_fatal(1)
1291 end if
1292
1293#ifdef HAVE_CUDA
1294 kernel%cuda_shared_mem = size_in_bytes
1295#endif
1296
1297#ifdef HAVE_OPENCL
1298 call clsetkernelarglocal(kernel%kernel, narg, size_in_bytes, ierr)
1299 if (ierr /= cl_success) call opencl_print_error(ierr, "set_kernel_arg_local")
1300#endif
1303 end subroutine accel_set_kernel_arg_local
1304
1305 ! ------------------------------------------
1306
1307 subroutine accel_kernel_run_8(kernel, globalsizes, localsizes)
1308 type(accel_kernel_t), intent(inout) :: kernel
1309 integer(int64), intent(in) :: globalsizes(:)
1310 integer(int64), intent(in) :: localsizes(:)
1311
1312 integer :: dim
1313#ifdef HAVE_OPENCL
1314 integer :: ierr
1315#endif
1316 integer(int64) :: gsizes(1:3)
1317 integer(int64) :: lsizes(1:3)
1318
1319 ! no push_sub, called too frequently
1320
1321 ! cuda needs all dimensions
1322 gsizes = 1
1323 lsizes = 1
1324
1325 dim = ubound(globalsizes, dim=1)
1326
1327 assert(dim == ubound(localsizes, dim=1))
1328
1329 ! if one size is zero, there is nothing to do
1330 if (any(globalsizes == 0)) return
1331
1332 assert(all(localsizes > 0))
1333 assert(all(localsizes <= accel_max_workgroup_size()))
1334 assert(all(mod(globalsizes, localsizes) == 0))
1335
1336 gsizes(1:dim) = globalsizes(1:dim)
1337 lsizes(1:dim) = localsizes(1:dim)
1338
1339#ifdef HAVE_OPENCL
1340 call clenqueuendrangekernel(accel%command_queue, kernel%kernel, gsizes(1:dim), lsizes(1:dim), ierr)
1341 if (ierr /= cl_success) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
1342#endif
1343
1344#ifdef HAVE_CUDA
1345 ! Maximum dimension of a block
1346 if (any(lsizes(1:3) > accel%max_block_dim(1:3))) then
1347 message(1) = "Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
1348 message(2) = "The following conditions should be fulfilled:"
1349 write(message(3), "(A, I8, A, I8)") "Dim 1: ", lsizes(1), " <= ", accel%max_block_dim(1)
1350 write(message(4), "(A, I8, A, I8)") "Dim 2: ", lsizes(2), " <= ", accel%max_block_dim(2)
1351 write(message(5), "(A, I8, A, I8)") "Dim 3: ", lsizes(3), " <= ", accel%max_block_dim(3)
1352 message(6) = "This is an internal error, please contact the developers."
1353 call messages_fatal(6)
1354 end if
1355
1356
1357 ! Maximum number of threads per block
1358 if (product(lsizes) > accel_max_workgroup_size()) then
1359 message(1) = "Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
1360 message(2) = "The following condition should be fulfilled:"
1361 write(message(3), "(I8, A, I8)") product(lsizes), " <= ", accel_max_workgroup_size()
1362 message(4) = "This is an internal error, please contact the developers."
1363 call messages_fatal(4)
1364 end if
1365
1366 gsizes(1:3) = gsizes(1:3)/lsizes(1:3)
1367
1368 ! Maximum dimensions of the grid of thread block
1369 if (any(gsizes(1:3) > accel%max_grid_dim(1:3))) then
1370 message(1) = "Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
1371 message(2) = "The following conditions should be fulfilled:"
1372 write(message(3), "(A, I8, A, I10)") "Dim 1: ", gsizes(1), " <= ", accel%max_grid_dim(1)
1373 write(message(4), "(A, I8, A, I10)") "Dim 2: ", gsizes(2), " <= ", accel%max_grid_dim(2)
1374 write(message(5), "(A, I8, A, I10)") "Dim 3: ", gsizes(3), " <= ", accel%max_grid_dim(3)
1375 message(6) = "This is an internal error, please contact the developers."
1376 call messages_fatal(6)
1377 end if
1378
1379 call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), lsizes(1), kernel%cuda_shared_mem, kernel%arguments)
1380
1381 kernel%cuda_shared_mem = 0
1382#endif
1383
1384 end subroutine accel_kernel_run_8
1386 ! -----------------------------------------------
1387
1388 subroutine accel_kernel_run_4(kernel, globalsizes, localsizes)
1389 type(accel_kernel_t), intent(inout) :: kernel
1390 integer, intent(in) :: globalsizes(:)
1391 integer, intent(in) :: localsizes(:)
1392
1393 call accel_kernel_run_8(kernel, int(globalsizes, int64), int(localsizes, int64))
1394
1395 end subroutine accel_kernel_run_4
1396
1397 ! -----------------------------------------------
1398
1399 integer pure function accel_max_workgroup_size() result(max_workgroup_size)
1400 max_workgroup_size = accel%max_workgroup_size
1401 end function accel_max_workgroup_size
1402
1403 ! -----------------------------------------------
1404
1405 integer function accel_kernel_workgroup_size(kernel) result(workgroup_size)
1406 type(accel_kernel_t), intent(inout) :: kernel
1407
1408#ifdef HAVE_OPENCL
1409 integer(int64) :: workgroup_size8
1410 integer :: ierr
1411#endif
1412#ifdef HAVE_CUDA
1413 integer :: max_workgroup_size
1414#endif
1415
1416 workgroup_size = 0
1417
1418#ifdef HAVE_OPENCL
1419 call clgetkernelworkgroupinfo(kernel%kernel, accel%device%cl_device, cl_kernel_work_group_size, workgroup_size8, ierr)
1420 if (ierr /= cl_success) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
1421 workgroup_size = workgroup_size8
1422#endif
1423
1424#ifdef HAVE_CUDA
1425 call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_workgroup_size)
1426 if (debug%info .and. max_workgroup_size /= accel%max_workgroup_size) then
1427 write(message(1), "(A, I5, A)") "A kernel can use only less threads per block (", workgroup_size, ")", &
1428 "than available on the device (", accel%max_workgroup_size, ")"
1429 call messages_info(1)
1430 end if
1431 ! recommended number of threads per block is 256 according to the CUDA best practice guide
1432 ! see https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#thread-and-block-heuristics
1433 workgroup_size = 256
1434 ! make sure we do not use more threads per block than available for this kernel
1435 workgroup_size = min(workgroup_size, max_workgroup_size)
1436#endif
1437
1438 end function accel_kernel_workgroup_size
1439
1440 ! -----------------------------------------------
1441
1442#ifdef HAVE_OPENCL
1443 subroutine opencl_build_program(prog, filename, flags)
1444 type(cl_program), intent(inout) :: prog
1445 character(len=*), intent(in) :: filename
1446 character(len=*), optional, intent(in) :: flags
1447
1448 character(len = 1000) :: string
1449 character(len = 256) :: share_string
1450 integer :: ierr, ierrlog, iunit, irec, newlen
1451
1452 push_sub(opencl_build_program)
1453
1454 string = '#include "'//trim(filename)//'"'
1455
1456 call messages_write("Building CL program '"//trim(filename)//"'.")
1457 call messages_info(debug_only=.true.)
1458
1459 prog = clcreateprogramwithsource(accel%context%cl_context, trim(string), ierr)
1460 if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateProgramWithSource")
1461
1462 ! build the compilation flags
1463 string='-w'
1464 ! full optimization
1465 string=trim(string)//' -cl-denorms-are-zero'
1466 ! The following flag gives an error with the Xeon Phi
1467 ! string=trim(string)//' -cl-strict-aliasing'
1468 string=trim(string)//' -cl-mad-enable'
1469 string=trim(string)//' -cl-unsafe-math-optimizations'
1470 string=trim(string)//' -cl-finite-math-only'
1471 string=trim(string)//' -cl-fast-relaxed-math'
1472
1473 share_string='-I'//trim(conf%share)//'/opencl/'
1474
1475 if (f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64")) then
1476 string = trim(string)//' -DEXT_KHR_FP64'
1477 else if (f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64")) then
1478 string = trim(string)//' -DEXT_AMD_FP64'
1479 else
1480 call messages_write('Octopus requires an OpenCL device with double-precision support.')
1481 call messages_fatal()
1482 end if
1484 if (accel_use_shared_mem()) then
1485 string = trim(string)//' -DSHARED_MEM'
1486 end if
1487
1488 if (present(flags)) then
1489 string = trim(string)//' '//trim(flags)
1490 end if
1491
1492 call messages_write("Debug info: compilation flags '"//trim(string), new_line = .true.)
1493 call messages_write(' '//trim(share_string)//"'.")
1494 call messages_info(debug_only=.true.)
1495
1496 string = trim(string)//' '//trim(share_string)
1497
1498 call clbuildprogram(prog, trim(string), ierr)
1499
1500 if(ierr /= cl_success) then
1501 call clgetprogrambuildinfo(prog, accel%device%cl_device, cl_program_build_log, string, ierrlog)
1502 if (ierrlog /= cl_success) call opencl_print_error(ierrlog, "clGetProgramBuildInfo")
1503
1504 ! CL_PROGRAM_BUILD_LOG seems to have a useless '\n' in it
1505 newlen = scan(string, achar(010), back = .true.) - 1
1506 if (newlen >= 0) string = string(1:newlen)
1507
1508 if (len(trim(string)) > 0) write(stderr, '(a)') trim(string)
1509
1510 call opencl_print_error(ierr, "clBuildProgram")
1511 end if
1512
1513 pop_sub(opencl_build_program)
1514 end subroutine opencl_build_program
1515#endif
1516
1517 ! -----------------------------------------------
1518#ifdef HAVE_OPENCL
1519 subroutine opencl_release_program(prog)
1520 type(cl_program), intent(inout) :: prog
1522 integer :: ierr
1523
1524 push_sub(opencl_release_program)
1525
1526 call clreleaseprogram(prog, ierr)
1527 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseProgram")
1528
1529 pop_sub(opencl_release_program)
1530 end subroutine opencl_release_program
1531#endif
1532
1533 ! -----------------------------------------------
1534
1535#ifdef HAVE_OPENCL
1536 subroutine opencl_release_kernel(prog)
1537 type(cl_kernel), intent(inout) :: prog
1538
1539 integer :: ierr
1540
1541 push_sub(opencl_release_kernel)
1542
1543#ifdef HAVE_OPENCL
1544 call clreleasekernel(prog, ierr)
1545 if (ierr /= cl_success) call opencl_print_error(ierr, "clReleaseKernel")
1546#endif
1547
1548 pop_sub(opencl_release_kernel)
1549 end subroutine opencl_release_kernel
1550#endif
1551
1552#ifdef HAVE_OPENCL
1553 ! -----------------------------------------------
1554 subroutine opencl_create_kernel(kernel, prog, name)
1555 type(cl_kernel), intent(inout) :: kernel
1556 type(cl_program), intent(inout) :: prog
1557 character(len=*), intent(in) :: name
1558
1559 integer :: ierr
1560
1561 push_sub(opencl_create_kernel)
1562 call profiling_in("CL_BUILD_KERNEL", exclude = .true.)
1563
1564#ifdef HAVE_OPENCL
1565 kernel = clcreatekernel(prog, name, ierr)
1566 if (ierr /= cl_success) call opencl_print_error(ierr, "clCreateKernel")
1567#endif
1568
1569 call profiling_out("CL_BUILD_KERNEL")
1570 pop_sub(opencl_create_kernel)
1571 end subroutine opencl_create_kernel
1572#endif
1573
1574 ! ------------------------------------------------
1575#ifdef HAVE_OPENCL
1576 subroutine opencl_print_error(ierr, name)
1577 integer, intent(in) :: ierr
1578 character(len=*), intent(in) :: name
1579
1580 character(len=40) :: errcode
1581
1582 push_sub(opencl_print_error)
1583
1584 select case (ierr)
1585 case (cl_success); errcode = 'CL_SUCCESS '
1586 case (cl_device_not_found); errcode = 'CL_DEVICE_NOT_FOUND '
1587 case (cl_device_not_available); errcode = 'CL_DEVICE_NOT_AVAILABLE '
1588 case (cl_compiler_not_available); errcode = 'CL_COMPILER_NOT_AVAILABLE '
1589 case (cl_mem_object_allocation_failure); errcode = 'CL_MEM_OBJECT_ALLOCATION_FAILURE '
1590 case (cl_out_of_resources); errcode = 'CL_OUT_OF_RESOURCES '
1591 case (cl_out_of_host_memory); errcode = 'CL_OUT_OF_HOST_MEMORY '
1592 case (cl_profiling_info_not_available); errcode = 'CL_PROFILING_INFO_NOT_AVAILABLE '
1593 case (cl_mem_copy_overlap); errcode = 'CL_MEM_COPY_OVERLAP '
1594 case (cl_image_format_mismatch); errcode = 'CL_IMAGE_FORMAT_MISMATCH '
1595 case (cl_image_format_not_supported); errcode = 'CL_IMAGE_FORMAT_NOT_SUPPORTED '
1596 case (cl_build_program_failure); errcode = 'CL_BUILD_PROGRAM_FAILURE '
1597 case (cl_map_failure); errcode = 'CL_MAP_FAILURE '
1598 case (cl_invalid_value); errcode = 'CL_INVALID_VALUE '
1599 case (cl_invalid_device_type); errcode = 'CL_INVALID_DEVICE_TYPE '
1600 case (cl_invalid_platform); errcode = 'CL_INVALID_PLATFORM '
1601 case (cl_invalid_device); errcode = 'CL_INVALID_DEVICE '
1602 case (cl_invalid_context); errcode = 'CL_INVALID_CONTEXT '
1603 case (cl_invalid_queue_properties); errcode = 'CL_INVALID_QUEUE_PROPERTIES '
1604 case (cl_invalid_command_queue); errcode = 'CL_INVALID_COMMAND_QUEUE '
1605 case (cl_invalid_host_ptr); errcode = 'CL_INVALID_HOST_PTR '
1606 case (cl_invalid_mem_object); errcode = 'CL_INVALID_MEM_OBJECT '
1607 case (cl_invalid_image_format_descriptor); errcode = 'CL_INVALID_IMAGE_FORMAT_DESCRIPTOR '
1608 case (cl_invalid_image_size); errcode = 'CL_INVALID_IMAGE_SIZE '
1609 case (cl_invalid_sampler); errcode = 'CL_INVALID_SAMPLER '
1610 case (cl_invalid_binary); errcode = 'CL_INVALID_BINARY '
1611 case (cl_invalid_build_options); errcode = 'CL_INVALID_BUILD_OPTIONS '
1612 case (cl_invalid_program); errcode = 'CL_INVALID_PROGRAM '
1613 case (cl_invalid_program_executable); errcode = 'CL_INVALID_PROGRAM_EXECUTABLE '
1614 case (cl_invalid_kernel_name); errcode = 'CL_INVALID_KERNEL_NAME '
1615 case (cl_invalid_kernel_definition); errcode = 'CL_INVALID_KERNEL_DEFINITION '
1616 case (cl_invalid_kernel); errcode = 'CL_INVALID_KERNEL '
1617 case (cl_invalid_arg_index); errcode = 'CL_INVALID_ARG_INDEX '
1618 case (cl_invalid_arg_value); errcode = 'CL_INVALID_ARG_VALUE '
1619 case (cl_invalid_arg_size); errcode = 'CL_INVALID_ARG_SIZE '
1620 case (cl_invalid_kernel_args); errcode = 'CL_INVALID_KERNEL_ARGS '
1621 case (cl_invalid_work_dimension); errcode = 'CL_INVALID_WORK_DIMENSION '
1622 case (cl_invalid_work_group_size); errcode = 'CL_INVALID_WORK_GROUP_SIZE '
1623 case (cl_invalid_work_item_size); errcode = 'CL_INVALID_WORK_ITEM_SIZE '
1624 case (cl_invalid_global_offset); errcode = 'CL_INVALID_GLOBAL_OFFSET '
1625 case (cl_invalid_event_wait_list); errcode = 'CL_INVALID_EVENT_WAIT_LIST '
1626 case (cl_invalid_event); errcode = 'CL_INVALID_EVENT '
1627 case (cl_invalid_operation); errcode = 'CL_INVALID_OPERATION '
1628 case (cl_invalid_gl_object); errcode = 'CL_INVALID_GL_OBJECT '
1629 case (cl_invalid_buffer_size); errcode = 'CL_INVALID_BUFFER_SIZE '
1630 case (cl_invalid_mip_level); errcode = 'CL_INVALID_MIP_LEVEL '
1631 case (cl_invalid_global_work_size); errcode = 'CL_INVALID_GLOBAL_WORK_SIZE '
1632 case (cl_platform_not_found_khr); errcode = 'CL_PLATFORM_NOT_FOUND_KHR'
1633 case default
1634 write(errcode, '(i10)') ierr
1635 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1636 end select
1637
1638 message(1) = 'OpenCL '//trim(name)//' '//trim(errcode)
1639 call messages_fatal(1)
1640
1641 pop_sub(opencl_print_error)
1642 end subroutine opencl_print_error
1643#endif
1644
1645 ! ----------------------------------------------------
1646
1647 subroutine clblas_print_error(ierr, name)
1648 integer, intent(in) :: ierr
1649 character(len=*), intent(in) :: name
1650
1651 character(len=40) :: errcode
1652
1653 push_sub(clblas_print_error)
1654#if defined(HAVE_CLBLAS) || defined(HAVE_CLBLAST)
1655 select case (ierr)
1656 case (clblassuccess); errcode = 'clblasSuccess'
1657 case (clblasinvalidvalue); errcode = 'clblasInvalidValue'
1658 case (clblasinvalidcommandqueue); errcode = 'clblasInvalidCommandQueue'
1659 case (clblasinvalidcontext); errcode = 'clblasInvalidContext'
1660 case (clblasinvalidmemobject); errcode = 'clblasInvalidMemObject'
1661 case (clblasinvaliddevice); errcode = 'clblasInvalidDevice'
1662 case (clblasinvalideventwaitlist); errcode = 'clblasInvalidEventWaitList'
1663 case (clblasoutofresources); errcode = 'clblasOutOfResources'
1664 case (clblasoutofhostmemory); errcode = 'clblasOutOfHostMemory'
1665 case (clblasinvalidoperation); errcode = 'clblasInvalidOperation'
1666 case (clblascompilernotavailable); errcode = 'clblasCompilerNotAvailable'
1667 case (clblasbuildprogramfailure); errcode = 'clblasBuildProgramFailure'
1668 case (clblasnotimplemented); errcode = 'clblasNotImplemented'
1669 case (clblasnotinitialized); errcode = 'clblasNotInitialized'
1670 case (clblasinvalidmata); errcode = 'clblasInvalidMatA'
1671 case (clblasinvalidmatb); errcode = 'clblasInvalidMatB'
1672 case (clblasinvalidmatc); errcode = 'clblasInvalidMatC'
1673 case (clblasinvalidvecx); errcode = 'clblasInvalidVecX'
1674 case (clblasinvalidvecy); errcode = 'clblasInvalidVecY'
1675 case (clblasinvaliddim); errcode = 'clblasInvalidDim'
1676 case (clblasinvalidleaddima); errcode = 'clblasInvalidLeadDimA'
1677 case (clblasinvalidleaddimb); errcode = 'clblasInvalidLeadDimB'
1678 case (clblasinvalidleaddimc); errcode = 'clblasInvalidLeadDimC'
1679 case (clblasinvalidincx); errcode = 'clblasInvalidIncX'
1680 case (clblasinvalidincy); errcode = 'clblasInvalidIncY'
1681 case (clblasinsufficientmemmata); errcode = 'clblasInsufficientMemMatA'
1682 case (clblasinsufficientmemmatb); errcode = 'clblasInsufficientMemMatB'
1683 case (clblasinsufficientmemmatc); errcode = 'clblasInsufficientMemMatC'
1684 case (clblasinsufficientmemvecx); errcode = 'clblasInsufficientMemVecX'
1685 case (clblasinsufficientmemvecy); errcode = 'clblasInsufficientMemVecY'
1686#ifdef HAVE_CLBLAST
1687 case (clblastinsufficientmemorytemp); errcode = 'clblastInsufficientMemoryTemp'
1688 case (clblastinvalidbatchcount); errcode = 'clblastInvalidBatchCount'
1689 case (clblastinvalidoverridekernel); errcode = 'clblastInvalidOverrideKernel'
1690 case (clblastmissingoverrideparameter); errcode = 'clblastMissingOverrideParameter'
1691 case (clblastinvalidlocalmemusage); errcode = 'clblastInvalidLocalMemUsage'
1692 case (clblastnohalfprecision); errcode = 'clblastNoHalfPrecision'
1693 case (clblastnodoubleprecision); errcode = 'clblastNoDoublePrecision'
1694 case (clblastinvalidvectorscalar); errcode = 'clblastInvalidVectorScalar'
1695 case (clblastinsufficientmemoryscalar); errcode = 'clblastInsufficientMemoryScalar'
1696 case (clblastdatabaseerror); errcode = 'clblastDatabaseError'
1697 case (clblastunknownerror); errcode = 'clblastUnknownError'
1698 case (clblastunexpectederror); errcode = 'clblastUnexpectedError'
1699#endif
1700
1701 case default
1702 write(errcode, '(i10)') ierr
1703 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1704 end select
1705#endif
1706
1707 message(1) = 'Error in calling clblas routine '//trim(name)//' : '//trim(errcode)
1708 call messages_fatal(1)
1709
1710 pop_sub(clblas_print_error)
1711 end subroutine clblas_print_error
1712
1713 ! ----------------------------------------------------
1714 subroutine clfft_print_error(ierr, name)
1715 integer, intent(in) :: ierr
1716 character(len=*), intent(in) :: name
1717
1718 character(len=40) :: errcode
1719
1720 push_sub(clfft_print_error)
1721#ifdef HAVE_CLFFT
1722 select case (ierr)
1723 case (clfft_invalid_global_work_size); errcode = 'CLFFT_INVALID_GLOBAL_WORK_SIZE'
1724 case (clfft_invalid_mip_level); errcode = 'CLFFT_INVALID_MIP_LEVEL'
1725 case (clfft_invalid_buffer_size); errcode = 'CLFFT_INVALID_BUFFER_SIZE'
1726 case (clfft_invalid_gl_object); errcode = 'CLFFT_INVALID_GL_OBJECT'
1727 case (clfft_invalid_operation); errcode = 'CLFFT_INVALID_OPERATION'
1728 case (clfft_invalid_event); errcode = 'CLFFT_INVALID_EVENT'
1729 case (clfft_invalid_event_wait_list); errcode = 'CLFFT_INVALID_EVENT_WAIT_LIST'
1730 case (clfft_invalid_global_offset); errcode = 'CLFFT_INVALID_GLOBAL_OFFSET'
1731 case (clfft_invalid_work_item_size); errcode = 'CLFFT_INVALID_WORK_ITEM_SIZE'
1732 case (clfft_invalid_work_group_size); errcode = 'CLFFT_INVALID_WORK_GROUP_SIZE'
1733 case (clfft_invalid_work_dimension); errcode = 'CLFFT_INVALID_WORK_DIMENSION'
1734 case (clfft_invalid_kernel_args); errcode = 'CLFFT_INVALID_KERNEL_ARGS'
1735 case (clfft_invalid_arg_size); errcode = 'CLFFT_INVALID_ARG_SIZE'
1736 case (clfft_invalid_arg_value); errcode = 'CLFFT_INVALID_ARG_VALUE'
1737 case (clfft_invalid_arg_index); errcode = 'CLFFT_INVALID_ARG_INDEX'
1738 case (clfft_invalid_kernel); errcode = 'CLFFT_INVALID_KERNEL'
1739 case (clfft_invalid_kernel_definition); errcode = 'CLFFT_INVALID_KERNEL_DEFINITION'
1740 case (clfft_invalid_kernel_name); errcode = 'CLFFT_INVALID_KERNEL_NAME'
1741 case (clfft_invalid_program_executable); errcode = 'CLFFT_INVALID_PROGRAM_EXECUTABLE'
1742 case (clfft_invalid_program); errcode = 'CLFFT_INVALID_PROGRAM'
1743 case (clfft_invalid_build_options); errcode = 'CLFFT_INVALID_BUILD_OPTIONS'
1744 case (clfft_invalid_binary); errcode = 'CLFFT_INVALID_BINARY'
1745 case (clfft_invalid_sampler); errcode = 'CLFFT_INVALID_SAMPLER'
1746 case (clfft_invalid_image_size); errcode = 'CLFFT_INVALID_IMAGE_SIZE'
1747 case (clfft_invalid_image_format_descriptor); errcode = 'CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR'
1748 case (clfft_invalid_mem_object); errcode = 'CLFFT_INVALID_MEM_OBJECT'
1749 case (clfft_invalid_host_ptr); errcode = 'CLFFT_INVALID_HOST_PTR'
1750 case (clfft_invalid_command_queue); errcode = 'CLFFT_INVALID_COMMAND_QUEUE'
1751 case (clfft_invalid_queue_properties); errcode = 'CLFFT_INVALID_QUEUE_PROPERTIES'
1752 case (clfft_invalid_context); errcode = 'CLFFT_INVALID_CONTEXT'
1753 case (clfft_invalid_device); errcode = 'CLFFT_INVALID_DEVICE'
1754 case (clfft_invalid_platform); errcode = 'CLFFT_INVALID_PLATFORM'
1755 case (clfft_invalid_device_type); errcode = 'CLFFT_INVALID_DEVICE_TYPE'
1756 case (clfft_invalid_value); errcode = 'CLFFT_INVALID_VALUE'
1757 case (clfft_map_failure); errcode = 'CLFFT_MAP_FAILURE'
1758 case (clfft_build_program_failure); errcode = 'CLFFT_BUILD_PROGRAM_FAILURE'
1759 case (clfft_image_format_not_supported); errcode = 'CLFFT_IMAGE_FORMAT_NOT_SUPPORTED'
1760 case (clfft_image_format_mismatch); errcode = 'CLFFT_IMAGE_FORMAT_MISMATCH'
1761 case (clfft_mem_copy_overlap); errcode = 'CLFFT_MEM_COPY_OVERLAP'
1762 case (clfft_profiling_info_not_available); errcode = 'CLFFT_PROFILING_INFO_NOT_AVAILABLE'
1763 case (clfft_out_of_host_memory); errcode = 'CLFFT_OUT_OF_HOST_MEMORY'
1764 case (clfft_out_of_resources); errcode = 'CLFFT_OUT_OF_RESOURCES'
1765 case (clfft_mem_object_allocation_failure); errcode = 'CLFFT_MEM_OBJECT_ALLOCATION_FAILURE'
1766 case (clfft_compiler_not_available); errcode = 'CLFFT_COMPILER_NOT_AVAILABLE'
1767 case (clfft_device_not_available); errcode = 'CLFFT_DEVICE_NOT_AVAILABLE'
1768 case (clfft_device_not_found); errcode = 'CLFFT_DEVICE_NOT_FOUND'
1769 case (clfft_success); errcode = 'CLFFT_SUCCESS'
1770 case (clfft_bugcheck); errcode = 'CLFFT_BUGCHECK'
1771 case (clfft_notimplemented); errcode = 'CLFFT_NOTIMPLEMENTED'
1772 case (clfft_file_not_found); errcode = 'CLFFT_FILE_NOT_FOUND'
1773 case (clfft_file_create_failure); errcode = 'CLFFT_FILE_CREATE_FAILURE'
1774 case (clfft_version_mismatch); errcode = 'CLFFT_VERSION_MISMATCH'
1775 case (clfft_invalid_plan); errcode = 'CLFFT_INVALID_PLAN'
1776 case (clfft_device_no_double); errcode = 'CLFFT_DEVICE_NO_DOUBLE'
1777 case (clfft_endstatus); errcode = 'CLFFT_ENDSTATUS'
1778 case default
1779 write(errcode, '(i10)') ierr
1780 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1781 end select
1782#endif
1783
1784 message(1) = 'clfft '//trim(name)//' '//trim(errcode)
1785 call messages_fatal(1)
1786
1787 pop_sub(clfft_print_error)
1788 end subroutine clfft_print_error
1789
1790 ! ----------------------------------------------------
1791
1792#ifdef HAVE_OPENCL
1793 logical function f90_cl_device_has_extension(device, extension) result(has)
1794 type(cl_device_id), intent(inout) :: device
1795 character(len=*), intent(in) :: extension
1796
1797 integer :: cl_status
1798 character(len=2048) :: all_extensions
1799
1800#ifdef HAVE_OPENCL
1801 call clgetdeviceinfo(device, cl_device_extensions, all_extensions, cl_status)
1802#endif
1803
1804 has = index(all_extensions, extension) /= 0
1805
1806 end function f90_cl_device_has_extension
1807#endif
1808
1809 ! ----------------------------------------------------
1810
1811 subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
1812 type(accel_mem_t), intent(inout) :: buffer
1813 type(type_t), intent(in) :: type
1814 integer(int64), intent(in) :: nval
1815 integer(int64), optional, intent(in) :: offset
1816 logical, optional, intent(in) :: async
1817
1818 integer :: bsize
1819 integer(int64) :: nval_real, offset_real
1820 type(accel_kernel_t), pointer :: kernel
1821
1822 if (nval == 0) return
1823
1825
1826 nval_real = nval
1827 if (type == type_cmplx) nval_real = nval_real * 2
1828 if (present(offset)) then
1829 offset_real = offset
1830 if (type == type_cmplx) offset_real = offset_real * 2
1831 else
1832 offset_real = 0_int64
1833 end if
1834
1835 assert(nval_real > 0)
1836
1837 if (type == type_integer) then
1838 kernel => set_zero_int
1839 else
1840 kernel => set_zero
1841 end if
1842 call accel_set_kernel_arg(kernel, 0, nval_real)
1843 call accel_set_kernel_arg(kernel, 1, offset_real)
1844 call accel_set_kernel_arg(kernel, 2, buffer)
1845
1846 bsize = accel_kernel_workgroup_size(kernel)
1847
1848 call accel_kernel_run(kernel, (/ nval_real /), (/ 1_int64 /))
1849
1850 if(.not. optional_default(async, .false.)) call accel_finish()
1851
1853 end subroutine accel_set_buffer_to_zero_i8
1854
1855 ! ----------------------------------------------------
1856
1857 subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
1858 type(accel_mem_t), intent(inout) :: buffer
1859 type(type_t), intent(in) :: type
1860 integer(int32), intent(in) :: nval
1861 integer(int32), optional, intent(in) :: offset
1862 logical, optional, intent(in) :: async
1863
1865
1866 if (present(offset)) then
1867 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), int(offset, int64), async=async)
1868 else
1869 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), async=async)
1870 end if
1873 end subroutine accel_set_buffer_to_zero_i4
1874
1875 ! ----------------------------------------------------
1876
1877 subroutine opencl_check_bandwidth()
1878 integer :: itime
1879 integer, parameter :: times = 10
1880 integer :: size
1881 real(real64) :: time, stime
1882 real(real64) :: read_bw, write_bw
1883 type(accel_mem_t) :: buff
1884 real(real64), allocatable :: data(:)
1885
1886 call messages_new_line()
1887 call messages_write('Info: Benchmarking the bandwidth between main memory and device memory')
1888 call messages_new_line()
1889 call messages_info()
1890
1891 call messages_write(' Buffer size Read bw Write bw')
1892 call messages_new_line()
1893 call messages_write(' [MiB] [MiB/s] [MiB/s]')
1894 call messages_info()
1895
1896 size = 15000
1897 do
1898 safe_allocate(data(1:size))
1899 call accel_create_buffer(buff, accel_mem_read_write, type_float, size)
1900
1901 stime = loct_clock()
1902 do itime = 1, times
1903 call accel_write_buffer(buff, size, data)
1904 call accel_finish()
1905 end do
1906 time = (loct_clock() - stime)/real(times, real64)
1907
1908 write_bw = real(size, real64) *8.0_real64/time
1909
1910 stime = loct_clock()
1911 do itime = 1, times
1912 call accel_read_buffer(buff, size, data)
1913 end do
1914 call accel_finish()
1915
1916 time = (loct_clock() - stime)/real(times, real64)
1917 read_bw = real(size, real64) *8.0_real64/time
1918
1919 call messages_write(size*8.0_real64/1024.0_real64**2)
1920 call messages_write(write_bw/1024.0_real64**2, fmt = '(f10.1)')
1921 call messages_write(read_bw/1024.0_real64**2, fmt = '(f10.1)')
1922 call messages_info()
1923
1924 call accel_release_buffer(buff)
1925
1926 safe_deallocate_a(data)
1927
1928 size = int(size*2.0)
1929
1930 if (size > 50000000) exit
1931 end do
1932 end subroutine opencl_check_bandwidth
1933
1934 ! ----------------------------------------------------
1936 logical pure function accel_use_shared_mem() result(use_shared_mem)
1937
1938 use_shared_mem = accel%shared_mem
1939
1940 end function accel_use_shared_mem
1941
1942 !------------------------------------------------------------
1943
1944 subroutine accel_kernel_global_init()
1945
1946 push_sub(accel_kernel_global_init)
1947
1948 nullify(head)
1949
1950 call cuda_module_map_init(accel%module_map)
1951
1953 end subroutine accel_kernel_global_init
1954
1955 !------------------------------------------------------------
1956
1957 subroutine accel_kernel_global_end()
1958 type(accel_kernel_t), pointer :: next_head
1959
1960 push_sub(accel_kernel_global_end)
1961
1962 do
1963 if (.not. associated(head)) exit
1964 next_head => head%next
1966 head => next_head
1967 end do
1968
1969 if (accel_is_enabled()) then
1970 call cuda_module_map_end(accel%module_map)
1971 end if
1972
1974 end subroutine accel_kernel_global_end
1975
1976 !------------------------------------------------------------
1977
1978 subroutine accel_kernel_build(this, file_name, kernel_name, flags)
1979 type(accel_kernel_t), intent(inout) :: this
1980 character(len=*), intent(in) :: file_name
1981 character(len=*), intent(in) :: kernel_name
1982 character(len=*), optional, intent(in) :: flags
1983
1984#ifdef HAVE_OPENCL
1985 type(cl_program) :: prog
1986#endif
1987#ifdef HAVE_CUDA
1988 character(len=1000) :: all_flags
1989#endif
1990
1991 push_sub(accel_kernel_build)
1992
1993 call profiling_in("ACCEL_COMPILE", exclude = .true.)
1994
1995#ifdef HAVE_CUDA
1996 all_flags = '-I'//trim(conf%share)//'/opencl/'//" "//trim(accel%debug_flag)
1997
1998 if (accel_use_shared_mem()) then
1999 all_flags = trim(all_flags)//' -DSHARED_MEM'
2000 end if
2001
2002 if (present(flags)) then
2003 all_flags = trim(all_flags)//' '//trim(flags)
2004 end if
2005
2006 call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, trim(file_name), trim(all_flags))
2007
2008 call cuda_create_kernel(this%cuda_kernel, this%cuda_module, trim(kernel_name))
2009 call cuda_alloc_arg_array(this%arguments)
2010
2011 this%cuda_shared_mem = 0
2012#endif
2013
2014#ifdef HAVE_OPENCL
2015 call opencl_build_program(prog, trim(conf%share)//'/opencl/'//trim(file_name), flags = flags)
2016 call opencl_create_kernel(this%kernel, prog, trim(kernel_name))
2017 call opencl_release_program(prog)
2018#endif
2019
2020 this%initialized = .true.
2021 this%kernel_name = trim(kernel_name)
2023 call profiling_out("ACCEL_COMPILE")
2024
2025 pop_sub(accel_kernel_build)
2026 end subroutine accel_kernel_build
2027
2028 !------------------------------------------------------------
2029
2030 subroutine accel_kernel_end(this)
2031 type(accel_kernel_t), intent(inout) :: this
2032#ifdef HAVE_OPENCL
2033 integer :: ierr
2034#endif
2036 push_sub(accel_kernel_end)
2037
2038#ifdef HAVE_CUDA
2039 call cuda_free_arg_array(this%arguments)
2040 call cuda_release_kernel(this%cuda_kernel)
2041 ! modules are not released here, since they are not associated to a kernel
2042#endif
2043
2044#ifdef HAVE_OPENCL
2045 call clreleasekernel(this%kernel, ierr)
2046 if (ierr /= cl_success) call opencl_print_error(ierr, "release_kernel")
2047#endif
2048 this%initialized = .false.
2049
2050 pop_sub(accel_kernel_end)
2051 end subroutine accel_kernel_end
2052
2053 !------------------------------------------------------------
2054
2055 subroutine accel_kernel_start_call(this, file_name, kernel_name, flags)
2056 type(accel_kernel_t), target, intent(inout) :: this
2057 character(len=*), intent(in) :: file_name
2058 character(len=*), intent(in) :: kernel_name
2059 character(len=*), optional, intent(in) :: flags
2060
2061 push_sub(accel_kernel_start_call)
2062
2063 if (.not. this%initialized) then
2064 call accel_kernel_build(this, file_name, kernel_name, flags)
2065 this%next => head
2066 head => this
2067 end if
2068
2070 end subroutine accel_kernel_start_call
2071
2072 !--------------------------------------------------------------
2073
2074 integer(int64) pure function accel_global_memory_size() result(size)
2075
2076 size = accel%global_memory_size
2077
2078 end function accel_global_memory_size
2079
2080 !--------------------------------------------------------------
2081
2082 integer(int64) pure function accel_local_memory_size() result(size)
2083
2084 size = accel%local_memory_size
2085
2086 end function accel_local_memory_size
2087
2088 !--------------------------------------------------------------
2089
2090 integer pure function accel_max_size_per_dim(dim) result(size)
2091 integer, intent(in) :: dim
2092
2093 size = 0
2094#ifdef HAVE_OPENCL
2095 size = 32768 ! Setting here arbitrarily higher dimensions to 32768, as 2**30 leads to a
2096 ! value of zero when multiplied by 2048 and converted to integer 4.
2097 if (dim == 1) size = 2**30
2098#endif
2099#ifdef HAVE_CUDA
2100 size = 32768
2101 if (dim == 1) size = 2**30
2102#endif
2103 end function accel_max_size_per_dim
2104
2105 ! ------------------------------------------------------
2106
2107 subroutine accel_set_stream(stream_number)
2108 integer, intent(in) :: stream_number
2109
2110 push_sub(accel_set_stream)
2111
2112 if (accel_is_enabled()) then
2113#ifdef HAVE_CUDA
2114 call cuda_set_stream(accel%cuda_stream, stream_number)
2115 call cublas_set_stream(accel%cublas_handle, accel%cuda_stream)
2116#endif
2117 end if
2118
2119 pop_sub(accel_set_stream)
2120 end subroutine accel_set_stream
2121
2122 ! ------------------------------------------------------
2123
2124 subroutine accel_get_stream(stream_number)
2125 integer, intent(inout) :: stream_number
2126
2127 push_sub(accel_get_stream)
2128
2129 if (accel_is_enabled()) then
2130#ifdef HAVE_CUDA
2131 call cuda_get_stream(stream_number)
2132#endif
2133 end if
2134
2135 pop_sub(accel_get_stream)
2136 end subroutine accel_get_stream
2137
2138 ! ------------------------------------------------------
2139
2142
2143 if (accel_is_enabled()) then
2144#ifdef HAVE_CUDA
2145 call cuda_synchronize_all_streams()
2146#endif
2147 end if
2148
2150 end subroutine accel_synchronize_all_streams
2151
2152 function daccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
2153 type(c_ptr), intent(in) :: buffer
2154 integer(int64), intent(in) :: offset
2155 type(c_ptr) :: buffer_offset
2156
2158#ifdef HAVE_CUDA
2159 call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)
2160#else
2161 ! this is needed to make the compiler happy for non-GPU compilations
2162 buffer_offset = buffer
2163#endif
2166
2167 function zaccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
2168 type(c_ptr), intent(in) :: buffer
2169 integer(int64), intent(in) :: offset
2170 type(c_ptr) :: buffer_offset
2171
2173#ifdef HAVE_CUDA
2174 call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)
2175#else
2176 ! this is needed to make the compiler happy for non-GPU compilations
2177 buffer_offset = buffer
2178#endif
2181
2182 subroutine accel_clean_pointer(buffer)
2183 type(c_ptr), intent(in) :: buffer
2184
2186#ifdef HAVE_CUDA
2187 call cuda_clean_pointer(buffer)
2188#endif
2189 pop_sub(accel_clean_pointer)
2190 end subroutine accel_clean_pointer
2191
2195 subroutine accel_get_unfolded_size(size, grid_size, thread_block_size)
2196 integer(int64), intent(in) :: size
2197 integer(int64), intent(out) :: grid_size
2198 integer(int64), intent(out) :: thread_block_size
2199
2200 push_sub(accel_get_unfolded_size)
2201#ifdef __HIP_PLATFORM_AMD__
2202 ! not benefitial for AMD chips
2203 grid_size = size
2204 thread_block_size = size
2205#else
2206 grid_size = size * accel%warp_size
2207 thread_block_size = accel%warp_size
2208#endif
2210 end subroutine accel_get_unfolded_size
2211
2212#include "undef.F90"
2213#include "real.F90"
2214#include "accel_inc.F90"
2215
2216#include "undef.F90"
2217#include "complex.F90"
2218#include "accel_inc.F90"
2219
2220#include "undef.F90"
2221#include "integer.F90"
2222#include "accel_inc.F90"
2223
2224#include "undef.F90"
2225#include "integer8.F90"
2226#include "accel_inc.F90"
2227
2228end module accel_oct_m
2229
2230!! Local Variables:
2231!! mode: f90
2232!! coding: utf-8
2233!! End:
subroutine select_device(idevice)
Definition: accel.F90:866
subroutine device_info()
Definition: accel.F90:895
subroutine laccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4684
integer, parameter opencl_accelerator
Definition: accel.F90:378
type(accel_kernel_t), target, save, public kernel_density_real
Definition: accel.F90:287
subroutine zaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3451
integer, parameter opencl_default
Definition: accel.F90:378
type(accel_kernel_t), target, save, public kernel_vpsi_complex
Definition: accel.F90:274
type(accel_kernel_t), target, save, public dkernel_batch_axpy
Definition: accel.F90:295
subroutine, public accel_clean_pointer(buffer)
Definition: accel.F90:2261
subroutine accel_kernel_global_end()
Definition: accel.F90:2036
subroutine, public accel_get_unfolded_size(size, grid_size, thread_block_size)
Get unfolded size: some kernels (e.g. projectors) unfold the array across warps as an optimization....
Definition: accel.F90:2274
pure logical function, public accel_allow_cpu_only()
Definition: accel.F90:413
subroutine iaccel_read_buffer_3(this, size, data, offset, async)
Definition: accel.F90:3843
subroutine daccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2755
subroutine accel_create_buffer_8(this, flags, type, size, set_zero)
Definition: accel.F90:1203
subroutine laccel_read_buffer_2(this, size, data, offset, async)
Definition: accel.F90:4429
logical pure function, public accel_use_shared_mem()
Definition: accel.F90:2015
subroutine laccel_read_buffer_3_int32(this, size, data, offset, async)
Definition: accel.F90:4527
subroutine zaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3360
type(accel_kernel_t), target, save, public zkernel_dot_matrix
Definition: accel.F90:293
subroutine daccel_write_buffer_0_int32(this, size, data, offset, async)
Definition: accel.F90:2475
type(accel_kernel_t), target, save, public zpack
Definition: accel.F90:283
type(accel_kernel_t), target, save, public dkernel_batch_dotp
Definition: accel.F90:299
subroutine laccel_write_buffer_2(this, size, data, offset, async)
Definition: accel.F90:4250
subroutine zaccel_write_buffer_3_int32(this, size, data, offset, async)
Definition: accel.F90:3137
type(accel_kernel_t), target, save, public kernel_vpsi_spinors
Definition: accel.F90:275
subroutine laccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4571
subroutine daccel_write_buffer_0(this, size, data, offset, async)
Definition: accel.F90:2372
subroutine zaccel_write_buffer_single(this, data, async)
Definition: accel.F90:2964
type(accel_kernel_t), target, save, public kernel_ghost_reorder
Definition: accel.F90:286
subroutine iaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4078
subroutine laccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4595
type(accel_kernel_t), target, save, public zkernel_batch_axpy
Definition: accel.F90:296
integer, parameter cl_plat_nvidia
Definition: accel.F90:385
subroutine, public accel_kernel_start_call(this, file_name, kernel_name, flags)
Definition: accel.F90:2134
subroutine iaccel_write_buffer_2_int32(this, size, data, offset, async)
Definition: accel.F90:3723
subroutine daccel_read_buffer_3(this, size, data, offset, async)
Definition: accel.F90:2633
integer, parameter cl_plat_ati
Definition: accel.F90:385
subroutine, public accel_get_stream(stream_number)
Definition: accel.F90:2203
integer(int64) pure function, public accel_global_memory_size()
Definition: accel.F90:2153
subroutine daccel_read_buffer_1_int32(this, size, data, offset, async)
Definition: accel.F90:2673
subroutine iaccel_read_buffer_1(this, size, data, offset, async)
Definition: accel.F90:3804
subroutine daccel_write_buffer_3_int32(this, size, data, offset, async)
Definition: accel.F90:2532
subroutine zaccel_write_buffer_2(this, size, data, offset, async)
Definition: accel.F90:3039
subroutine zaccel_write_buffer_1_int32(this, size, data, offset, async)
Definition: accel.F90:3099
type(accel_kernel_t), target, save, public zkernel_dot_matrix_spinors
Definition: accel.F90:294
type(accel_kernel_t), target, save, public zkernel_ax_function_py
Definition: accel.F90:298
subroutine zaccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:3335
subroutine daccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2868
subroutine laccel_read_buffer_0_int32(this, size, data, offset, async)
Definition: accel.F90:4470
subroutine accel_set_kernel_arg_local(kernel, narg, type, size)
Definition: accel.F90:1347
subroutine daccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2846
subroutine zaccel_write_buffer_0_int32(this, size, data, offset, async)
Definition: accel.F90:3080
integer(int64) function accel_padded_size_i8(nn)
Definition: accel.F90:1161
subroutine iaccel_read_buffer_2_int32(this, size, data, offset, async)
Definition: accel.F90:3902
type(accel_kernel_t), target, save set_zero_int
Definition: accel.F90:307
subroutine iaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4033
subroutine, public accel_finish()
Definition: accel.F90:1302
subroutine opencl_check_bandwidth()
Definition: accel.F90:1956
subroutine accel_kernel_global_init()
Definition: accel.F90:2023
subroutine daccel_write_buffer_1(this, size, data, offset, async)
Definition: accel.F90:2415
type(accel_kernel_t), target, save, public set_one
Definition: accel.F90:303
type(accel_kernel_t), target, save, public kernel_daxpy
Definition: accel.F90:277
subroutine laccel_read_buffer_0(this, size, data, offset, async)
Definition: accel.F90:4367
subroutine opencl_release_program(prog)
Definition: accel.F90:1598
type(accel_kernel_t), target, save set_zero
Definition: accel.F90:306
subroutine zaccel_read_buffer_3(this, size, data, offset, async)
Definition: accel.F90:3238
subroutine iaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3965
subroutine laccel_write_buffer_3(this, size, data, offset, async)
Definition: accel.F90:4270
integer, parameter cl_plat_invalid
Definition: accel.F90:385
type(accel_kernel_t), target, save, public kernel_density_spinors
Definition: accel.F90:289
subroutine laccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:4546
subroutine daccel_read_buffer_2_int32(this, size, data, offset, async)
Definition: accel.F90:2692
subroutine laccel_write_buffer_single(this, data, async)
Definition: accel.F90:4175
subroutine daccel_write_buffer_2(this, size, data, offset, async)
Definition: accel.F90:2434
subroutine daccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2779
subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
Definition: accel.F90:1890
subroutine zaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3428
logical pure function, public accel_buffer_is_allocated(this)
Definition: accel.F90:1294
integer, parameter, public accel_mem_read_write
Definition: accel.F90:183
subroutine zaccel_read_buffer_0_int32(this, size, data, offset, async)
Definition: accel.F90:3259
subroutine, public clfft_print_error(ierr, name)
Definition: accel.F90:1793
subroutine accel_kernel_end(this)
Definition: accel.F90:2109
subroutine iaccel_write_buffer_0_int32(this, size, data, offset, async)
Definition: accel.F90:3685
integer, parameter opencl_gpu
Definition: accel.F90:378
type(accel_kernel_t), target, save, public dkernel_ax_function_py
Definition: accel.F90:297
subroutine iaccel_read_buffer_2(this, size, data, offset, async)
Definition: accel.F90:3823
subroutine opencl_release_kernel(prog)
Definition: accel.F90:1615
subroutine zaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3384
subroutine zaccel_write_buffer_0(this, size, data, offset, async)
Definition: accel.F90:2977
subroutine zaccel_read_buffer_1_int32(this, size, data, offset, async)
Definition: accel.F90:3278
type(c_ptr) function, public daccel_get_pointer_with_offset(buffer, offset)
Definition: accel.F90:2231
subroutine iaccel_write_buffer_single(this, data, async)
Definition: accel.F90:3569
subroutine iaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3989
integer pure function, public accel_max_size_per_dim(dim)
Definition: accel.F90:2169
type(accel_kernel_t), target, save, public dpack
Definition: accel.F90:282
subroutine iaccel_write_buffer_3_int32(this, size, data, offset, async)
Definition: accel.F90:3742
subroutine laccel_write_buffer_1(this, size, data, offset, async)
Definition: accel.F90:4231
subroutine daccel_read_buffer_1(this, size, data, offset, async)
Definition: accel.F90:2594
subroutine, public accel_release_buffer(this)
Definition: accel.F90:1252
subroutine laccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4662
subroutine iaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4011
subroutine zaccel_write_buffer_1(this, size, data, offset, async)
Definition: accel.F90:3020
subroutine daccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:2730
subroutine zaccel_read_buffer_2_int32(this, size, data, offset, async)
Definition: accel.F90:3297
subroutine accel_kernel_run_8(kernel, globalsizes, localsizes)
Definition: accel.F90:1386
subroutine laccel_write_buffer_1_int32(this, size, data, offset, async)
Definition: accel.F90:4310
subroutine laccel_read_buffer_3(this, size, data, offset, async)
Definition: accel.F90:4449
type(accel_kernel_t), target, save, public kernel_vpsi_spinors_complex
Definition: accel.F90:276
subroutine opencl_build_program(prog, filename, flags)
Definition: accel.F90:1522
subroutine laccel_write_buffer_2_int32(this, size, data, offset, async)
Definition: accel.F90:4329
subroutine, public accel_kernel_build(this, file_name, kernel_name, flags)
Definition: accel.F90:2057
subroutine iaccel_write_buffer_2(this, size, data, offset, async)
Definition: accel.F90:3644
subroutine, public accel_init(base_grp, namespace)
Definition: accel.F90:423
subroutine, public accel_end(namespace)
Definition: accel.F90:1062
subroutine iaccel_read_buffer_0(this, size, data, offset, async)
Definition: accel.F90:3761
subroutine opencl_create_kernel(kernel, prog, name)
Definition: accel.F90:1633
subroutine, public accel_synchronize_all_streams()
Definition: accel.F90:2219
subroutine iaccel_read_buffer_1_int32(this, size, data, offset, async)
Definition: accel.F90:3883
subroutine, public accel_set_stream(stream_number)
Definition: accel.F90:2186
type(accel_kernel_t), target, save, public zunpack
Definition: accel.F90:285
subroutine laccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4639
subroutine zaccel_read_buffer_3_int32(this, size, data, offset, async)
Definition: accel.F90:3316
subroutine laccel_read_buffer_2_int32(this, size, data, offset, async)
Definition: accel.F90:4508
type(accel_kernel_t), target, save, public kernel_zaxpy
Definition: accel.F90:278
subroutine laccel_read_buffer_1_int32(this, size, data, offset, async)
Definition: accel.F90:4489
integer, parameter cl_plat_amd
Definition: accel.F90:385
subroutine zaccel_read_buffer_2(this, size, data, offset, async)
Definition: accel.F90:3218
integer(int32) function accel_padded_size_i4(nn)
Definition: accel.F90:1182
subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
Definition: accel.F90:1936
subroutine daccel_write_buffer_2_int32(this, size, data, offset, async)
Definition: accel.F90:2513
subroutine iaccel_write_buffer_1_int32(this, size, data, offset, async)
Definition: accel.F90:3704
type(accel_kernel_t), target, save, public zkernel_batch_dotp
Definition: accel.F90:300
subroutine iaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4056
subroutine daccel_write_buffer_1_int32(this, size, data, offset, async)
Definition: accel.F90:2494
subroutine iaccel_write_buffer_0(this, size, data, offset, async)
Definition: accel.F90:3582
pure logical function, public accel_is_enabled()
Definition: accel.F90:403
subroutine zaccel_read_buffer_0(this, size, data, offset, async)
Definition: accel.F90:3156
integer, parameter cl_plat_intel
Definition: accel.F90:385
integer, parameter, public accel_mem_write_only
Definition: accel.F90:183
subroutine daccel_read_buffer_3_int32(this, size, data, offset, async)
Definition: accel.F90:2711
subroutine daccel_read_buffer_0_int32(this, size, data, offset, async)
Definition: accel.F90:2654
subroutine laccel_write_buffer_0_int32(this, size, data, offset, async)
Definition: accel.F90:4291
subroutine zaccel_read_buffer_1(this, size, data, offset, async)
Definition: accel.F90:3199
type(accel_kernel_t), target, save, public kernel_vpsi
Definition: accel.F90:273
logical function f90_cl_device_has_extension(device, extension)
Definition: accel.F90:1872
subroutine opencl_print_error(ierr, name)
Definition: accel.F90:1655
subroutine daccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2823
subroutine accel_kernel_run_4(kernel, globalsizes, localsizes)
Definition: accel.F90:1467
subroutine zaccel_write_buffer_2_int32(this, size, data, offset, async)
Definition: accel.F90:3118
subroutine iaccel_write_buffer_1(this, size, data, offset, async)
Definition: accel.F90:3625
subroutine laccel_write_buffer_0(this, size, data, offset, async)
Definition: accel.F90:4188
type(accel_kernel_t), target, save, public kernel_density_complex
Definition: accel.F90:288
subroutine zaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3406
type(c_ptr) function, public zaccel_get_pointer_with_offset(buffer, offset)
Definition: accel.F90:2246
subroutine daccel_write_buffer_single(this, data, async)
Definition: accel.F90:2359
subroutine daccel_read_buffer_0(this, size, data, offset, async)
Definition: accel.F90:2551
integer function, public accel_kernel_workgroup_size(kernel)
Definition: accel.F90:1484
subroutine laccel_read_buffer_1(this, size, data, offset, async)
Definition: accel.F90:4410
integer, parameter opencl_cpu
Definition: accel.F90:378
subroutine zaccel_write_buffer_3(this, size, data, offset, async)
Definition: accel.F90:3059
integer function get_platform_id(platform_name)
Definition: accel.F90:1050
subroutine, public clblas_print_error(ierr, name)
Definition: accel.F90:1726
type(accel_t), public accel
Definition: accel.F90:270
subroutine laccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4617
subroutine accel_create_buffer_4(this, flags, type, size, set_zero)
Definition: accel.F90:1191
subroutine laccel_write_buffer_3_int32(this, size, data, offset, async)
Definition: accel.F90:4348
subroutine daccel_read_buffer_2(this, size, data, offset, async)
Definition: accel.F90:2613
subroutine iaccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:3940
subroutine daccel_write_buffer_3(this, size, data, offset, async)
Definition: accel.F90:2454
integer, public cl_status
Definition: accel.F90:393
subroutine zaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3473
subroutine iaccel_read_buffer_0_int32(this, size, data, offset, async)
Definition: accel.F90:3864
type(accel_kernel_t), target, save, public dunpack
Definition: accel.F90:284
subroutine iaccel_write_buffer_3(this, size, data, offset, async)
Definition: accel.F90:3664
integer(int64) pure function, public accel_local_memory_size()
Definition: accel.F90:2161
subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
Definition: accel.F90:1322
integer pure function, public accel_max_workgroup_size()
Definition: accel.F90:1478
subroutine daccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2801
type(accel_kernel_t), pointer head
Definition: accel.F90:397
subroutine iaccel_read_buffer_3_int32(this, size, data, offset, async)
Definition: accel.F90:3921
subroutine, public alloc_cache_put(alloc_cache, size, loc, put)
subroutine, public alloc_cache_get(alloc_cache, size, found, loc)
integer(int64), parameter, public alloc_cache_any_size
integer, parameter, public clblassuccess
Definition: clblas.F90:193
real(real64), parameter, public m_zero
Definition: global.F90:188
subroutine, public messages_print_with_emphasis(msg, iunit, namespace)
Definition: messages.F90:920
character(len=512), private msg
Definition: messages.F90:165
subroutine, public messages_warning(no_lines, all_nodes, namespace)
Definition: messages.F90:537
subroutine, public messages_obsolete_variable(namespace, name, rep)
Definition: messages.F90:1045
subroutine, public messages_new_line()
Definition: messages.F90:1134
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
Definition: messages.F90:160
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
Definition: messages.F90:414
subroutine, public messages_info(no_lines, iunit, debug_only, stress, all_nodes, namespace)
Definition: messages.F90:616
logical function mpi_grp_is_root(grp)
Is the current MPI process of grpcomm, root.
Definition: mpi.F90:434
subroutine, public profiling_out(label)
Increment out counter and sum up difference between entry and exit time.
Definition: profiling.F90:623
subroutine, public profiling_in(label, exclude)
Increment in counter and save entry time.
Definition: profiling.F90:552
integer pure function, public types_get_size(this)
Definition: types.F90:152
This module defines the unit system, used for input and output.
type(unit_t), public unit_gigabytes
For larger amounts of data (natural code units are bytes)
type(unit_t), public unit_megabytes
For large amounts of data (natural code units are bytes)
type(unit_t), public unit_kilobytes
For small amounts of data (natural code units are bytes)
int true(void)