Octopus
accel.F90
Go to the documentation of this file.
1!! Copyright (C) 2010-2016 X. Andrade
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17!!
18
19#include "global.h"
20
21#if defined(HAVE_CUDA)
22#define HAVE_ACCEL 1
23#endif
24
25module accel_oct_m
27 use cuda_oct_m
28 use debug_oct_m
29 use global_oct_m
30 use iso_c_binding, only: c_size_t
31 use, intrinsic :: iso_fortran_env
32 use loct_oct_m
33 use math_oct_m
35 use mpi_oct_m
37 use types_oct_m
38 use parser_oct_m
42 use string_oct_m
43
44 implicit none
45
46 private
47
48 public :: &
53 accel_t, &
56 accel_init, &
57 accel_end, &
87
88 integer, public, parameter :: &
89 ACCEL_MEM_READ_ONLY = 0, &
92
94 ! Components are public by default
95#if defined(HAVE_CUDA)
96 type(c_ptr) :: cuda_context
97#else
98 integer :: dummy
99#endif
100 end type accel_context_t
101
102 type accel_device_t
103 ! Components are public by default
104#if defined(HAVE_CUDA)
105 type(c_ptr) :: cuda_device
106#else
107 integer :: dummy
108#endif
109 end type accel_device_t
110
111 type accel_t
112 ! Components are public by default
113 type(accel_context_t) :: context
114 type(accel_device_t) :: device
115 type(c_ptr) :: cublas_handle
116 type(c_ptr) :: cuda_stream
117 type(c_ptr) :: module_map
118 integer :: max_workgroup_size
119 integer(int64) :: local_memory_size
120 integer(int64) :: global_memory_size
121 logical :: enabled
122 logical :: allow_CPU_only
123 logical :: shared_mem
124 logical :: cuda_mpi
125 integer :: warp_size
126 integer(int64) :: initialize_buffers
127 character(len=32) :: debug_flag
128 integer(int64) :: max_block_dim(3)
129 integer(int64) :: max_grid_dim(3)
130 end type accel_t
131
132 type accel_mem_t
133 ! Components are public by default
134 type(c_ptr) :: mem
135 integer(c_size_t) :: size = 0
136 type(type_t) :: type
137 integer :: flags = 0
138 logical :: allocated = .false.
139 end type accel_mem_t
140
141 type accel_kernel_t
142 ! Components are public by default
143#ifdef HAVE_CUDA
144 type(c_ptr) :: cuda_kernel
145 type(c_ptr) :: cuda_module
146 type(c_ptr) :: arguments
147#endif
148 integer(int64) :: cuda_shared_mem
149 logical :: initialized = .false.
150 type(accel_kernel_t), pointer :: next
151 integer :: arg_count
152 character(len=128) :: kernel_name
153 end type accel_kernel_t
154
155 type(accel_t), public :: accel
156
157 ! Global variables defined on device
158 type(accel_mem_t), public, save :: zM_0_buffer, zM_1_buffer
159 type(accel_mem_t), public, save :: dM_0_buffer, dM_1_buffer
160
161 ! the kernels
162 type(accel_kernel_t), public, target, save :: kernel_vpsi
163 type(accel_kernel_t), public, target, save :: kernel_vpsi_complex
164 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
165 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors_complex
166 type(accel_kernel_t), public, target, save :: kernel_daxpy
167 type(accel_kernel_t), public, target, save :: kernel_zaxpy
168 type(accel_kernel_t), public, target, save :: kernel_copy
169 type(accel_kernel_t), public, target, save :: kernel_copy_complex_to_real
170 type(accel_kernel_t), public, target, save :: kernel_copy_real_to_complex
171 type(accel_kernel_t), public, target, save :: dpack
172 type(accel_kernel_t), public, target, save :: zpack
173 type(accel_kernel_t), public, target, save :: dunpack
174 type(accel_kernel_t), public, target, save :: zunpack
175 type(accel_kernel_t), public, target, save :: kernel_ghost_reorder
176 type(accel_kernel_t), public, target, save :: kernel_density_real
177 type(accel_kernel_t), public, target, save :: kernel_density_complex
178 type(accel_kernel_t), public, target, save :: kernel_density_spinors
179 type(accel_kernel_t), public, target, save :: kernel_phase
180 type(accel_kernel_t), public, target, save :: kernel_phase_spiral
181 type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
182 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
183 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors
184 type(accel_kernel_t), public, target, save :: dkernel_batch_axpy
185 type(accel_kernel_t), public, target, save :: zkernel_batch_axpy
186 type(accel_kernel_t), public, target, save :: dkernel_ax_function_py
187 type(accel_kernel_t), public, target, save :: zkernel_ax_function_py
188 type(accel_kernel_t), public, target, save :: dkernel_batch_dotp
189 type(accel_kernel_t), public, target, save :: zkernel_batch_dotp
190 type(accel_kernel_t), public, target, save :: dzmul
191 type(accel_kernel_t), public, target, save :: zzmul
192
195 end interface accel_padded_size
196
199 end interface accel_create_buffer
200
201 interface accel_kernel_run
203 end interface accel_kernel_run
204
207 end interface accel_set_buffer_to_zero
223 end interface accel_write_buffer
225 interface accel_read_buffer
234 end interface accel_read_buffer
235
237 module procedure &
262 module procedure &
270 module procedure &
278 integer(int64) :: allocated_mem
279 type(accel_kernel_t), pointer :: head
282contains
284 pure logical function accel_is_enabled() result(enabled)
285#ifdef HAVE_ACCEL
286 enabled = accel%enabled
287#else
288 enabled = .false.
289#endif
290 end function accel_is_enabled
291
292 ! ------------------------------------------
293
294 pure logical function accel_allow_cpu_only() result(allow)
295#ifdef HAVE_ACCEL
296 allow = accel%allow_CPU_only
297#else
298 allow = .true.
299#endif
301
302 ! ------------------------------------------
303
304 subroutine accel_init(base_grp, namespace)
305 type(mpi_grp_t), intent(inout) :: base_grp
306 type(namespace_t), intent(in) :: namespace
307
308 logical :: disable, default, run_benchmark
309 integer :: idevice
310#ifdef HAVE_CUDA
311 integer :: dim
312#ifdef HAVE_MPI
313 character(len=256) :: sys_name
314#endif
315#endif
316
317 push_sub(accel_init)
318
321 !%Variable DisableAccel
322 !%Type logical
323 !%Default yes
324 !%Section Execution::Accel
325 !%Description
326 !% If Octopus was compiled with CUDA support, it will
327 !% try to initialize and use an accelerator device. By setting this
328 !% variable to <tt>yes</tt> you force Octopus not to use an accelerator even it is available.
329 !%End
330 call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel')
331#ifdef HAVE_ACCEL
332 default = .false.
333#else
334 default = .true.
335#endif
336 call parse_variable(namespace, 'DisableAccel', default, disable)
337 accel%enabled = .not. disable
338
339#ifndef HAVE_ACCEL
340 if (accel%enabled) then
341 message(1) = 'Octopus was compiled without Cuda support.'
342 call messages_fatal(1)
343 end if
344#endif
345
346 if (.not. accel_is_enabled()) then
347 pop_sub(accel_init)
348 return
349 end if
350
351 call messages_obsolete_variable(namespace, 'AccelPlatform')
352 call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform')
353
354 !%Variable AccelDevice
355 !%Type integer
356 !%Default 0
357 !%Section Execution::Accel
358 !%Description
359 !% This variable selects the GPU that Octopus will use. You can specify a
360 !% numerical id to select a specific device.
361 !%
362 !% In case of MPI enabled runs devices are distributed in a round robin fashion,
363 !% starting at this value.
364 !%End
365 call parse_variable(namespace, 'AccelDevice', 0, idevice)
366
367 call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice')
368
369 if (idevice < 0) then
370 call messages_write('Invalid AccelDevice')
371 call messages_fatal()
372 end if
374 call messages_print_with_emphasis(msg="GPU acceleration", namespace=namespace)
376#ifdef HAVE_CUDA
377 if (idevice<0) idevice = 0
378 call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
379 idevice, base_grp%rank)
380#ifdef HAVE_MPI
381 call loct_sysname(sys_name)
382 write(message(1), '(A,I5,A,I5,2A)') "Rank ", base_grp%rank, " uses device number ", idevice, &
383 " on ", trim(sys_name)
384 call messages_info(1, all_nodes = .true.)
385#endif
386
387 accel%shared_mem = .true.
388
389 call cublas_init(accel%cublas_handle, accel%cuda_stream)
390#endif
391
392
393 ! Get some device information that we will need later
394#ifdef HAVE_CUDA
395 call cuda_device_total_memory(accel%device%cuda_device, accel%global_memory_size)
396 call cuda_device_shared_memory(accel%device%cuda_device, accel%local_memory_size)
397 call cuda_device_max_threads_per_block(accel%device%cuda_device, accel%max_workgroup_size)
398 call cuda_device_get_warpsize(accel%device%cuda_device, accel%warp_size)
399 call cuda_device_max_block_dim_x(accel%device%cuda_device, dim)
400 accel%max_block_dim(1) = int(dim, int64)
401 call cuda_device_max_block_dim_y(accel%device%cuda_device, dim)
402 accel%max_block_dim(2) = int(dim, int64)
403 call cuda_device_max_block_dim_z(accel%device%cuda_device, dim)
404 accel%max_block_dim(3) = int(dim, int64)
405 call cuda_device_max_grid_dim_x(accel%device%cuda_device, dim)
406 accel%max_grid_dim(1) = int(dim, int64)
407 call cuda_device_max_grid_dim_y(accel%device%cuda_device, dim)
408 accel%max_grid_dim(2) = int(dim, int64)
409 call cuda_device_max_grid_dim_z(accel%device%cuda_device, dim)
410 accel%max_grid_dim(3) = int(dim, int64)
411#endif
412
413 if (base_grp%is_root()) call device_info()
414
415 ! initialize the cache used to speed up allocations
416 call alloc_cache_init(memcache, nint(0.25_real64*accel%global_memory_size, int64))
417
418 ! now initialize the kernels
420
421#if defined(HAVE_HIP)
422 accel%debug_flag = "-g"
423#elif defined(HAVE_CUDA)
424 accel%debug_flag = "-lineinfo"
425#endif
426
427 call accel_kernel_start_call(kernel_vpsi, 'vpsi.cu', "vpsi")
428 call accel_kernel_start_call(kernel_vpsi_complex, 'vpsi.cu', "vpsi_complex")
429 call accel_kernel_start_call(kernel_vpsi_spinors, 'vpsi.cu', "vpsi_spinors")
430 call accel_kernel_start_call(kernel_vpsi_spinors_complex, 'vpsi.cu', "vpsi_spinors_complex")
431 call accel_kernel_start_call(kernel_daxpy, 'axpy.cu', "daxpy", flags = '-DRTYPE_DOUBLE')
432 call accel_kernel_start_call(kernel_zaxpy, 'axpy.cu', "zaxpy", flags = '-DRTYPE_COMPLEX')
433 call accel_kernel_start_call(dkernel_batch_axpy, 'axpy.cu', "dbatch_axpy_function", &
434 flags = ' -DRTYPE_DOUBLE')
435 call accel_kernel_start_call(zkernel_batch_axpy, 'axpy.cu', "zbatch_axpy_function", &
436 flags = '-DRTYPE_COMPLEX')
437 call accel_kernel_start_call(dkernel_ax_function_py, 'axpy.cu', "dbatch_ax_function_py", &
438 flags = '-DRTYPE_DOUBLE')
439 call accel_kernel_start_call(zkernel_ax_function_py, 'axpy.cu', "zbatch_ax_function_py", &
440 flags = '-DRTYPE_COMPLEX')
441 call accel_kernel_start_call(dkernel_batch_dotp, 'mesh_batch_single.cu', "dbatch_mf_dotp")
442 call accel_kernel_start_call(zkernel_batch_dotp, 'mesh_batch_single.cu', "zbatch_mf_dotp")
443 call accel_kernel_start_call(dpack, 'pack.cu', "dpack")
444 call accel_kernel_start_call(zpack, 'pack.cu', "zpack")
445 call accel_kernel_start_call(dunpack, 'pack.cu', "dunpack")
446 call accel_kernel_start_call(zunpack, 'pack.cu', "zunpack")
447 call accel_kernel_start_call(kernel_copy, 'copy.cu', "copy")
448 call accel_kernel_start_call(kernel_copy_complex_to_real, 'copy.cu', "copy_complex_to_real")
449 call accel_kernel_start_call(kernel_copy_real_to_complex, 'copy.cu', "copy_real_to_complex")
450 call accel_kernel_start_call(kernel_ghost_reorder, 'ghost.cu', "ghost_reorder")
451 call accel_kernel_start_call(kernel_density_real, 'density.cu', "density_real")
452 call accel_kernel_start_call(kernel_density_complex, 'density.cu', "density_complex")
453 call accel_kernel_start_call(kernel_density_spinors, 'density.cu', "density_spinors")
454 call accel_kernel_start_call(kernel_phase, 'phase.cu', "phase")
455 call accel_kernel_start_call(dkernel_dot_matrix, 'mesh_batch.cu', "ddot_matrix")
456 call accel_kernel_start_call(zkernel_dot_matrix, 'mesh_batch.cu', "zdot_matrix")
457 call accel_kernel_start_call(zkernel_dot_matrix_spinors, 'mesh_batch.cu', "zdot_matrix_spinors")
458
459
460 call accel_kernel_start_call(dzmul, 'mul.cu', "dzmul", flags = '-DRTYPE_DOUBLE')
461 call accel_kernel_start_call(zzmul, 'mul.cu', "zzmul", flags = '-DRTYPE_COMPLEX')
462
463 ! Define global buffers
464 if(.not. accel_buffer_is_allocated(zm_0_buffer)) then
465 call accel_create_buffer(zm_0_buffer, accel_mem_read_only, type_cmplx, 1)
466 call accel_write_buffer(zm_0_buffer, m_z0)
467 end if
468 if(.not. accel_buffer_is_allocated(zm_1_buffer)) then
469 call accel_create_buffer(zm_1_buffer, accel_mem_read_only, type_cmplx, 1)
470 call accel_write_buffer(zm_1_buffer, m_z1)
471 end if
472 if(.not. accel_buffer_is_allocated(dm_0_buffer)) then
473 call accel_create_buffer(dm_0_buffer, accel_mem_read_only, type_float, 1)
474 call accel_write_buffer(dm_0_buffer, m_zero)
475 end if
476 if(.not. accel_buffer_is_allocated(dm_1_buffer)) then
477 call accel_create_buffer(dm_1_buffer, accel_mem_read_only, type_float, 1)
478 call accel_write_buffer(dm_1_buffer, m_one)
479 end if
480
481
482 !%Variable AccelBenchmark
483 !%Type logical
484 !%Default no
485 !%Section Execution::Accel
486 !%Description
487 !% If this variable is set to yes, Octopus will run some
488 !% routines to benchmark the performance of the accelerator device.
489 !%End
490 call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark)
491
492 call messages_obsolete_variable(namespace, 'OpenCLBenchmark', 'AccelBenchmark')
493
494 if (run_benchmark) then
496 end if
497
498 !%Variable GPUAwareMPI
499 !%Type logical
500 !%Section Execution::Accel
501 !%Description
502 !% If Octopus was compiled with GPU support and MPI support and if the MPI
503 !% implementation is GPU-aware (i.e., it supports communication using device pointers),
504 !% this switch can be set to true to use the GPU-aware MPI features. The advantage
505 !% of this approach is that it can do, e.g., peer-to-peer copies between devices without
506 !% going through the host memory.
507 !% The default is false, except when the configure switch --enable-cudampi is set, in which
508 !% case this variable is set to true.
509 !%End
510#ifdef HAVE_CUDA_MPI
511 default = .true.
512#else
513 default = .false.
514#endif
515 call parse_variable(namespace, 'GPUAwareMPI', default, accel%cuda_mpi)
516 if (accel%cuda_mpi) then
517#ifndef HAVE_CUDA_MPI
518 call messages_write("Warning: trying to use GPU-aware MPI, but we have not detected support in the linked MPI library.")
519 call messages_warning()
520#endif
521 call messages_write("Using GPU-aware MPI.")
522 call messages_info()
523 end if
524
525
526 !%Variable AllowCPUonly
527 !%Type logical
528 !%Section Execution::Accel
529 !%Description
530 !% In order to prevent waste of resources, the code will normally stop when the GPU is disabled due to
531 !% incomplete implementations or incompatibilities. AllowCPUonly = yes overrides this and allows the
532 !% code execution also in these cases.
533 !%End
534#if defined (HAVE_ACCEL)
535 default = .false.
536#else
537 default = .true.
538#endif
539 call parse_variable(namespace, 'AllowCPUonly', default, accel%allow_CPU_only)
540
541
542 !%Variable InitializeGPUBuffers
543 !%Type integer
544 !%Default no
545 !%Section Execution::Accel
546 !%Description
547 !% Initialize new GPU buffers to zero on creation (use only for debugging, as it has a performance impact!).
548 !%Option no 0
549 !% Do not initialize GPU buffers.
550 !%Option yes 1
551 !% Initialize GPU buffers to zero.
552 !%Option nan 2
553 !% Initialize GPU buffers to nan.
554 !%End
555 call parse_variable(namespace, 'InitializeGPUBuffers', option__initializegpubuffers__no, accel%initialize_buffers)
556 if (.not. varinfo_valid_option('InitializeGPUBuffers', accel%initialize_buffers)) then
557 call messages_input_error(namespace, 'InitializeGPUBuffers')
558 end if
559
560
561 call messages_print_with_emphasis(namespace=namespace)
562
563 pop_sub(accel_init)
564
565 contains
566
567 subroutine device_info()
568#ifdef HAVE_CUDA
569 integer :: version
570 character(kind=c_char) :: cval_str(257)
571#endif
572 integer :: major, minor
573 character(len=256) :: val_str
574
575 push_sub(accel_init.device_info)
576
577 call messages_new_line()
578 call messages_write('Selected device:')
579 call messages_new_line()
580
581#ifdef HAVE_CUDA
582#ifdef __HIP_PLATFORM_AMD__
583 call messages_write(' Framework : ROCm')
584#else
585 call messages_write(' Framework : CUDA')
586#endif
587#endif
588 call messages_info()
589
590#ifdef HAVE_CUDA
591 call messages_write(' Device type : GPU', new_line = .true.)
592#ifdef __HIP_PLATFORM_AMD__
593 call messages_write(' Device vendor : AMD Corporation', new_line = .true.)
594#else
595 call messages_write(' Device vendor : NVIDIA Corporation', new_line = .true.)
596#endif
597#endif
598
599#ifdef HAVE_CUDA
600 cval_str = c_null_char
601 call cuda_device_name(accel%device%cuda_device, cval_str)
602 call string_c_to_f(cval_str, val_str)
603#endif
604 call messages_write(' Device name : '//trim(val_str))
605 call messages_new_line()
606
607#ifdef HAVE_CUDA
608 call cuda_device_capability(accel%device%cuda_device, major, minor)
609#endif
610 call messages_write(' Cuda capabilities :')
611 call messages_write(major, fmt = '(i2)')
612 call messages_write('.')
613 call messages_write(minor, fmt = '(i1)')
614 call messages_new_line()
615
616 ! VERSION
617#ifdef HAVE_CUDA
618 call cuda_driver_version(version)
619 call messages_write(' Driver version : ')
620 call messages_write(version)
621#endif
622 call messages_new_line()
623
624
625 call messages_write(' Device memory :')
626 call messages_write(accel%global_memory_size, units=unit_megabytes)
628
629 call messages_write(' Local/shared memory :')
630 call messages_write(accel%local_memory_size, units=unit_kilobytes)
631 call messages_new_line()
632
633 call messages_write(' Max. group/block size :')
634 call messages_write(accel%max_workgroup_size)
635 call messages_new_line()
636
637 call messages_info()
638
639 pop_sub(accel_init.device_info)
640 end subroutine device_info
641
642 end subroutine accel_init
643
644 ! ------------------------------------------
645 subroutine accel_end(namespace)
646 type(namespace_t), intent(in) :: namespace
647
648 integer(int64) :: hits, misses
649 real(real64) :: volume_hits, volume_misses
650 logical :: found
651 type(accel_mem_t) :: tmp
652
653 push_sub(accel_end)
654
655 if (accel_is_enabled()) then
656
657 ! Release global buffers
658 call accel_release_buffer(zm_0_buffer)
659 call accel_release_buffer(zm_1_buffer)
660 call accel_release_buffer(dm_0_buffer)
661 call accel_release_buffer(dm_1_buffer)
662
663 do
664 call alloc_cache_get(memcache, alloc_cache_any_size, found, tmp%mem)
665 if (.not. found) exit
666
667#ifdef HAVE_CUDA
668 call cuda_mem_free(tmp%mem)
669#endif
670 end do
671
672 call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)
673
674 call messages_print_with_emphasis(msg="Acceleration-device allocation cache", namespace=namespace)
675
676 call messages_new_line()
677 call messages_write(' Number of allocations =')
678 call messages_write(hits + misses, new_line = .true.)
679 call messages_write(' Volume of allocations =')
680 call messages_write(volume_hits + volume_misses, fmt = 'f18.1', units = unit_gigabytes, align_left = .true., &
681 new_line = .true.)
682 call messages_write(' Hit ratio =')
683 if (hits + misses > 0) then
684 call messages_write(hits/real(hits + misses, real64)*100, fmt='(f6.1)', align_left = .true.)
685 else
686 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
687 end if
688 call messages_write('%', new_line = .true.)
689 call messages_write(' Volume hit ratio =')
690 if (volume_hits + volume_misses > 0) then
691 call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt='(f6.1)', align_left = .true.)
692 else
693 call messages_write(m_zero, fmt='(f6.1)', align_left = .true.)
694 end if
695 call messages_write('%')
696 call messages_new_line()
697 call messages_info()
698
699 call messages_print_with_emphasis(namespace=namespace)
700 end if
701
703
704 if (accel_is_enabled()) then
705#ifdef HAVE_CUDA
706 call cublas_end(accel%cublas_handle)
707 if (.not. accel%cuda_mpi) then ! CUDA aware MPI finalize will do the cleanup
708 call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
709 end if
710#endif
711
712 if (buffer_alloc_count /= 0) then
713 call messages_write('Accel:')
714 call messages_write(real(allocated_mem, real64) , fmt = 'f12.1', units = unit_megabytes, align_left = .true.)
715 call messages_write(' in ')
717 call messages_write(' buffers were not deallocated.')
718 call messages_fatal()
719 end if
720
721 end if
722
723 pop_sub(accel_end)
724 end subroutine accel_end
725
726 ! ------------------------------------------
727
728 integer(int64) function accel_padded_size_i8(nn) result(psize)
729 integer(int64), intent(in) :: nn
730
731 integer(int64) :: modnn, bsize
732
733 psize = nn
734
735 if (accel_is_enabled()) then
736
738
739 psize = nn
740 modnn = mod(nn, bsize)
741 if (modnn /= 0) psize = psize + bsize - modnn
742
743 end if
744
745 end function accel_padded_size_i8
746
747 ! ------------------------------------------
748
749 integer(int32) function accel_padded_size_i4(nn) result(psize)
750 integer(int32), intent(in) :: nn
751
752 psize = int(accel_padded_size_i8(int(nn, int64)), int32)
753
754 end function accel_padded_size_i4
755
756 ! ------------------------------------------
757
758 subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
759 type(accel_mem_t), intent(inout) :: this
760 integer, intent(in) :: flags
761 type(type_t), intent(in) :: type
762 integer, intent(in) :: size
763 logical, optional, intent(in) :: set_zero
764 logical, optional, intent(in) :: async
765
766 call accel_create_buffer_8(this, flags, type, int(size, int64), set_zero, async)
768
769 ! ------------------------------------------
770
771 subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
772 type(accel_mem_t), intent(inout) :: this
773 integer, intent(in) :: flags
774 type(type_t), intent(in) :: type
775 integer(int64), intent(in) :: size
776 logical, optional, intent(in) :: set_zero
777 logical, optional, intent(in) :: async
778
779 integer(int64) :: fsize
780 logical :: found
781 integer(int64) :: initialize_buffers
782
783 push_sub(accel_create_buffer_8)
784
785 this%type = type
786 this%size = size
787 this%flags = flags
788 fsize = int(size, int64)*types_get_size(type)
789 this%allocated = .true.
790
791 if (fsize > 0) then
792
793 call alloc_cache_get(memcache, fsize, found, this%mem)
794
795 if (.not. found) then
796#ifdef HAVE_CUDA
797 if(optional_default(async, .false.)) then
798 call cuda_mem_alloc_async(this%mem, fsize)
799 else
800 call cuda_mem_alloc(this%mem, fsize)
801 end if
802#endif
803 end if
804
807
808 end if
809
810 if (present(set_zero)) then
811 initialize_buffers = merge(option__initializegpubuffers__yes, option__initializegpubuffers__no, set_zero)
812 else
813 initialize_buffers = accel%initialize_buffers
814 end if
815 select case (initialize_buffers)
816 case (option__initializegpubuffers__yes)
817 call accel_set_buffer_to(this, type, int(z'00', int8), size)
818 case (option__initializegpubuffers__nan)
819 call accel_set_buffer_to(this, type, int(z'FF', int8), size)
820 end select
821
822 pop_sub(accel_create_buffer_8)
823 end subroutine accel_create_buffer_8
824
825 ! ------------------------------------------
826
827 subroutine accel_release_buffer(this, async)
828 type(accel_mem_t), intent(inout) :: this
829 logical, optional, intent(in) :: async
830
831 logical :: put
832 integer(int64) :: fsize
833
834 push_sub(accel_release_buffer)
835
836 if (this%size > 0) then
837
838 fsize = int(this%size, int64)*types_get_size(this%type)
839
840 call alloc_cache_put(memcache, fsize, this%mem, put)
841
842 if (.not. put) then
843#ifdef HAVE_CUDA
844 if (optional_default(async, .false.)) then
845 call cuda_mem_free_async(this%mem)
846 else
847 call cuda_mem_free(this%mem)
848 end if
849#endif
850 end if
851
854
855 end if
856
857 this%size = 0
858 this%flags = 0
859
860 this%allocated = .false.
861
862 pop_sub(accel_release_buffer)
863 end subroutine accel_release_buffer
864
865 ! ------------------------------------------------------
867 ! Check if the temporary buffers are the right size, if not reallocate them
868 subroutine accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
869 type(accel_mem_t), intent(inout) :: buffer
870 integer, intent(in) :: flags
871 type(type_t), intent(in) :: type
872 integer, intent(in) :: required_size
873 logical, intent(in) :: set_zero
874 logical, optional, intent(in) :: async
875
877
878
879 if (accel_buffer_is_allocated(buffer) .and. buffer%size < required_size) then
880 call accel_release_buffer(buffer, async=optional_default(async, .false.))
881 end if
882
883 if (.not. accel_buffer_is_allocated(buffer)) then
884 call accel_create_buffer(buffer, flags, type, required_size, set_zero=set_zero, async=optional_default(async, .false.))
885 end if
886
888 end subroutine accel_ensure_buffer_size
889
890 ! ------------------------------------------
891
892 logical pure function accel_buffer_is_allocated(this) result(allocated)
893 type(accel_mem_t), intent(in) :: this
894
895 allocated = this%allocated
896 end function accel_buffer_is_allocated
897
898 ! -----------------------------------------
899
900 subroutine accel_finish()
901 ! no push_sub, called too frequently
902
903 if (accel_is_enabled()) then
904#ifdef HAVE_CUDA
906#endif
907 end if
908 end subroutine accel_finish
909
910 ! ------------------------------------------
911
912 subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
913 type(accel_kernel_t), intent(inout) :: kernel
914 integer, intent(in) :: narg
915 type(accel_mem_t), intent(in) :: buffer
916
917 assert(accel_buffer_is_allocated(buffer))
918
919 ! no push_sub, called too frequently
920#ifdef HAVE_CUDA
921 call cuda_kernel_set_arg_buffer(kernel%arguments, buffer%mem, narg)
922#endif
923
924 end subroutine accel_set_kernel_arg_buffer
925
926 ! ------------------------------------------
927
928 subroutine accel_set_kernel_arg_local(kernel, narg, type, size)
929 type(accel_kernel_t), intent(inout) :: kernel
930 integer, intent(in) :: narg
931 type(type_t), intent(in) :: type
932 integer, intent(in) :: size
933
934 integer(int64) :: size_in_bytes
935
937
938
939 size_in_bytes = int(size, int64)*types_get_size(type)
940
941 if (size_in_bytes > accel%local_memory_size) then
942 write(message(1), '(a,f12.6,a)') "CL Error: requested local memory: ", real(size_in_bytes, real64) /1024.0, " Kb"
943 write(message(2), '(a,f12.6,a)') " available local memory: ", real(accel%local_memory_size, real64) /1024.0, " Kb"
944 call messages_fatal(2)
945 else if (size_in_bytes <= 0) then
946 write(message(1), '(a,i10)') "CL Error: invalid local memory size: ", size_in_bytes
947 call messages_fatal(1)
948 end if
949
950#ifdef HAVE_CUDA
951 kernel%cuda_shared_mem = size_in_bytes
952#endif
953
955 end subroutine accel_set_kernel_arg_local
956
957 ! ------------------------------------------
958
959 subroutine accel_kernel_run_8(kernel, globalsizes, localsizes)
960 type(accel_kernel_t), intent(inout) :: kernel
961 integer(int64), intent(in) :: globalsizes(:)
962 integer(int64), intent(in) :: localsizes(:)
963
964 integer :: dim
965 integer(int64) :: gsizes(1:3)
966 integer(int64) :: lsizes(1:3)
968 ! no push_sub, called too frequently
969
970 ! cuda needs all dimensions
971 gsizes = 1
972 lsizes = 1
973
974 dim = ubound(globalsizes, dim=1)
975
976 assert(dim == ubound(localsizes, dim=1))
977
978 ! if one size is zero, there is nothing to do
979 if (any(globalsizes == 0)) return
980
981 assert(all(localsizes > 0))
982 assert(all(localsizes <= accel_max_workgroup_size()))
983 assert(all(mod(globalsizes, localsizes) == 0))
984
985 gsizes(1:dim) = globalsizes(1:dim)
986 lsizes(1:dim) = localsizes(1:dim)
987
988#ifdef HAVE_CUDA
989 ! Maximum dimension of a block
990 if (any(lsizes(1:3) > accel%max_block_dim(1:3))) then
991 message(1) = "Maximum dimension of a block too large in kernel "//trim(kernel%kernel_name)
992 message(2) = "The following conditions should be fulfilled:"
993 write(message(3), "(A, I8, A, I8)") "Dim 1: ", lsizes(1), " <= ", accel%max_block_dim(1)
994 write(message(4), "(A, I8, A, I8)") "Dim 2: ", lsizes(2), " <= ", accel%max_block_dim(2)
995 write(message(5), "(A, I8, A, I8)") "Dim 3: ", lsizes(3), " <= ", accel%max_block_dim(3)
996 message(6) = "This is an internal error, please contact the developers."
997 call messages_fatal(6)
998 end if
999
1000
1001 ! Maximum number of threads per block
1002 if (product(lsizes) > accel_max_workgroup_size()) then
1003 message(1) = "Maximum number of threads per block too large in kernel "//trim(kernel%kernel_name)
1004 message(2) = "The following condition should be fulfilled:"
1005 write(message(3), "(I8, A, I8)") product(lsizes), " <= ", accel_max_workgroup_size()
1006 message(4) = "This is an internal error, please contact the developers."
1007 call messages_fatal(4)
1008 end if
1009
1010 gsizes(1:3) = gsizes(1:3)/lsizes(1:3)
1011
1012 ! Maximum dimensions of the grid of thread block
1013 if (any(gsizes(1:3) > accel%max_grid_dim(1:3))) then
1014 message(1) = "Maximum dimension of grid too large in kernel "//trim(kernel%kernel_name)
1015 message(2) = "The following conditions should be fulfilled:"
1016 write(message(3), "(A, I8, A, I10)") "Dim 1: ", gsizes(1), " <= ", accel%max_grid_dim(1)
1017 write(message(4), "(A, I8, A, I10)") "Dim 2: ", gsizes(2), " <= ", accel%max_grid_dim(2)
1018 write(message(5), "(A, I8, A, I10)") "Dim 3: ", gsizes(3), " <= ", accel%max_grid_dim(3)
1019 message(6) = "This is an internal error, please contact the developers."
1020 call messages_fatal(6)
1021 end if
1022
1023 call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), lsizes(1), kernel%cuda_shared_mem, kernel%arguments)
1024
1025 kernel%cuda_shared_mem = 0
1026#endif
1027
1028 end subroutine accel_kernel_run_8
1029
1030 ! -----------------------------------------------
1032 subroutine accel_kernel_run_4(kernel, globalsizes, localsizes)
1033 type(accel_kernel_t), intent(inout) :: kernel
1034 integer, intent(in) :: globalsizes(:)
1035 integer, intent(in) :: localsizes(:)
1036
1037 call accel_kernel_run_8(kernel, int(globalsizes, int64), int(localsizes, int64))
1038
1039 end subroutine accel_kernel_run_4
1040
1041 ! -----------------------------------------------
1043 integer pure function accel_max_workgroup_size() result(max_workgroup_size)
1044 max_workgroup_size = accel%max_workgroup_size
1045 end function accel_max_workgroup_size
1046
1047 ! -----------------------------------------------
1049 integer function accel_kernel_workgroup_size(kernel) result(workgroup_size)
1050 type(accel_kernel_t), intent(inout) :: kernel
1051
1052#ifdef HAVE_CUDA
1053 integer :: max_workgroup_size
1054#endif
1055
1056 workgroup_size = 0
1057
1058#ifdef HAVE_CUDA
1059 call cuda_kernel_max_threads_per_block(kernel%cuda_kernel, max_workgroup_size)
1060 if (debug%info .and. max_workgroup_size /= accel%max_workgroup_size) then
1061 write(message(1), "(A, I5, A)") "A kernel can use only less threads per block (", workgroup_size, ")", &
1062 "than available on the device (", accel%max_workgroup_size, ")"
1063 call messages_info(1)
1064 end if
1065 ! recommended number of threads per block is 256 according to the CUDA best practice guide
1066 ! see https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#thread-and-block-heuristics
1067 workgroup_size = 256
1068 ! make sure we do not use more threads per block than available for this kernel
1069 workgroup_size = min(workgroup_size, max_workgroup_size)
1070#endif
1071
1072 end function accel_kernel_workgroup_size
1073
1074 ! ----------------------------------------------------
1075
1076 subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
1077 type(accel_mem_t), intent(inout) :: buffer
1078 type(type_t), intent(in) :: type
1079 integer(int8), intent(in) :: val
1080 integer(int64), intent(in) :: nval
1081 integer(int64), optional, intent(in) :: offset
1082 logical, optional, intent(in) :: async
1083
1084 integer(int64) :: nval_, offset_, type_size
1085
1086 push_sub(accel_set_buffer_to)
1087
1088 if (nval == 0) then
1089 pop_sub(accel_set_buffer_to)
1090 return
1091 end if
1092 assert(nval > 0)
1093
1094 if (present(offset)) then
1095 assert(offset >= 0)
1096 if(offset > buffer%size) then
1097 pop_sub(accel_set_buffer_to)
1098 return
1099 end if
1100 end if
1101
1102 type_size = types_get_size(type)
1103
1104 nval_ = nval*type_size
1105
1106 offset_ = 0_int64
1107 if (present(offset)) offset_ = offset*type_size
1108
1109 call cuda_mem_set_async(buffer%mem, val, nval_, offset_)
1110 if(.not. optional_default(async, .false.)) call accel_finish()
1111
1112 pop_sub(accel_set_buffer_to)
1113 end subroutine accel_set_buffer_to
1114
1115 ! ----------------------------------------------------
1116
1117 subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
1118 type(accel_mem_t), intent(inout) :: buffer
1119 type(type_t), intent(in) :: type
1120 integer(int64), intent(in) :: nval
1121 integer(int64), optional, intent(in) :: offset
1122 logical, optional, intent(in) :: async
1123
1125
1126 call accel_set_buffer_to(buffer, type, int(z'00', int8), nval, offset, async)
1127
1129 end subroutine accel_set_buffer_to_zero_i8
1130
1131 ! ----------------------------------------------------
1132
1133 subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
1134 type(accel_mem_t), intent(inout) :: buffer
1135 type(type_t), intent(in) :: type
1136 integer(int32), intent(in) :: nval
1137 integer(int32), optional, intent(in) :: offset
1138 logical, optional, intent(in) :: async
1139
1141
1142 if (present(offset)) then
1143 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), int(offset, int64), async=async)
1144 else
1145 call accel_set_buffer_to_zero_i8(buffer, type, int(nval, int64), async=async)
1146 end if
1147
1149 end subroutine accel_set_buffer_to_zero_i4
1150
1151 ! ----------------------------------------------------
1152
1153 subroutine accel_check_bandwidth()
1154 integer :: itime
1155 integer, parameter :: times = 10
1156 integer :: size
1157 real(real64) :: time, stime
1158 real(real64) :: read_bw, write_bw
1159 type(accel_mem_t) :: buff
1160 real(real64), allocatable :: data(:)
1161
1162 call messages_new_line()
1163 call messages_write('Info: Benchmarking the bandwidth between main memory and device memory')
1164 call messages_new_line()
1165 call messages_info()
1166
1167 call messages_write(' Buffer size Read bw Write bw')
1168 call messages_new_line()
1169 call messages_write(' [MiB] [MiB/s] [MiB/s]')
1170 call messages_info()
1171
1172 size = 15000
1173 do
1174 safe_allocate(data(1:size))
1175 call accel_create_buffer(buff, accel_mem_read_write, type_float, size)
1176
1177 stime = loct_clock()
1178 do itime = 1, times
1179 call accel_write_buffer(buff, size, data)
1180 call accel_finish()
1181 end do
1182 time = (loct_clock() - stime)/real(times, real64)
1183
1184 write_bw = real(size, real64) *8.0_real64/time
1185
1186 stime = loct_clock()
1187 do itime = 1, times
1188 call accel_read_buffer(buff, size, data)
1189 end do
1190 call accel_finish()
1191
1192 time = (loct_clock() - stime)/real(times, real64)
1193 read_bw = real(size, real64) *8.0_real64/time
1194
1195 call messages_write(size*8.0_real64/1024.0_real64**2)
1196 call messages_write(write_bw/1024.0_real64**2, fmt = '(f10.1)')
1197 call messages_write(read_bw/1024.0_real64**2, fmt = '(f10.1)')
1198 call messages_info()
1199
1200 call accel_release_buffer(buff)
1201
1202 safe_deallocate_a(data)
1203
1204 size = int(size*2.0)
1206 if (size > 50000000) exit
1207 end do
1208 end subroutine accel_check_bandwidth
1209
1210 ! ----------------------------------------------------
1211
1212 logical pure function accel_use_shared_mem() result(use_shared_mem)
1213
1214 use_shared_mem = accel%shared_mem
1215
1216 end function accel_use_shared_mem
1217
1218 !------------------------------------------------------------
1219
1220 subroutine accel_kernel_global_init()
1221
1222 push_sub(accel_kernel_global_init)
1223
1224 nullify(head)
1225
1226 call cuda_module_map_init(accel%module_map)
1227
1229 end subroutine accel_kernel_global_init
1230
1231 !------------------------------------------------------------
1232
1233 subroutine accel_kernel_global_end()
1234 type(accel_kernel_t), pointer :: next_head
1235
1236 push_sub(accel_kernel_global_end)
1237
1238 do
1239 if (.not. associated(head)) exit
1240 next_head => head%next
1242 head => next_head
1243 end do
1244
1245 if (accel_is_enabled()) then
1246 call cuda_module_map_end(accel%module_map)
1247 end if
1248
1250 end subroutine accel_kernel_global_end
1251
1252 !------------------------------------------------------------
1253
1254 subroutine accel_kernel_build(this, file_name, kernel_name, flags)
1255 type(accel_kernel_t), intent(inout) :: this
1256 character(len=*), intent(in) :: file_name
1257 character(len=*), intent(in) :: kernel_name
1258 character(len=*), optional, intent(in) :: flags
1259
1260#ifdef HAVE_CUDA
1261 character(len=1000) :: all_flags
1262#endif
1264 push_sub(accel_kernel_build)
1265
1266 call profiling_in("ACCEL_COMPILE", exclude = .true.)
1267
1268#ifdef HAVE_CUDA
1269 all_flags = '-I'//trim(conf%share)//'/kernels/'//" "//trim(accel%debug_flag)
1270
1271 if (accel_use_shared_mem()) then
1272 all_flags = trim(all_flags)//' -DSHARED_MEM'
1273 end if
1274
1275 if (present(flags)) then
1276 all_flags = trim(all_flags)//' '//trim(flags)
1277 end if
1278
1279 call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, &
1280 string_f_to_c(trim(file_name)), string_f_to_c(trim(all_flags)))
1282 call cuda_create_kernel(this%cuda_kernel, this%cuda_module, string_f_to_c(trim(kernel_name)))
1283 call cuda_alloc_arg_array(this%arguments)
1284
1285 this%cuda_shared_mem = 0
1286#endif
1287
1288 this%initialized = .true.
1289 this%kernel_name = trim(kernel_name)
1290
1291 call profiling_out("ACCEL_COMPILE")
1292
1293 pop_sub(accel_kernel_build)
1294 end subroutine accel_kernel_build
1295
1296 !------------------------------------------------------------
1297
1298 subroutine accel_kernel_end(this)
1299 type(accel_kernel_t), intent(inout) :: this
1301 push_sub(accel_kernel_end)
1302
1303#ifdef HAVE_CUDA
1304 call cuda_free_arg_array(this%arguments)
1305 call cuda_release_kernel(this%cuda_kernel)
1306 ! modules are not released here, since they are not associated to a kernel
1307#endif
1309 this%initialized = .false.
1310
1311 pop_sub(accel_kernel_end)
1312 end subroutine accel_kernel_end
1313
1314 !------------------------------------------------------------
1315
1316 subroutine accel_kernel_start_call(this, file_name, kernel_name, flags)
1317 type(accel_kernel_t), target, intent(inout) :: this
1318 character(len=*), intent(in) :: file_name
1319 character(len=*), intent(in) :: kernel_name
1320 character(len=*), optional, intent(in) :: flags
1321
1322 push_sub(accel_kernel_start_call)
1323
1324 if (.not. this%initialized) then
1325 call accel_kernel_build(this, file_name, kernel_name, flags)
1326 this%next => head
1327 head => this
1328 end if
1329
1331 end subroutine accel_kernel_start_call
1332
1333 !--------------------------------------------------------------
1334
1335 integer(int64) pure function accel_global_memory_size() result(size)
1336
1337 size = accel%global_memory_size
1338
1339 end function accel_global_memory_size
1340
1341 !--------------------------------------------------------------
1342
1343 integer(int64) pure function accel_local_memory_size() result(size)
1344
1345 size = accel%local_memory_size
1346
1347 end function accel_local_memory_size
1348
1349 !--------------------------------------------------------------
1350
1351 integer pure function accel_max_size_per_dim(dim) result(size)
1352 integer, intent(in) :: dim
1353
1354 size = 0
1355#ifdef HAVE_CUDA
1356 size = 32768
1357 if (dim == 1) size = 2**30
1358#endif
1359 end function accel_max_size_per_dim
1360
1361 ! ------------------------------------------------------
1362
1363 subroutine accel_set_stream(stream_number)
1364 integer, intent(in) :: stream_number
1365
1366 push_sub(accel_set_stream)
1367
1368 if (accel_is_enabled()) then
1369#ifdef HAVE_CUDA
1370 call cuda_set_stream(accel%cuda_stream, stream_number)
1371 call cublas_set_stream(accel%cublas_handle, accel%cuda_stream)
1372#endif
1373 end if
1374
1375 pop_sub(accel_set_stream)
1376 end subroutine accel_set_stream
1377
1378 ! ------------------------------------------------------
1379
1380 subroutine accel_get_stream(stream_number)
1381 integer, intent(inout) :: stream_number
1382
1383 push_sub(accel_get_stream)
1384
1385 if (accel_is_enabled()) then
1386#ifdef HAVE_CUDA
1387 call cuda_get_stream(stream_number)
1388#endif
1389 end if
1390
1391 pop_sub(accel_get_stream)
1392 end subroutine accel_get_stream
1393
1394 ! ------------------------------------------------------
1395
1398
1399 if (accel_is_enabled()) then
1400#ifdef HAVE_CUDA
1401 call cuda_synchronize_all_streams()
1402#endif
1403 end if
1404
1406 end subroutine accel_synchronize_all_streams
1407
1408 function daccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
1409 type(c_ptr), intent(in) :: buffer
1410 integer(int64), intent(in) :: offset
1411 type(c_ptr) :: buffer_offset
1412
1414#ifdef HAVE_CUDA
1415 call cuda_get_pointer_with_offset(buffer, offset, buffer_offset)
1416#else
1417 ! this is needed to make the compiler happy for non-GPU compilations
1418 buffer_offset = buffer
1419#endif
1422
1423 function zaccel_get_pointer_with_offset(buffer, offset) result(buffer_offset)
1424 type(c_ptr), intent(in) :: buffer
1425 integer(int64), intent(in) :: offset
1426 type(c_ptr) :: buffer_offset
1427
1429#ifdef HAVE_CUDA
1430 call cuda_get_pointer_with_offset(buffer, 2_int64*offset, buffer_offset)
1431#else
1432 ! this is needed to make the compiler happy for non-GPU compilations
1433 buffer_offset = buffer
1434#endif
1437
1438 subroutine accel_clean_pointer(buffer)
1439 type(c_ptr), intent(in) :: buffer
1440
1441 push_sub(accel_clean_pointer)
1442#ifdef HAVE_CUDA
1443 call cuda_clean_pointer(buffer)
1444#endif
1445 pop_sub(accel_clean_pointer)
1446 end subroutine accel_clean_pointer
1447
1451 subroutine accel_get_unfolded_size(size, grid_size, thread_block_size)
1452 integer(int64), intent(in) :: size
1453 integer(int64), intent(out) :: grid_size
1454 integer(int64), intent(out) :: thread_block_size
1455
1456 push_sub(accel_get_unfolded_size)
1457#ifdef __HIP_PLATFORM_AMD__
1458 ! not benefitial for AMD chips
1459 grid_size = size
1460 thread_block_size = size
1461#else
1462 grid_size = size * accel%warp_size
1463 thread_block_size = accel%warp_size
1464#endif
1466 end subroutine accel_get_unfolded_size
1467
1468#include "undef.F90"
1469#include "real.F90"
1470#include "accel_inc.F90"
1471
1472#include "undef.F90"
1473#include "complex.F90"
1474#include "accel_inc.F90"
1475
1476#include "undef.F90"
1477#include "integer.F90"
1478#include "accel_inc.F90"
1479
1480#include "undef.F90"
1481#include "integer8.F90"
1482#include "accel_inc.F90"
1483
1484end module accel_oct_m
1485
1486!! Local Variables:
1487!! mode: f90
1488!! coding: utf-8
1489!! End:
subroutine device_info()
Definition: accel.F90:628
subroutine laccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4253
subroutine zaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2784
type(accel_kernel_t), target, save, public dkernel_batch_axpy
Definition: accel.F90:279
subroutine, public accel_clean_pointer(buffer)
Definition: accel.F90:1404
subroutine accel_kernel_global_end()
Definition: accel.F90:1219
subroutine zaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2328
subroutine, public accel_get_unfolded_size(size, grid_size, thread_block_size)
Get unfolded size: some kernels (e.g. projectors) unfold the array across warps as an optimization....
Definition: accel.F90:1417
subroutine laccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3994
subroutine iaccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:3020
pure logical function, public accel_allow_cpu_only()
Definition: accel.F90:390
subroutine daccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:1970
logical pure function, public accel_use_shared_mem()
Definition: accel.F90:1198
subroutine zaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2693
subroutine zaccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2829
subroutine laccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3775
subroutine daccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:1706
subroutine laccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4140
subroutine zaccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:2457
subroutine zaccel_write_buffer_single(this, data, async)
Definition: accel.F90:2225
subroutine daccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:1793
integer buffer_alloc_count
Definition: accel.F90:372
subroutine iaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3529
subroutine zaccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:2516
subroutine laccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4164
subroutine iaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3152
type(accel_kernel_t), target, save, public zkernel_batch_axpy
Definition: accel.F90:280
subroutine zaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2547
subroutine iaccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:3001
subroutine zaccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2856
subroutine iaccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:3075
subroutine, public accel_kernel_start_call(this, file_name, kernel_name, flags)
Definition: accel.F90:1282
subroutine iaccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3051
subroutine iaccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:3579
subroutine iaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3371
subroutine zaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:2390
subroutine, public accel_get_stream(stream_number)
Definition: accel.F90:1346
subroutine accel_create_buffer_4(this, flags, type, size, set_zero, async)
Definition: accel.F90:798
subroutine zaccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:2590
integer(int64) pure function, public accel_global_memory_size()
Definition: accel.F90:1301
subroutine laccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:3725
type(accel_kernel_t), target, save, public zkernel_dot_matrix_spinors
Definition: accel.F90:278
type(accel_kernel_t), target, save, public zkernel_ax_function_py
Definition: accel.F90:282
subroutine daccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:1774
subroutine daccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:1574
subroutine zaccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:2677
subroutine daccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2083
subroutine iaccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3270
subroutine accel_set_kernel_arg_local(kernel, narg, type, size)
Definition: accel.F90:968
subroutine daccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2061
integer(int64) function accel_padded_size_i8(nn)
Definition: accel.F90:768
subroutine accel_check_bandwidth()
Definition: accel.F90:1139
subroutine laccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3904
subroutine daccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:1515
subroutine iaccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:3552
subroutine zaccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:2571
subroutine iaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3484
subroutine, public accel_finish()
Definition: accel.F90:940
subroutine accel_kernel_global_init()
Definition: accel.F90:1206
subroutine zaccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2648
integer(int64) allocated_mem
Definition: accel.F90:373
subroutine zaccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:2278
subroutine iaccel_get_device_pointer_1(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3416
subroutine laccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:4276
subroutine, public accel_ensure_buffer_size(buffer, flags, type, required_size, set_zero, async)
Definition: accel.F90:908
subroutine accel_set_buffer_to(buffer, type, val, nval, offset, async)
Definition: accel.F90:1062
subroutine laccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:4124
subroutine daccel_write_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:1555
subroutine daccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:1848
subroutine zaccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:2497
subroutine daccel_read_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:1824
subroutine laccel_write_buffer_single(this, data, async)
Definition: accel.F90:3672
subroutine daccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:1994
subroutine iaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:3332
subroutine zaccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:2352
subroutine zaccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:2371
subroutine accel_set_buffer_to_zero_i8(buffer, type, nval, offset, async)
Definition: accel.F90:1103
subroutine zaccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2761
logical pure function, public accel_buffer_is_allocated(this)
Definition: accel.F90:932
integer, parameter, public accel_mem_read_write
Definition: accel.F90:183
subroutine daccel_create_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2106
subroutine accel_kernel_end(this)
Definition: accel.F90:1264
type(accel_kernel_t), target, save, public dkernel_ax_function_py
Definition: accel.F90:281
subroutine laccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:3876
subroutine zaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2717
subroutine daccel_write_buffer_3(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:1605
subroutine laccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:4056
type(c_ptr) function, public daccel_get_pointer_with_offset(buffer, offset)
Definition: accel.F90:1374
subroutine iaccel_write_buffer_single(this, data, async)
Definition: accel.F90:2948
subroutine iaccel_get_device_pointer_2(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3440
integer pure function, public accel_max_size_per_dim(dim)
Definition: accel.F90:1317
subroutine iaccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3180
subroutine daccel_read_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:1734
subroutine iaccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:3313
subroutine laccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:3799
subroutine zaccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:2297
subroutine laccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:4018
subroutine laccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4231
subroutine iaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3462
subroutine zaccel_write_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:2429
subroutine daccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:1954
subroutine iaccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:3220
subroutine accel_kernel_run_8(kernel, globalsizes, localsizes)
Definition: accel.F90:999
subroutine, public accel_kernel_build(this, file_name, kernel_name, flags)
Definition: accel.F90:1240
subroutine, public accel_init(base_grp, namespace)
Definition: accel.F90:400
subroutine, public accel_end(namespace)
Definition: accel.F90:685
subroutine laccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:3685
subroutine, public accel_synchronize_all_streams()
Definition: accel.F90:1362
subroutine, public accel_set_stream(stream_number)
Definition: accel.F90:1329
subroutine, public accel_release_buffer(this, async)
Definition: accel.F90:867
subroutine laccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:3963
subroutine daccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:2133
subroutine laccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4208
subroutine iaccel_read_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:3294
integer(int32) function accel_padded_size_i4(nn)
Definition: accel.F90:789
subroutine accel_set_buffer_to_zero_i4(buffer, type, nval, offset, async)
Definition: accel.F90:1119
subroutine laccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:3818
subroutine iaccel_get_device_pointer_2l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:3507
subroutine iaccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:3113
pure logical function, public accel_is_enabled()
Definition: accel.F90:380
type(alloc_cache_t) memcache
Definition: accel.F90:375
subroutine daccel_write_buffer_0_int32(this, n1, data, offset, async)
Definition: accel.F90:1629
subroutine daccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:1667
subroutine iaccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:3094
integer, parameter, public accel_mem_write_only
Definition: accel.F90:183
subroutine daccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:1925
subroutine laccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:4037
subroutine daccel_read_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:1867
subroutine daccel_get_device_pointer_1l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2038
subroutine accel_kernel_run_4(kernel, globalsizes, localsizes)
Definition: accel.F90:1032
subroutine laccel_release_blas_alpha_beta_buffer(this, data, async)
Definition: accel.F90:4303
subroutine iaccel_read_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:3239
subroutine laccel_write_buffer_2(this, n1, n2, data, offset, async)
Definition: accel.F90:3744
subroutine laccel_write_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:3837
subroutine zaccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2739
type(c_ptr) function, public zaccel_get_pointer_with_offset(buffer, offset)
Definition: accel.F90:1389
subroutine laccel_read_buffer_3_int32(this, n1, n2, n3, data, offset, async)
Definition: accel.F90:4095
subroutine daccel_write_buffer_single(this, data, async)
Definition: accel.F90:1502
subroutine daccel_write_buffer_1_int32(this, n1, data, offset, async)
Definition: accel.F90:1648
integer function, public accel_kernel_workgroup_size(kernel)
Definition: accel.F90:1049
subroutine zaccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:2238
type(accel_t), public accel
Definition: accel.F90:250
subroutine laccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:4186
subroutine iaccel_set_kernel_arg_data(kernel, narg, data)
Definition: accel.F90:3400
subroutine daccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:1886
subroutine iaccel_write_buffer_0(this, n1, data, offset, async)
Definition: accel.F90:2961
subroutine zaccel_get_device_pointer_3l(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2806
subroutine accel_create_buffer_8(this, flags, type, size, set_zero, async)
Definition: accel.F90:811
subroutine laccel_read_buffer_1(this, n1, data, offset, async)
Definition: accel.F90:3944
integer(int64) pure function, public accel_local_memory_size()
Definition: accel.F90:1309
subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
Definition: accel.F90:952
integer pure function, public accel_max_workgroup_size()
Definition: accel.F90:1043
subroutine zaccel_read_buffer_2_int32(this, n1, n2, data, offset, async)
Definition: accel.F90:2609
subroutine daccel_get_device_pointer_3(host_pointer, device_pointer, dimensions)
Definition: accel.F90:2016
type(accel_kernel_t), pointer head
Definition: accel.F90:374
subroutine, public alloc_cache_put(alloc_cache, size, loc, put)
subroutine, public alloc_cache_get(alloc_cache, size, found, loc)
integer(int64), parameter, public alloc_cache_any_size
real(real64), parameter, public m_zero
Definition: global.F90:191
complex(real64), parameter, public m_z0
Definition: global.F90:201
complex(real64), parameter, public m_z1
Definition: global.F90:202
real(real64), parameter, public m_one
Definition: global.F90:192
System information (time, memory, sysname)
Definition: loct.F90:117
subroutine string_c_to_f(c_string, f_string)
convert a C string to a Fortran string
Definition: loct.F90:258
subroutine, public loct_sysname(name)
Definition: loct.F90:332
This module is intended to contain "only mathematical" functions and procedures.
Definition: math.F90:117
subroutine, public messages_print_with_emphasis(msg, iunit, namespace)
Definition: messages.F90:898
character(len=512), private msg
Definition: messages.F90:167
subroutine, public messages_warning(no_lines, all_nodes, namespace)
Definition: messages.F90:525
subroutine, public messages_obsolete_variable(namespace, name, rep)
Definition: messages.F90:1023
subroutine, public messages_new_line()
Definition: messages.F90:1112
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
Definition: messages.F90:162
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
Definition: messages.F90:410
subroutine, public messages_input_error(namespace, var, details, row, column)
Definition: messages.F90:691
subroutine, public messages_info(no_lines, iunit, debug_only, stress, all_nodes, namespace)
Definition: messages.F90:594
type(type_t), public type_float
Definition: types.F90:135
type(type_t), public type_cmplx
Definition: types.F90:136
integer pure function, public types_get_size(this)
Definition: types.F90:154
This module defines the unit system, used for input and output.
type(unit_t), public unit_gigabytes
For larger amounts of data (natural code units are bytes)
type(unit_t), public unit_megabytes
For large amounts of data (natural code units are bytes)
type(unit_t), public unit_kilobytes
For small amounts of data (natural code units are bytes)
int true(void)