Octopus
batch.F90
Go to the documentation of this file.
1!! Copyright (C) 2008 X. Andrade, 2020 S. Ohlmann
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17!!
18
19#include "global.h"
20
39!
40module batch_oct_m
41 use accel_oct_m
43 use blas_oct_m
44 use debug_oct_m
45 use global_oct_m
47 use iso_c_binding
48 use math_oct_m
50 use mpi_oct_m
52 use types_oct_m
53
54 implicit none
55
56 private
57 public :: &
58 batch_t, &
59 batch_init, &
63
66 type batch_t
67 private
68 integer, public :: nst
69 integer, public :: dim
70 integer :: np
71 integer :: ndims
72 integer, allocatable :: ist_idim_index(:, :)
73
77 integer, allocatable, public :: ist(:)
83
84 logical :: is_allocated
85 logical :: own_memory
86 ! We also need a linear array with the states in order to calculate derivatives, etc.
87 integer, public :: nst_linear
88
89 integer :: status_of
93 integer :: status_host
102 type(type_t) :: type_of
103 integer :: device_buffer_count
104 integer :: host_buffer_count
105 logical :: special_memory
106 logical :: needs_finish_unpack
107
108
109 ! unpacked variables; linear variables are pointers with different shapes
110 real(real64), pointer, contiguous, public :: dff(:, :, :)
111 ! !! indices are (1:np, 1:dim, 1:nst)
112 complex(real64), pointer, contiguous, public :: zff(:, :, :)
113 ! !! indices are (1:np, 1:dim, 1:nst)
114 real(real64), pointer, contiguous, public :: dff_linear(:, :)
115 ! !! indices are (1:np, 1:nst_linear)
116 complex(real64), pointer, contiguous, public :: zff_linear(:, :)
117 ! !! indices are (1:np, 1:nst_linear)
118
119 ! packed variables; only rank-2 arrays due to padding to powers of 2
120 real(real64), pointer, contiguous, public :: dff_pack(:, :)
121 ! !! indices are (1:nst_linear, 1:np)
122 complex(real64), pointer, contiguous, public :: zff_pack(:, :)
123 ! !! indices are (1:nst_linear, 1:np)
124
125 integer(int64), public :: pack_size(1:2)
127 integer(int64), public :: pack_size_real(1:2)
130
131 type(accel_mem_t), public :: ff_device
132
133 contains
134 procedure :: check_compatibility_with => batch_check_compatibility_with
135 procedure :: clone_to => batch_clone_to
136 procedure :: clone_to_array => batch_clone_to_array
137 procedure :: copy_to => batch_copy_to
138 procedure :: copy_data_to => batch_copy_data_to
139 procedure :: do_pack => batch_do_pack
140 procedure :: do_unpack => batch_do_unpack
141 procedure :: finish_unpack => batch_finish_unpack
142 procedure :: end => batch_end
143 procedure :: inv_index => batch_inv_index
144 procedure :: is_packed => batch_is_packed
145 procedure :: ist_idim_to_linear => batch_ist_idim_to_linear
146 procedure :: linear_to_idim => batch_linear_to_idim
147 procedure :: linear_to_ist => batch_linear_to_ist
148 procedure :: pack_total_size => batch_pack_total_size
149 procedure :: remote_access_start => batch_remote_access_start
150 procedure :: remote_access_stop => batch_remote_access_stop
151 procedure :: status => batch_status
152 procedure :: type => batch_type
153 procedure :: type_as_int => batch_type_as_integer
154 procedure, private :: dallocate_unpacked_host => dbatch_allocate_unpacked_host
155
156 procedure, private :: zallocate_unpacked_host => zbatch_allocate_unpacked_host
157
158 procedure, private :: allocate_unpacked_host => batch_allocate_unpacked_host
160 procedure, private :: dallocate_packed_host => dbatch_allocate_packed_host
162 procedure, private :: zallocate_packed_host => zbatch_allocate_packed_host
164 procedure, private :: allocate_packed_host => batch_allocate_packed_host
166 procedure, private :: allocate_packed_device => batch_allocate_packed_device
167
168 procedure, private :: deallocate_unpacked_host => batch_deallocate_unpacked_host
169
170 procedure, private :: deallocate_packed_host => batch_deallocate_packed_host
171
172 procedure, private :: deallocate_packed_device => batch_deallocate_packed_device
173
174 end type batch_t
175
176 !--------------------------------------------------------------
180 interface batch_init
181 module procedure dbatch_init_with_memory_3
183 module procedure dbatch_init_with_memory_2
184 module procedure zbatch_init_with_memory_2
185 module procedure dbatch_init_with_memory_1
187 end interface batch_init
188
189 integer, public, parameter :: &
190 batch_not_packed = 0, & !< functions are stored in CPU memory, unpacked order
191 batch_packed = 1, &
193
194 integer, parameter :: cl_pack_max_buffer_size = 4
197contains
199 !--------------------------------------------------------------
206 !
207 subroutine batch_end(this, copy)
208 class(batch_t), intent(inout) :: this
209 logical, optional, intent(in) :: copy
211
212 push_sub(batch_end)
214 if (this%own_memory .and. this%is_packed()) then
215 !deallocate directly to avoid unnecessary copies
216 if (this%status() == batch_device_packed) then
217 call this%deallocate_packed_device()
218 end if
219 if (this%status() == batch_packed .or. this%status_host == batch_packed) then
220 call this%deallocate_packed_host()
221 end if
222 this%status_of = batch_not_packed
223 this%status_host = batch_not_packed
224 this%host_buffer_count = 0
225 this%device_buffer_count = 0
226 end if
227 if (this%status() == batch_device_packed) call this%do_unpack(copy, force = .true.)
228 if (this%status() == batch_packed) call this%do_unpack(copy, force = .true.)
230 if (this%is_allocated) then
231 call this%deallocate_unpacked_host()
232 end if
234 safe_deallocate_a(this%ist_idim_index)
235 safe_deallocate_a(this%ist)
237 pop_sub(batch_end)
238 end subroutine batch_end
240 !--------------------------------------------------------------
246 class(batch_t), intent(inout) :: this
250 this%is_allocated = .false.
252 if (this%special_memory) then
253 if (associated(this%dff)) then
254 call deallocate_hardware_aware(c_loc(this%dff(1,1,1)), int(this%np, int64)*this%dim*this%nst*8)
255 end if
256 if (associated(this%zff)) then
257 call deallocate_hardware_aware(c_loc(this%zff(1,1,1)), int(this%np, int64)*this%dim*this%nst*16)
258 end if
259 else
260 safe_deallocate_p(this%dff)
261 safe_deallocate_p(this%zff)
262 end if
263 nullify(this%dff)
264 nullify(this%dff_linear)
265 nullify(this%zff)
266 nullify(this%zff_linear)
267
269 end subroutine batch_deallocate_unpacked_host
270
271 !--------------------------------------------------------------
275 !
276 subroutine batch_deallocate_packed_host(this)
277 class(batch_t), intent(inout) :: this
278
280
281 if (this%special_memory) then
282 if (associated(this%dff_pack)) then
283 call deallocate_hardware_aware(c_loc(this%dff_pack(1,1)), int(this%pack_size(1), int64)*this%pack_size(2)*8)
284 end if
285 if (associated(this%zff_pack)) then
286 call deallocate_hardware_aware(c_loc(this%zff_pack(1,1)), int(this%pack_size(1), int64)*this%pack_size(2)*16)
287 end if
288 else
289 safe_deallocate_p(this%dff_pack)
290 safe_deallocate_p(this%zff_pack)
291 end if
292 nullify(this%dff_pack)
293 nullify(this%zff_pack)
294
296 end subroutine batch_deallocate_packed_host
297
298 !--------------------------------------------------------------
301 subroutine batch_deallocate_packed_device(this)
302 class(batch_t), intent(inout) :: this
303
305
306 call accel_release_buffer(this%ff_device)
307
309 end subroutine batch_deallocate_packed_device
310
311 !--------------------------------------------------------------
315 !
316 subroutine batch_allocate_unpacked_host(this)
317 class(batch_t), intent(inout) :: this
318
320
321 if (this%type() == type_float) then
322 call this%dallocate_unpacked_host()
323 else if (this%type() == type_cmplx) then
324 call this%zallocate_unpacked_host()
325 end if
326
328 end subroutine batch_allocate_unpacked_host
329
330 !--------------------------------------------------------------
334 !
335 subroutine batch_allocate_packed_host(this)
336 class(batch_t), intent(inout) :: this
337
339
340 if (this%type() == type_float) then
341 call this%dallocate_packed_host()
342 else if (this%type() == type_cmplx) then
343 call this%zallocate_packed_host()
344 end if
345
347 end subroutine batch_allocate_packed_host
348
349 !--------------------------------------------------------------
353 !
354 subroutine batch_allocate_packed_device(this)
355 class(batch_t), intent(inout) :: this
356
358
359 call accel_create_buffer(this%ff_device, accel_mem_read_write, this%type(), &
360 product(this%pack_size))
361
363 end subroutine batch_allocate_packed_device
364
365 !--------------------------------------------------------------
371 !
372 subroutine batch_init_empty (this, dim, nst, np)
373 type(batch_t), intent(out) :: this
374 integer, intent(in) :: dim
375 integer, intent(in) :: nst
376 integer, intent(in) :: np
377
378 push_sub(batch_init_empty)
379
380 this%is_allocated = .false.
381 this%own_memory = .false.
382 this%special_memory = .false.
383 this%needs_finish_unpack = .false.
384 this%nst = nst
385 this%dim = dim
386 this%type_of = type_none
387
388 this%nst_linear = nst*dim
389
390 this%np = np
391 this%device_buffer_count = 0
392 this%host_buffer_count = 0
393 this%status_of = batch_not_packed
394 this%status_host = batch_not_packed
395
396 this%ndims = 2
397 safe_allocate(this%ist_idim_index(1:this%nst_linear, 1:this%ndims))
398 safe_allocate(this%ist(1:this%nst))
399
400 nullify(this%dff, this%zff, this%dff_linear, this%zff_linear)
401 nullify(this%dff_pack, this%zff_pack)
402
403 pop_sub(batch_init_empty)
404 end subroutine batch_init_empty
405
406 !--------------------------------------------------------------
411 !
412 subroutine batch_clone_to(this, dest, pack, copy_data, new_np, special, dest_type)
413 class(batch_t), intent(in) :: this
414 class(batch_t), allocatable, intent(out) :: dest
415 logical, optional, intent(in) :: pack
417 logical, optional, intent(in) :: copy_data
419 integer, optional, intent(in) :: new_np
420 logical, optional, intent(in) :: special
422 type(type_t), optional, intent(in) :: dest_type
423
424 push_sub(batch_clone_to)
425
426 if (.not. allocated(dest)) then
427 safe_allocate_type(batch_t, dest)
428 else
429 message(1) = "Internal error: destination batch in batch_clone_to has been previously allocated."
430 call messages_fatal(1)
431 end if
432
433 call this%copy_to(dest, pack, copy_data, new_np, special, dest_type)
434
435 pop_sub(batch_clone_to)
436 end subroutine batch_clone_to
437
438 !--------------------------------------------------------------
439
440 subroutine batch_clone_to_array(this, dest, n_batches, pack, copy_data, new_np, special, dest_type)
441 class(batch_t), intent(in) :: this
442 class(batch_t), allocatable, intent(out) :: dest(:)
443 integer, intent(in) :: n_batches
444 logical, optional, intent(in) :: pack
446 logical, optional, intent(in) :: copy_data
448 integer, optional, intent(in) :: new_np
449 logical, optional, intent(in) :: special
451 type(type_t), optional, intent(in) :: dest_type
452
453 integer :: ib
454
455 push_sub(batch_clone_to_array)
456
457 if (.not. allocated(dest)) then
458 safe_allocate_type_array(batch_t, dest, (1:n_batches))
459 else
460 message(1) = "Internal error: destination batch in batch_clone_to_array has been previously allocated."
461 call messages_fatal(1)
462 end if
463
464 do ib = 1, n_batches
465 call this%copy_to(dest(ib), pack, copy_data, new_np, special, dest_type)
466 end do
467
468 pop_sub(batch_clone_to_array)
469 end subroutine batch_clone_to_array
470
471 !--------------------------------------------------------------
476 subroutine batch_copy_to(this, dest, pack, copy_data, new_np, special, dest_type)
477 class(batch_t), intent(in) :: this
478 class(batch_t), intent(out) :: dest
479 logical, optional, intent(in) :: pack
481 logical, optional, intent(in) :: copy_data
483 integer, optional, intent(in) :: new_np
484 logical, optional, intent(in) :: special
486 type(type_t), optional, intent(in) :: dest_type
487
488 logical :: host_packed, special_
489 integer :: np_
490 type(type_t) :: type_
491
492 push_sub(batch_copy_to)
493
494 np_ = optional_default(new_np, this%np)
495
496 host_packed = this%host_buffer_count > 0
497 ! use special memory here only for batches not on the GPU to avoid allocating
498 ! pinned memory for temporary batches because that leads to a severe performance
499 ! decrease for GPU runs (up to 20x)
500 if (present(special)) then
501 special_ = this%special_memory
502 else
503 special_ = this%special_memory .and. .not. this%device_buffer_count > 0
504 end if
506 if (present(dest_type)) then
507 type_ = dest_type
508 else
509 type_ = this%type()
510 end if
511
512 if (type_ == type_float) then
513 call dbatch_init(dest, this%dim, 1, this%nst, np_, packed=host_packed, special=special_)
514 else if (type_ == type_cmplx) then
515 call zbatch_init(dest, this%dim, 1, this%nst, np_, packed=host_packed, special=special_)
516 else
517 message(1) = "Internal error: unknown batch type in batch_copy_to."
518 call messages_fatal(1)
519 end if
520
521 if (this%status() /= dest%status() .and. optional_default(pack, this%is_packed())) call dest%do_pack(copy = .false.)
522
523 dest%ist_idim_index(1:this%nst_linear, 1:this%ndims) = this%ist_idim_index(1:this%nst_linear, 1:this%ndims)
524 dest%ist(1:this%nst) = this%ist(1:this%nst)
525
526 if (optional_default(copy_data, .false.)) then
527 assert(np_ == this%np)
528 call this%copy_data_to(min(this%np, np_), dest)
529 end if
530
531 pop_sub(batch_copy_to)
532 end subroutine batch_copy_to
534 ! ----------------------------------------------------
539 type(type_t) pure function batch_type(this) result(btype)
540 class(batch_t), intent(in) :: this
541
542 btype = this%type_of
543
544 end function batch_type
545
546 ! ----------------------------------------------------
548 integer pure function batch_type_as_integer(this) result(itype)
549 class(batch_t), intent(in) :: this
550
551 type(type_t) :: btype
552
553 itype = 0
554 btype = this%type()
555 if (btype == type_float) itype = 1
556 if (btype == type_cmplx) itype = 2
557
558 end function batch_type_as_integer
559
560 ! ----------------------------------------------------
565 integer pure function batch_status(this) result(bstatus)
566 class(batch_t), intent(in) :: this
567
568 bstatus = this%status_of
569 end function batch_status
570
571 ! ----------------------------------------------------
572
573 logical pure function batch_is_packed(this) result(in_buffer)
574 class(batch_t), intent(in) :: this
575
576 in_buffer = (this%device_buffer_count > 0) .or. (this%host_buffer_count > 0)
577 end function batch_is_packed
578
579 ! ----------------------------------------------------
580
581 integer(int64) function batch_pack_total_size(this) result(size)
582 class(batch_t), intent(inout) :: this
583
584 size = this%np
585 if (accel_is_enabled()) size = accel_padded_size(size)
586 size = size*pad_pow2(this%nst_linear)*types_get_size(this%type())
587
588 end function batch_pack_total_size
589
590 ! ----------------------------------------------------
591
597 subroutine batch_do_pack(this, copy, async)
598 class(batch_t), intent(inout) :: this
599 logical, optional, intent(in) :: copy
600 logical, optional, intent(in) :: async
602
603 logical :: copy_
604 logical :: async_
605 integer :: source, target
606
607 ! no push_sub, called too frequently
608
609 call profiling_in("BATCH_DO_PACK")
610
611 copy_ = optional_default(copy, .true.)
612
613 async_ = optional_default(async, .false.)
614
615 ! get source and target states for this batch
616 source = this%status()
617 select case (source)
619 if (accel_is_enabled()) then
620 target = batch_device_packed
621 else
622 target = batch_packed
623 end if
625 target = batch_device_packed
626 end select
627
628 ! only do something if target is different from source
629 if (source /= target) then
630 select case (target)
632 call this%allocate_packed_device()
633 this%status_of = batch_device_packed ! Note that this%status_host remains untouched.
634
635 if (copy_) then
636 select case (source)
637 case (batch_not_packed)
638 ! copy from unpacked host array to device
640 case (batch_packed)
641 ! copy from packed host array to device
642 call batch_write_packed_to_device(this, async_)
643 end select
644 end if
645 case (batch_packed)
646 call this%allocate_packed_host()
647 this%status_of = batch_packed
648 this%status_host = batch_packed
649
650 if (copy_) then
651 if (this%type() == type_float) then
652 call dbatch_pack_copy(this)
653 else if (this%type() == type_cmplx) then
654 call zbatch_pack_copy(this)
655 end if
656 end if
657 if (this%own_memory) call this%deallocate_unpacked_host()
658 end select
659 end if
660
661 select case (target)
663 this%device_buffer_count = this%device_buffer_count + 1
664 case (batch_packed)
665 this%host_buffer_count = this%host_buffer_count + 1
666 end select
667
668 call profiling_out("BATCH_DO_PACK")
669 end subroutine batch_do_pack
670
671 ! ----------------------------------------------------
676 subroutine batch_do_unpack(this, copy, force, async)
677 class(batch_t), intent(inout) :: this
678 logical, optional, intent(in) :: copy
679 logical, optional, intent(in) :: force
680 logical, optional, intent(in) :: async
682
683 logical :: copy_, force_, async_
684 integer :: source, target
685
686 push_sub(batch_do_unpack)
687
688 call profiling_in("BATCH_DO_UNPACK")
689
690 copy_ = optional_default(copy, .true.)
691
692 force_ = optional_default(force, .false.)
693
694 async_ = optional_default(async, .false.)
695
696 ! get source and target states for this batch
697 source = this%status()
698 select case (source)
699 case (batch_not_packed)
700 target = source
701 case (batch_packed)
702 target = batch_not_packed
704 target = this%status_host
705 end select
706
707 ! only do something if target is different from source
708 if (source /= target) then
709 select case (source)
710 case (batch_packed)
711 if (this%host_buffer_count == 1 .or. force_) then
712 if (this%own_memory) call this%allocate_unpacked_host()
713 ! unpack from packed_host to unpacked_host
714 if (copy_ .or. this%own_memory) then
715 if (this%type() == type_float) then
716 call dbatch_unpack_copy(this)
717 else if (this%type() == type_cmplx) then
718 call zbatch_unpack_copy(this)
719 end if
720 end if
721 call this%deallocate_packed_host()
722 this%status_host = target
723 this%status_of = target
724 this%host_buffer_count = 1
725 end if
726 this%host_buffer_count = this%host_buffer_count - 1
728 if (this%device_buffer_count == 1 .or. force_) then
729 if (copy_) then
730 select case (target)
731 ! unpack from packed_device to unpacked_host
732 case (batch_not_packed)
734 ! unpack from packed_device to packed_host
735 case (batch_packed)
736 call batch_read_device_to_packed(this, async_)
737 end select
738 end if
739 if (async_) then
740 this%needs_finish_unpack = .true.
741 else
742 call this%deallocate_packed_device()
743 end if
744 this%status_of = target
745 this%device_buffer_count = 1
746 end if
747 this%device_buffer_count = this%device_buffer_count - 1
748 end select
749 end if
750
751 call profiling_out("BATCH_DO_UNPACK")
752
753 pop_sub(batch_do_unpack)
754 end subroutine batch_do_unpack
755
756 ! ----------------------------------------------------
758 subroutine batch_finish_unpack(this)
759 class(batch_t), intent(inout) :: this
760
761 push_sub(batch_finish_unpack)
762 if (this%needs_finish_unpack) then
763 call accel_finish()
764 call this%deallocate_packed_device()
765 this%needs_finish_unpack = .false.
766 end if
767 pop_sub(batch_finish_unpack)
768 end subroutine batch_finish_unpack
770 ! ----------------------------------------------------
771
772 subroutine batch_write_unpacked_to_device(this)
773 class(batch_t), intent(inout) :: this
774
775 integer :: ist, ist2
776 integer(int64) :: unroll
777 type(accel_mem_t) :: tmp
778 type(accel_kernel_t), pointer :: kernel
779
781
782 call profiling_in("BATCH_WRT_UNPACK_ACCEL")
783 if (this%nst_linear == 1) then
784 ! we can copy directly
785 if (this%type() == type_float) then
786 call accel_write_buffer(this%ff_device, ubound(this%dff_linear, dim=1), this%dff_linear(:, 1))
787 else if (this%type() == type_cmplx) then
788 call accel_write_buffer(this%ff_device, ubound(this%zff_linear, dim=1), this%zff_linear(:, 1))
789 else
790 assert(.false.)
791 end if
792
793 else
794 ! we copy to a temporary array and then we re-arrange data
795
796 if (this%type() == type_float) then
797 kernel => dpack
798 else
799 kernel => zpack
800 end if
801
802 unroll = min(int(cl_pack_max_buffer_size, int64), this%pack_size(1))
803
804 call accel_create_buffer(tmp, accel_mem_read_only, this%type(), unroll*this%pack_size(2))
805
806 do ist = 1, this%nst_linear, int(unroll, int32)
807
808 ! copy a number 'unroll' of states to the buffer
809 do ist2 = ist, min(ist + int(unroll, int32) - 1, this%nst_linear)
810
811 if (this%type() == type_float) then
812 call accel_write_buffer(tmp, ubound(this%dff_linear, dim=1, kind=int64), this%dff_linear(:, ist2), &
813 offset = (ist2 - ist)*this%pack_size(2))
814 else
815 call accel_write_buffer(tmp, ubound(this%zff_linear, dim=1, kind=int64), this%zff_linear(:, ist2), &
816 offset = (ist2 - ist)*this%pack_size(2))
817 end if
818 end do
819
820 ! now call an opencl kernel to rearrange the data
821 call accel_set_kernel_arg(kernel, 0, int(this%pack_size(1), int32))
822 call accel_set_kernel_arg(kernel, 1, int(this%pack_size(2), int32))
823 call accel_set_kernel_arg(kernel, 2, ist - 1)
824 call accel_set_kernel_arg(kernel, 3, tmp)
825 call accel_set_kernel_arg(kernel, 4, this%ff_device)
826
827 call profiling_in("CL_PACK")
828 call accel_kernel_run(kernel, (/this%pack_size(2), unroll/), (/accel_max_workgroup_size()/unroll, unroll/))
829
830 if (this%type() == type_float) then
831 call profiling_count_transfers(unroll*this%pack_size(2), m_one)
832 else
833 call profiling_count_transfers(unroll*this%pack_size(2), m_zi)
834 end if
835
836 call accel_finish()
837 call profiling_out("CL_PACK")
838
839 end do
840
841 call accel_release_buffer(tmp)
842
843 end if
844
845 call profiling_out("BATCH_WRT_UNPACK_ACCEL")
847 end subroutine batch_write_unpacked_to_device
848
849 ! ------------------------------------------------------------------
850
852 class(batch_t), intent(inout) :: this
853
854 integer :: ist, ist2
855 integer(int64) :: unroll
856 type(accel_mem_t) :: tmp
857 type(accel_kernel_t), pointer :: kernel
858
860 call profiling_in("BATCH_READ_UNPACKED_ACCEL")
861
862 if (this%nst_linear == 1) then
863 ! we can copy directly
864 if (this%type() == type_float) then
865 call accel_read_buffer(this%ff_device, ubound(this%dff_linear, dim=1), this%dff_linear(:, 1))
866 else
867 call accel_read_buffer(this%ff_device, ubound(this%zff_linear, dim=1), this%zff_linear(:, 1))
868 end if
869 else
870
871 unroll = min(int(cl_pack_max_buffer_size, int64), this%pack_size(1))
872
873 ! we use a kernel to move to a temporary array and then we read
874 call accel_create_buffer(tmp, accel_mem_write_only, this%type(), unroll*this%pack_size(2))
875
876 if (this%type() == type_float) then
877 kernel => dunpack
878 else
879 kernel => zunpack
880 end if
881
882 do ist = 1, this%nst_linear, int(unroll, int32)
883 call accel_set_kernel_arg(kernel, 0, int(this%pack_size(1), int32))
884 call accel_set_kernel_arg(kernel, 1, int(this%pack_size(2), int32))
885 call accel_set_kernel_arg(kernel, 2, ist - 1)
886 call accel_set_kernel_arg(kernel, 3, this%ff_device)
887 call accel_set_kernel_arg(kernel, 4, tmp)
888
889 call profiling_in("CL_UNPACK")
890 call accel_kernel_run(kernel, (/unroll, this%pack_size(2)/), (/unroll, accel_max_workgroup_size()/unroll/))
891
892 if (this%type() == type_float) then
893 call profiling_count_transfers(unroll*this%pack_size(2), m_one)
894 else
895 call profiling_count_transfers(unroll*this%pack_size(2), m_zi)
896 end if
897
898 call accel_finish()
899 call profiling_out("CL_UNPACK")
900
901 ! copy a number 'unroll' of states from the buffer
902 do ist2 = ist, min(ist + int(unroll, int32) - 1, this%nst_linear)
903
904 if (this%type() == type_float) then
905 call accel_read_buffer(tmp, ubound(this%dff_linear, dim=1, kind=int64), this%dff_linear(:, ist2), &
906 offset = (ist2 - ist)*this%pack_size(2))
907 else
908 call accel_read_buffer(tmp, ubound(this%zff_linear, dim=1, kind=int64), this%zff_linear(:, ist2), &
909 offset = (ist2 - ist)*this%pack_size(2))
910 end if
911 end do
912
913 end do
914
915 call accel_release_buffer(tmp)
916 end if
917
918 call profiling_out("BATCH_READ_UNPACKED_ACCEL")
920 end subroutine batch_read_device_to_unpacked
921
922 ! ------------------------------------------------------------------
923 subroutine batch_write_packed_to_device(this, async)
924 class(batch_t), intent(inout) :: this
925 logical, optional, intent(in) :: async
926
927
929
930 call profiling_in("BATCH_WRITE_PACKED_ACCEL")
931 if (this%type() == type_float) then
932 call accel_write_buffer(this%ff_device, product(this%pack_size), this%dff_pack, async=async)
933 else
934 call accel_write_buffer(this%ff_device, product(this%pack_size), this%zff_pack, async=async)
935 end if
936 call profiling_out("BATCH_WRITE_PACKED_ACCEL")
937
939 end subroutine batch_write_packed_to_device
940
941 ! ------------------------------------------------------------------
942 subroutine batch_read_device_to_packed(this, async)
943 class(batch_t), intent(inout) :: this
944 logical, optional, intent(in) :: async
945
946
948
949 call profiling_in("BATCH_READ_PACKED_ACCEL")
950 if (this%type() == type_float) then
951 call accel_read_buffer(this%ff_device, product(this%pack_size), this%dff_pack, async=async)
952 else
953 call accel_read_buffer(this%ff_device, product(this%pack_size), this%zff_pack, async=async)
954 end if
955 call profiling_out("BATCH_READ_PACKED_ACCEL")
956
958 end subroutine batch_read_device_to_packed
959
960 ! ------------------------------------------------------
965 integer function batch_inv_index(this, cind) result(index)
966 class(batch_t), intent(in) :: this
967 integer, intent(in) :: cind(:)
968
969 do index = 1, this%nst_linear
970 if (all(cind(1:this%ndims) == this%ist_idim_index(index, 1:this%ndims))) exit
971 end do
972
973 assert(index <= this%nst_linear)
974
975 end function batch_inv_index
976
977 ! ------------------------------------------------------
981 !
982 integer pure function batch_ist_idim_to_linear(this, cind) result(index)
983 class(batch_t), intent(in) :: this
984 integer, intent(in) :: cind(:)
985
986 if (ubound(cind, dim = 1) == 1) then
987 index = cind(1)
988 else
989 index = (cind(1) - 1)*this%dim + cind(2)
990 end if
991
992 end function batch_ist_idim_to_linear
993
994 ! ------------------------------------------------------
999 !
1000 integer pure function batch_linear_to_ist(this, linear_index) result(ist)
1001 class(batch_t), intent(in) :: this
1002 integer, intent(in) :: linear_index
1003
1004 ist = this%ist_idim_index(linear_index, 1)
1005
1006 end function batch_linear_to_ist
1007
1008 ! ------------------------------------------------------
1010 !
1011 integer pure function batch_linear_to_idim(this, linear_index) result(idim)
1012 class(batch_t), intent(in) :: this
1013 integer, intent(in) :: linear_index
1014
1015 idim = this%ist_idim_index(linear_index, 2)
1017 end function batch_linear_to_idim
1018
1019 ! ------------------------------------------------------
1028 !
1029 subroutine batch_remote_access_start(this, mpi_grp, rma_win)
1030 class(batch_t), intent(inout) :: this
1031 type(mpi_grp_t), intent(in) :: mpi_grp
1032 type(mpi_win), intent(out) :: rma_win
1033
1036 if (mpi_grp%size > 1) then
1037
1038 assert(.not. accel_is_enabled())
1039
1040 call this%do_pack()
1041
1042 if (this%type() == type_cmplx) then
1043#ifdef HAVE_MPI
1044 call mpi_win_create(this%zff_pack(1, 1), int(product(this%pack_size)*types_get_size(this%type()), mpi_address_kind), &
1045 types_get_size(this%type()), mpi_info_null, mpi_grp%comm, rma_win, mpi_err)
1046#endif
1047 else if (this%type() == type_float) then
1048#ifdef HAVE_MPI
1049 call mpi_win_create(this%dff_pack(1, 1), int(product(this%pack_size)*types_get_size(this%type()), mpi_address_kind), &
1050 types_get_size(this%type()), mpi_info_null, mpi_grp%comm, rma_win, mpi_err)
1051#endif
1052 else
1053 message(1) = "Internal error: unknown batch type in batch_remote_access_start."
1054 call messages_fatal(1)
1055 end if
1056
1057 else
1058 rma_win = mpi_win_null
1059 end if
1060
1062 end subroutine batch_remote_access_start
1063
1064 ! ------------------------------------------------------
1070 !
1071 subroutine batch_remote_access_stop(this, rma_win)
1072 class(batch_t), intent(inout) :: this
1073 type(mpi_win), intent(inout) :: rma_win
1074
1076
1077 if (rma_win /= mpi_win_null) then
1078#ifdef HAVE_MPI
1079 call mpi_win_free(rma_win, mpi_err)
1080#endif
1081 call this%do_unpack()
1082 end if
1083
1085 end subroutine batch_remote_access_stop
1086
1087 ! --------------------------------------------------------------
1089 !
1090 subroutine batch_copy_data_to(this, np, dest, async)
1091 class(batch_t), intent(in) :: this
1092 integer, intent(in) :: np
1093 class(batch_t), intent(inout) :: dest
1094 logical, optional, intent(in) :: async
1095
1096 integer(int64) :: localsize, dim2, dim3
1097 integer :: ist, ip
1098
1099 push_sub(batch_copy_data_to)
1100 call profiling_in("BATCH_COPY_DATA_TO")
1101
1102 ! this routine can be used to copy data between batches of different type
1103 call this%check_compatibility_with(dest, type_check=.false.)
1105 if (this%type() == dest%type()) then
1106 select case (this%status())
1107 case (batch_device_packed)
1108 call accel_set_kernel_arg(kernel_copy, 0, np)
1109 call accel_set_kernel_arg(kernel_copy, 1, this%ff_device)
1110 call accel_set_kernel_arg(kernel_copy, 2, log2(int(this%pack_size_real(1), int32)))
1111 call accel_set_kernel_arg(kernel_copy, 3, dest%ff_device)
1112 call accel_set_kernel_arg(kernel_copy, 4, log2(int(dest%pack_size_real(1), int32)))
1113
1114 localsize = accel_kernel_workgroup_size(kernel_copy)/dest%pack_size_real(1)
1115
1116 dim3 = np/(accel_max_size_per_dim(2)*localsize) + 1
1117 dim2 = min(accel_max_size_per_dim(2)*localsize, pad(int(np, int64), localsize))
1118
1119 call accel_kernel_run(kernel_copy, (/dest%pack_size_real(1), dim2, dim3/), (/dest%pack_size_real(1), localsize, 1_int64/))
1120
1121 if(.not. optional_default(async, .false.)) call accel_finish()
1123 case (batch_packed)
1124 if (np*this%pack_size(1) > huge(0_int32)) then
1125 ! BLAS cannot handle 8-byte integers, so we need a special version here
1126 do ip = 1, np
1127 if (dest%type() == type_float) then
1128 call blas_copy(int(this%pack_size(1), int32), this%dff_pack(1, ip), 1, dest%dff_pack(1, ip), 1)
1129 else
1130 call blas_copy(int(this%pack_size(1), int32), this%zff_pack(1, ip), 1, dest%zff_pack(1, ip), 1)
1131 end if
1132 end do
1133 else
1134 if (dest%type() == type_float) then
1135 call blas_copy(int(this%pack_size(1)*np, int32), this%dff_pack(1, 1), 1, dest%dff_pack(1, 1), 1)
1136 else
1137 call blas_copy(int(this%pack_size(1)*np, int32), this%zff_pack(1, 1), 1, dest%zff_pack(1, 1), 1)
1138 end if
1139 end if
1140
1141 case (batch_not_packed)
1142 do ist = 1, dest%nst_linear
1143 if (dest%type() == type_cmplx) then
1144 call blas_copy(np, this%zff_linear(1, ist), 1, dest%zff_linear(1, ist), 1)
1145 else
1146 call blas_copy(np, this%dff_linear(1, ist), 1, dest%dff_linear(1, ist), 1)
1147 end if
1148 end do
1149
1150 end select
1151 else if (this%type() == type_cmplx) then
1152 ! copy complex -> real
1153 select case (this%status())
1154 case (batch_device_packed)
1155 call accel_set_kernel_arg(kernel_copy_complex_to_real, 0, np)
1156 call accel_set_kernel_arg(kernel_copy_complex_to_real, 1, this%ff_device)
1157 call accel_set_kernel_arg(kernel_copy_complex_to_real, 2, log2(int(this%pack_size_real(1), int32)))
1158 call accel_set_kernel_arg(kernel_copy_complex_to_real, 3, dest%ff_device)
1159 call accel_set_kernel_arg(kernel_copy_complex_to_real, 4, log2(int(dest%pack_size_real(1), int32)))
1160
1161 localsize = accel_kernel_workgroup_size(kernel_copy_complex_to_real)/dest%pack_size_real(1)
1162
1163 dim3 = np/(accel_max_size_per_dim(2)*localsize) + 1
1164 dim2 = min(accel_max_size_per_dim(2)*localsize, pad(int(np, int64), localsize))
1165
1166 call accel_kernel_run(kernel_copy_complex_to_real, &
1167 (/dest%pack_size_real(1), dim2, dim3/), (/dest%pack_size_real(1), localsize, 1_int64/))
1168
1169 if(.not. optional_default(async, .false.)) call accel_finish()
1170
1171 case (batch_packed)
1172 !$omp parallel do private(ist, ip)
1173 do ip = 1, np
1174 !$omp simd
1175 do ist = 1, dest%nst_linear
1176 dest%dff_pack(ist, ip) = real(this%zff_pack(ist, ip), real64)
1177 end do
1178 end do
1179 !$omp end parallel do
1180 case (batch_not_packed)
1181 !$omp parallel private(ist, ip)
1182 do ist = 1, dest%nst_linear
1183 !$omp do
1184 do ip = 1, np
1185 dest%dff_linear(ip, ist) = real(this%zff_linear(ip, ist), real64)
1186 end do
1187 !$omp end do nowait
1188 end do
1189 !$omp end parallel
1190 end select
1191 else if (this%type() == type_float) then
1192 ! copy real -> complex
1193 select case (this%status())
1194 case (batch_device_packed)
1195 call accel_set_kernel_arg(kernel_copy_real_to_complex, 0, np)
1196 call accel_set_kernel_arg(kernel_copy_real_to_complex, 1, this%ff_device)
1197 call accel_set_kernel_arg(kernel_copy_real_to_complex, 2, log2(int(this%pack_size_real(1), int32)))
1198 call accel_set_kernel_arg(kernel_copy_real_to_complex, 3, dest%ff_device)
1199 call accel_set_kernel_arg(kernel_copy_real_to_complex, 4, log2(int(dest%pack_size_real(1), int32)))
1200
1201 localsize = accel_kernel_workgroup_size(kernel_copy_real_to_complex)/this%pack_size_real(1)
1202
1203 dim3 = np/(accel_max_size_per_dim(2)*localsize) + 1
1204 dim2 = min(accel_max_size_per_dim(2)*localsize, pad(int(np, int64), localsize))
1205
1206 call accel_kernel_run(kernel_copy_real_to_complex, &
1207 (/this%pack_size_real(1), dim2, dim3/), (/this%pack_size_real(1), localsize, 1_int64/))
1208
1209 if(.not. optional_default(async, .false.)) call accel_finish()
1210
1211 case (batch_packed)
1212 !$omp parallel do private(ist, ip)
1213 do ip = 1, np
1214 !$omp simd
1215 do ist = 1, dest%nst_linear
1216 dest%zff_pack(ist, ip) = cmplx(this%dff_pack(ist, ip), m_zero, real64)
1217 end do
1218 end do
1219 !$omp end parallel do
1220 case (batch_not_packed)
1221 !$omp parallel private(ist, ip)
1222 do ist = 1, dest%nst_linear
1223 !$omp do
1224 do ip = 1, np
1225 dest%zff_linear(ip, ist) = cmplx(this%dff_linear(ip, ist), m_zero, real64)
1226 end do
1227 !$omp end do nowait
1228 end do
1229 !$omp end parallel
1230 end select
1231 else
1232 message(1) = "Error! This should not happen."
1233 call messages_fatal(1)
1234 end if
1235
1236 call profiling_out("BATCH_COPY_DATA_TO")
1237 pop_sub(batch_copy_data_to)
1238 end subroutine batch_copy_data_to
1239
1240 ! --------------------------------------------------------------
1242 !
1243 subroutine batch_check_compatibility_with(this, target, only_check_dim, type_check)
1244 class(batch_t), intent(in) :: this
1245 class(batch_t), intent(in) :: target
1246 logical, optional, intent(in) :: only_check_dim
1247 logical, optional, intent(in) :: type_check
1248
1250
1251 if (optional_default(type_check, .true.)) then
1252 assert(this%type() == target%type())
1253 end if
1254 if (.not. optional_default(only_check_dim, .false.)) then
1255 assert(this%nst_linear == target%nst_linear)
1256 end if
1257 assert(this%status() == target%status())
1258 assert(this%dim == target%dim)
1259
1261
1262 end subroutine batch_check_compatibility_with
1263
1264!--------------------------------------------------------------
1267 subroutine batch_build_indices(this, st_start, st_end)
1268 class(batch_t), intent(inout) :: this
1269 integer, intent(in) :: st_start
1270 integer, intent(in) :: st_end
1271
1272 integer :: idim, ii, ist
1273
1274 push_sub(batch_build_indices)
1275
1276 do ist = st_start, st_end
1277 ! now we also populate the linear array
1278 do idim = 1, this%dim
1279 ii = this%dim*(ist - st_start) + idim
1280 this%ist_idim_index(ii, 1) = ist
1281 this%ist_idim_index(ii, 2) = idim
1282 end do
1283 this%ist(ist - st_start + 1) = ist
1284 end do
1285
1286 ! compute packed sizes
1287 this%pack_size(1) = pad_pow2(this%nst_linear)
1288 this%pack_size(2) = this%np
1289 if (accel_is_enabled()) this%pack_size(2) = accel_padded_size(this%pack_size(2))
1290
1291 this%pack_size_real = this%pack_size
1292 if (type_is_complex(this%type())) this%pack_size_real(1) = 2*this%pack_size_real(1)
1293
1294 pop_sub(batch_build_indices)
1295 end subroutine batch_build_indices
1296
1297
1298#include "real.F90"
1299#include "batch_inc.F90"
1300#include "undef.F90"
1301
1302#include "complex.F90"
1303#include "batch_inc.F90"
1304#include "undef.F90"
1305
1306end module batch_oct_m
1307
1308!! Local Variables:
1309!! mode: f90
1310!! coding: utf-8
1311!! End:
initialize a batch with existing memory
Definition: batch.F90:273
double log2(double __x) __attribute__((__nothrow__
integer, parameter, public accel_mem_read_write
Definition: accel.F90:183
subroutine, public accel_release_buffer(this)
Definition: accel.F90:1250
This module contains interfaces for routines in allocate_hardware_aware.c.
subroutine, public deallocate_hardware_aware(array, size)
This module implements batches of mesh functions.
Definition: batch.F90:133
type(type_t) pure function batch_type(this)
return the type of a batch
Definition: batch.F90:633
subroutine zbatch_pack_copy(this)
copy data from the unpacked to the packed arrays
Definition: batch.F90:1924
integer, parameter, public batch_not_packed
functions are stored in CPU memory, unpacked order
Definition: batch.F90:282
integer, parameter, public batch_device_packed
functions are stored in device memory in packed order
Definition: batch.F90:282
subroutine zbatch_init_with_memory_3(this, dim, st_start, st_end, psi)
initialize a batch with an rank-3 array of TYPE_CMPLX valued mesh functions psi.
Definition: batch.F90:1748
subroutine batch_check_compatibility_with(this, target, only_check_dim, type_check)
check whether two batches have compatible dimensions (and type)
Definition: batch.F90:1337
integer, parameter cl_pack_max_buffer_size
this value controls the size (in number of wave-functions) of the buffer used to copy states to the o...
Definition: batch.F90:287
logical pure function batch_is_packed(this)
Definition: batch.F90:667
subroutine dbatch_unpack_copy(this)
copy data from the packed to the unpacked arrays
Definition: batch.F90:1649
subroutine dbatch_init_with_memory_1(this, psi)
initialize a batch with an rank-1 array of TYPE_FLOAT valued mesh functions psi.
Definition: batch.F90:1509
subroutine batch_write_unpacked_to_device(this)
Definition: batch.F90:866
subroutine batch_do_unpack(this, copy, force, async)
unpack a batch
Definition: batch.F90:770
subroutine batch_finish_unpack(this)
finish the unpacking if do_unpack() was called with async=.true.
Definition: batch.F90:852
subroutine zbatch_allocate_unpacked_host(this)
allocate host (CPU) memory for unpacked data of type TYPE_CMPLX
Definition: batch.F90:1835
subroutine batch_deallocate_packed_device(this)
release packed device memory
Definition: batch.F90:395
integer pure function batch_type_as_integer(this)
For debuging purpose only.
Definition: batch.F90:642
integer function batch_inv_index(this, cind)
inverse index lookup
Definition: batch.F90:1059
subroutine dbatch_init_with_memory_2(this, dim, st_start, st_end, psi)
initialize a batch with an rank-2 array of TYPE_FLOAT valued mesh functions psi.
Definition: batch.F90:1480
subroutine batch_allocate_packed_host(this)
allocate host (CPU) memory for packed data
Definition: batch.F90:429
subroutine, public zbatch_init(this, dim, st_start, st_end, np, special, packed)
initialize a TYPE_CMPLX valued batch to given size without providing external memory
Definition: batch.F90:1885
subroutine zbatch_allocate_packed_host(this)
allocate host (CPU) memory for packed data of type TYPE_CMPLX
Definition: batch.F90:1861
subroutine batch_clone_to(this, dest, pack, copy_data, new_np, special, dest_type)
clone a batch to a new batch
Definition: batch.F90:506
subroutine batch_remote_access_stop(this, rma_win)
stop the remote access to the batch
Definition: batch.F90:1165
subroutine batch_read_device_to_unpacked(this)
Definition: batch.F90:945
subroutine zbatch_init_with_memory_1(this, psi)
initialize a batch with an rank-1 array of TYPE_CMPLX valued mesh functions psi.
Definition: batch.F90:1814
subroutine dbatch_allocate_packed_host(this)
allocate host (CPU) memory for packed data of type TYPE_FLOAT
Definition: batch.F90:1556
subroutine batch_clone_to_array(this, dest, n_batches, pack, copy_data, new_np, special, dest_type)
Definition: batch.F90:534
subroutine batch_allocate_packed_device(this)
allocate device (GPU) memory for packed data
Definition: batch.F90:448
subroutine batch_build_indices(this, st_start, st_end)
build the index ist(:) and ist_idim_index(:,:) and set pack_size
Definition: batch.F90:1361
integer pure function batch_ist_idim_to_linear(this, cind)
direct index lookup
Definition: batch.F90:1076
integer pure function batch_linear_to_ist(this, linear_index)
get state index ist from linear (combined dim and nst) index
Definition: batch.F90:1094
subroutine batch_copy_to(this, dest, pack, copy_data, new_np, special, dest_type)
make a copy of a batch
Definition: batch.F90:570
subroutine, public batch_read_device_to_packed(this, async)
Definition: batch.F90:1036
subroutine batch_write_packed_to_device(this, async)
Definition: batch.F90:1017
subroutine dbatch_init_with_memory_3(this, dim, st_start, st_end, psi)
initialize a batch with an rank-3 array of TYPE_FLOAT valued mesh functions psi.
Definition: batch.F90:1443
subroutine batch_allocate_unpacked_host(this)
allocate host (CPU) memory for unpacked data
Definition: batch.F90:410
subroutine batch_init_empty(this, dim, nst, np)
initialize an empty batch
Definition: batch.F90:466
subroutine, public dbatch_init(this, dim, st_start, st_end, np, special, packed)
initialize a TYPE_FLOAT valued batch to given size without providing external memory
Definition: batch.F90:1580
subroutine zbatch_init_with_memory_2(this, dim, st_start, st_end, psi)
initialize a batch with an rank-2 array of TYPE_CMPLX valued mesh functions psi.
Definition: batch.F90:1785
integer pure function batch_linear_to_idim(this, linear_index)
extract idim from linear index
Definition: batch.F90:1105
subroutine batch_remote_access_start(this, mpi_grp, rma_win)
start remote access to a batch on another node
Definition: batch.F90:1123
subroutine batch_copy_data_to(this, np, dest, async)
copy data to another batch.
Definition: batch.F90:1184
subroutine batch_do_pack(this, copy, async)
pack the data in a batch
Definition: batch.F90:691
subroutine dbatch_allocate_unpacked_host(this)
allocate host (CPU) memory for unpacked data of type TYPE_FLOAT
Definition: batch.F90:1530
subroutine dbatch_pack_copy(this)
copy data from the unpacked to the packed arrays
Definition: batch.F90:1619
integer pure function batch_status(this)
return the status of a batch
Definition: batch.F90:659
subroutine batch_deallocate_unpacked_host(this)
release unpacked host memory
Definition: batch.F90:339
integer, parameter, public batch_packed
functions are stored in CPU memory, in transposed (packed) order
Definition: batch.F90:282
subroutine batch_deallocate_packed_host(this)
release packed host memory
Definition: batch.F90:370
integer(int64) function batch_pack_total_size(this)
Definition: batch.F90:675
subroutine batch_end(this, copy)
finalize a batch and release allocated memory, if necessary
Definition: batch.F90:301
subroutine zbatch_unpack_copy(this)
copy data from the packed to the unpacked arrays
Definition: batch.F90:1954
This module contains interfaces for BLAS routines You should not use these routines directly....
Definition: blas.F90:118
This module is intended to contain "only mathematical" functions and procedures.
Definition: math.F90:115
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
Definition: messages.F90:160
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
Definition: messages.F90:414
type(type_t), public type_float
Definition: types.F90:133
type(type_t), public type_cmplx
Definition: types.F90:134
type(type_t), public type_none
Definition: types.F90:132
Class defining batches of mesh functions.
Definition: batch.F90:159
int true(void)