Octopus
batch.F90
Go to the documentation of this file.
1!! Copyright (C) 2008 X. Andrade, 2020 S. Ohlmann
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17!!
18
19#include "global.h"
20
39!
40module batch_oct_m
41 use accel_oct_m
43 use blas_oct_m
44 use debug_oct_m
45 use global_oct_m
47 use iso_c_binding
48 use math_oct_m
50 use mpi_oct_m
52 use types_oct_m
53
54 implicit none
55
56 private
57 public :: &
58 batch_t, &
59 batch_init, &
63
66 type batch_t
67 private
68 integer, public :: nst
69 integer, public :: dim
70 integer :: np
71 integer :: ndims
72 integer, allocatable :: ist_idim_index(:, :)
73
77 integer, allocatable, public :: ist(:)
83
84 logical :: is_allocated
85 logical :: own_memory
86 ! We also need a linear array with the states in order to calculate derivatives, etc.
87 integer, public :: nst_linear
88
89 integer :: status_of
93 integer :: status_host
102 type(type_t) :: type_of
103 integer :: device_buffer_count
104 integer :: host_buffer_count
105 logical :: special_memory
106 logical :: needs_finish_unpack
107
108
109 ! unpacked variables; linear variables are pointers with different shapes
110 real(real64), pointer, contiguous, public :: dff(:, :, :)
111 complex(real64), pointer, contiguous, public :: zff(:, :, :)
112 real(real64), pointer, contiguous, public :: dff_linear(:, :)
113 complex(real64), pointer, contiguous, public :: zff_linear(:, :)
114
115 ! packed variables; only rank-2 arrays due to padding to powers of 2
116 real(real64), pointer, contiguous, public :: dff_pack(:, :)
117 complex(real64), pointer, contiguous, public :: zff_pack(:, :)
118
119 integer(int64), public :: pack_size(1:2)
121 integer(int64), public :: pack_size_real(1:2)
124
125 type(accel_mem_t), public :: ff_device
126
127 contains
128 procedure :: check_compatibility_with => batch_check_compatibility_with
129 procedure :: clone_to => batch_clone_to
130 procedure :: clone_to_array => batch_clone_to_array
131 procedure :: copy_to => batch_copy_to
132 procedure :: copy_data_to => batch_copy_data_to
133 procedure :: do_pack => batch_do_pack
134 procedure :: do_unpack => batch_do_unpack
135 procedure :: finish_unpack => batch_finish_unpack
136 procedure :: end => batch_end
137 procedure :: inv_index => batch_inv_index
138 procedure :: is_packed => batch_is_packed
139 procedure :: ist_idim_to_linear => batch_ist_idim_to_linear
140 procedure :: linear_to_idim => batch_linear_to_idim
141 procedure :: linear_to_ist => batch_linear_to_ist
142 procedure :: pack_total_size => batch_pack_total_size
143 procedure :: remote_access_start => batch_remote_access_start
144 procedure :: remote_access_stop => batch_remote_access_stop
145 procedure :: status => batch_status
146 procedure :: type => batch_type
147 procedure :: type_as_int => batch_type_as_integer
148 procedure, private :: dallocate_unpacked_host => dbatch_allocate_unpacked_host
149
150 procedure, private :: zallocate_unpacked_host => zbatch_allocate_unpacked_host
151
152 procedure, private :: allocate_unpacked_host => batch_allocate_unpacked_host
153
154 procedure, private :: dallocate_packed_host => dbatch_allocate_packed_host
155
156 procedure, private :: zallocate_packed_host => zbatch_allocate_packed_host
157
158 procedure, private :: allocate_packed_host => batch_allocate_packed_host
160 procedure, private :: allocate_packed_device => batch_allocate_packed_device
162 procedure, private :: deallocate_unpacked_host => batch_deallocate_unpacked_host
164 procedure, private :: deallocate_packed_host => batch_deallocate_packed_host
166 procedure, private :: deallocate_packed_device => batch_deallocate_packed_device
167
168 end type batch_t
169
170 !--------------------------------------------------------------
174 interface batch_init
175 module procedure dbatch_init_with_memory_3
176 module procedure zbatch_init_with_memory_3
179 module procedure dbatch_init_with_memory_1
181 end interface batch_init
183 integer, public, parameter :: &
184 batch_not_packed = 0, & !< functions are stored in CPU memory, unpacked order
185 batch_packed = 1, &
187
188 integer, parameter :: cl_pack_max_buffer_size = 4
190
191contains
192
193 !--------------------------------------------------------------
200 !
201 subroutine batch_end(this, copy)
202 class(batch_t), intent(inout) :: this
203 logical, optional, intent(in) :: copy
206 push_sub(batch_end)
207
208 if (this%own_memory .and. this%is_packed()) then
209 !deallocate directly to avoid unnecessary copies
210 if (this%status() == batch_device_packed) then
211 call this%deallocate_packed_device()
212 end if
213 if (this%status() == batch_packed .or. this%status_host == batch_packed) then
214 call this%deallocate_packed_host()
215 end if
216 this%status_of = batch_not_packed
217 this%status_host = batch_not_packed
218 this%host_buffer_count = 0
219 this%device_buffer_count = 0
220 end if
221 if (this%status() == batch_device_packed) call this%do_unpack(copy, force = .true.)
222 if (this%status() == batch_packed) call this%do_unpack(copy, force = .true.)
224 if (this%is_allocated) then
225 call this%deallocate_unpacked_host()
226 end if
228 safe_deallocate_a(this%ist_idim_index)
229 safe_deallocate_a(this%ist)
231 pop_sub(batch_end)
232 end subroutine batch_end
234 !--------------------------------------------------------------
240 class(batch_t), intent(inout) :: this
244 this%is_allocated = .false.
246 if (this%special_memory) then
247 if (associated(this%dff)) then
248 call deallocate_hardware_aware(c_loc(this%dff(1,1,1)), int(this%np, int64)*this%dim*this%nst*8)
249 end if
250 if (associated(this%zff)) then
251 call deallocate_hardware_aware(c_loc(this%zff(1,1,1)), int(this%np, int64)*this%dim*this%nst*16)
252 end if
253 else
254 safe_deallocate_p(this%dff)
255 safe_deallocate_p(this%zff)
256 end if
257 nullify(this%dff)
258 nullify(this%dff_linear)
259 nullify(this%zff)
260 nullify(this%zff_linear)
261
263 end subroutine batch_deallocate_unpacked_host
264
265 !--------------------------------------------------------------
269 !
270 subroutine batch_deallocate_packed_host(this)
271 class(batch_t), intent(inout) :: this
272
274
275 if (this%special_memory) then
276 if (associated(this%dff_pack)) then
277 call deallocate_hardware_aware(c_loc(this%dff_pack(1,1)), int(this%pack_size(1), int64)*this%pack_size(2)*8)
278 end if
279 if (associated(this%zff_pack)) then
280 call deallocate_hardware_aware(c_loc(this%zff_pack(1,1)), int(this%pack_size(1), int64)*this%pack_size(2)*16)
281 end if
282 else
283 safe_deallocate_p(this%dff_pack)
284 safe_deallocate_p(this%zff_pack)
285 end if
286 nullify(this%dff_pack)
287 nullify(this%zff_pack)
288
290 end subroutine batch_deallocate_packed_host
291
292 !--------------------------------------------------------------
295 subroutine batch_deallocate_packed_device(this)
296 class(batch_t), intent(inout) :: this
297
299
300 call accel_release_buffer(this%ff_device)
301
303 end subroutine batch_deallocate_packed_device
304
305 !--------------------------------------------------------------
309 !
310 subroutine batch_allocate_unpacked_host(this)
311 class(batch_t), intent(inout) :: this
312
314
315 if (this%type() == type_float) then
316 call this%dallocate_unpacked_host()
317 else if (this%type() == type_cmplx) then
318 call this%zallocate_unpacked_host()
319 end if
320
322 end subroutine batch_allocate_unpacked_host
323
324 !--------------------------------------------------------------
328 !
329 subroutine batch_allocate_packed_host(this)
330 class(batch_t), intent(inout) :: this
331
333
334 if (this%type() == type_float) then
335 call this%dallocate_packed_host()
336 else if (this%type() == type_cmplx) then
337 call this%zallocate_packed_host()
338 end if
339
341 end subroutine batch_allocate_packed_host
342
343 !--------------------------------------------------------------
347 !
348 subroutine batch_allocate_packed_device(this)
349 class(batch_t), intent(inout) :: this
350
352
353 call accel_create_buffer(this%ff_device, accel_mem_read_write, this%type(), &
354 product(this%pack_size))
355
357 end subroutine batch_allocate_packed_device
358
359 !--------------------------------------------------------------
365 !
366 subroutine batch_init_empty (this, dim, nst, np)
367 type(batch_t), intent(out) :: this
368 integer, intent(in) :: dim
369 integer, intent(in) :: nst
370 integer, intent(in) :: np
371
372 push_sub(batch_init_empty)
373
374 this%is_allocated = .false.
375 this%own_memory = .false.
376 this%special_memory = .false.
377 this%needs_finish_unpack = .false.
378 this%nst = nst
379 this%dim = dim
380 this%type_of = type_none
381
382 this%nst_linear = nst*dim
383
384 this%np = np
385 this%device_buffer_count = 0
386 this%host_buffer_count = 0
387 this%status_of = batch_not_packed
388 this%status_host = batch_not_packed
389
390 this%ndims = 2
391 safe_allocate(this%ist_idim_index(1:this%nst_linear, 1:this%ndims))
392 safe_allocate(this%ist(1:this%nst))
393
394 nullify(this%dff, this%zff, this%dff_linear, this%zff_linear)
395 nullify(this%dff_pack, this%zff_pack)
396
397 pop_sub(batch_init_empty)
398 end subroutine batch_init_empty
399
400 !--------------------------------------------------------------
405 !
406 subroutine batch_clone_to(this, dest, pack, copy_data, new_np)
407 class(batch_t), intent(in) :: this
408 class(batch_t), allocatable, intent(out) :: dest
409 logical, optional, intent(in) :: pack
411 logical, optional, intent(in) :: copy_data
413 integer, optional, intent(in) :: new_np
414
415 push_sub(batch_clone_to)
416
417 if (.not. allocated(dest)) then
418 safe_allocate_type(batch_t, dest)
419 else
420 message(1) = "Internal error: destination batch in batch_clone_to has been previously allocated."
421 call messages_fatal(1)
422 end if
423
424 call this%copy_to(dest, pack, copy_data, new_np)
425
426 pop_sub(batch_clone_to)
427 end subroutine batch_clone_to
428
429 !--------------------------------------------------------------
430
431 subroutine batch_clone_to_array(this, dest, n_batches, pack, copy_data)
432 class(batch_t), intent(in) :: this
433 class(batch_t), allocatable, intent(out) :: dest(:)
434 integer, intent(in) :: n_batches
435 logical, optional, intent(in) :: pack
437 logical, optional, intent(in) :: copy_data
439
440 integer :: ib
442 push_sub(batch_clone_to_array)
443
444 if (.not. allocated(dest)) then
445 safe_allocate_type_array(batch_t, dest, (1:n_batches))
446 else
447 message(1) = "Internal error: destination batch in batch_clone_to_array has been previously allocated."
448 call messages_fatal(1)
449 end if
450
451 do ib = 1, n_batches
452 call this%copy_to(dest(ib), pack, copy_data)
453 end do
454
455 pop_sub(batch_clone_to_array)
456 end subroutine batch_clone_to_array
457
458 !--------------------------------------------------------------
463 subroutine batch_copy_to(this, dest, pack, copy_data, new_np, special)
464 class(batch_t), intent(in) :: this
465 class(batch_t), intent(out) :: dest
466 logical, optional, intent(in) :: pack
468 logical, optional, intent(in) :: copy_data
470 integer, optional, intent(in) :: new_np
471 logical, optional, intent(in) :: special
473
474 logical :: host_packed, special_
475 integer :: np_
476
477 push_sub(batch_copy_to)
478
479 np_ = optional_default(new_np, this%np)
480
481 host_packed = this%host_buffer_count > 0
482 ! use special memory here only for batches not on the GPU to avoid allocating
483 ! pinned memory for temporary batches because that leads to a severe performance
484 ! decrease for GPU runs (up to 20x)
485 if (present(special)) then
486 special_ = this%special_memory
487 else
488 special_ = this%special_memory .and. .not. this%device_buffer_count > 0
489 end if
490
491 if (this%type() == type_float) then
492 call dbatch_init(dest, this%dim, 1, this%nst, np_, packed=host_packed, special=special_)
493 else if (this%type() == type_cmplx) then
494 call zbatch_init(dest, this%dim, 1, this%nst, np_, packed=host_packed, special=special_)
495 else
496 message(1) = "Internal error: unknown batch type in batch_copy_to."
497 call messages_fatal(1)
498 end if
500 if (this%status() /= dest%status() .and. optional_default(pack, this%is_packed())) call dest%do_pack(copy = .false.)
501
502 dest%ist_idim_index(1:this%nst_linear, 1:this%ndims) = this%ist_idim_index(1:this%nst_linear, 1:this%ndims)
503 dest%ist(1:this%nst) = this%ist(1:this%nst)
504
505 if (optional_default(copy_data, .false.)) then
506 assert(np_ == this%np)
507 call this%copy_data_to(min(this%np, np_), dest)
508 end if
509
510 pop_sub(batch_copy_to)
511 end subroutine batch_copy_to
512
513 ! ----------------------------------------------------
518 type(type_t) pure function batch_type(this) result(btype)
519 class(batch_t), intent(in) :: this
520
521 btype = this%type_of
522
523 end function batch_type
525 ! ----------------------------------------------------
527 integer pure function batch_type_as_integer(this) result(itype)
528 class(batch_t), intent(in) :: this
529
530 type(type_t) :: btype
531
532 itype = 0
533 btype = this%type()
534 if (btype == type_float) itype = 1
535 if (btype == type_cmplx) itype = 2
536
537 end function batch_type_as_integer
538
539 ! ----------------------------------------------------
544 integer pure function batch_status(this) result(bstatus)
545 class(batch_t), intent(in) :: this
546
547 bstatus = this%status_of
548 end function batch_status
549
550 ! ----------------------------------------------------
551
552 logical pure function batch_is_packed(this) result(in_buffer)
553 class(batch_t), intent(in) :: this
554
555 in_buffer = (this%device_buffer_count > 0) .or. (this%host_buffer_count > 0)
556 end function batch_is_packed
557
558 ! ----------------------------------------------------
559
560 integer(int64) function batch_pack_total_size(this) result(size)
561 class(batch_t), intent(inout) :: this
562
563 size = this%np
564 if (accel_is_enabled()) size = accel_padded_size(size)
565 size = size*pad_pow2(this%nst_linear)*types_get_size(this%type())
566
567 end function batch_pack_total_size
568
569 ! ----------------------------------------------------
570
576 subroutine batch_do_pack(this, copy, async)
577 class(batch_t), intent(inout) :: this
578 logical, optional, intent(in) :: copy
579 logical, optional, intent(in) :: async
581
582 logical :: copy_
583 logical :: async_
584 integer :: source, target
585
586 ! no push_sub, called too frequently
587
588 call profiling_in("BATCH_DO_PACK")
589
590 copy_ = optional_default(copy, .true.)
591
592 async_ = optional_default(async, .false.)
593
594 ! get source and target states for this batch
595 source = this%status()
596 select case (source)
598 if (accel_is_enabled()) then
599 target = batch_device_packed
600 else
601 target = batch_packed
602 end if
604 target = batch_device_packed
605 end select
606
607 ! only do something if target is different from source
608 if (source /= target) then
609 select case (target)
611 call this%allocate_packed_device()
612 this%status_of = batch_device_packed ! Note that this%status_host remains untouched.
613
614 if (copy_) then
615 select case (source)
616 case (batch_not_packed)
617 ! copy from unpacked host array to device
619 case (batch_packed)
620 ! copy from packed host array to device
621 call batch_write_packed_to_device(this, async_)
622 end select
623 end if
624 case (batch_packed)
625 call this%allocate_packed_host()
626 this%status_of = batch_packed
627 this%status_host = batch_packed
628
629 if (copy_) then
630 if (this%type() == type_float) then
631 call dbatch_pack_copy(this)
632 else if (this%type() == type_cmplx) then
633 call zbatch_pack_copy(this)
634 end if
635 end if
636 if (this%own_memory) call this%deallocate_unpacked_host()
637 end select
638 end if
639
640 select case (target)
642 this%device_buffer_count = this%device_buffer_count + 1
643 case (batch_packed)
644 this%host_buffer_count = this%host_buffer_count + 1
645 end select
646
647 call profiling_out("BATCH_DO_PACK")
648 end subroutine batch_do_pack
649
650 ! ----------------------------------------------------
655 subroutine batch_do_unpack(this, copy, force, async)
656 class(batch_t), intent(inout) :: this
657 logical, optional, intent(in) :: copy
658 logical, optional, intent(in) :: force
659 logical, optional, intent(in) :: async
661
662 logical :: copy_, force_, async_
663 integer :: source, target
664
665 push_sub(batch_do_unpack)
666
667 call profiling_in("BATCH_DO_UNPACK")
668
669 copy_ = optional_default(copy, .true.)
670
671 force_ = optional_default(force, .false.)
672
673 async_ = optional_default(async, .false.)
674
675 ! get source and target states for this batch
676 source = this%status()
677 select case (source)
678 case (batch_not_packed)
679 target = source
680 case (batch_packed)
681 target = batch_not_packed
683 target = this%status_host
684 end select
685
686 ! only do something if target is different from source
687 if (source /= target) then
688 select case (source)
689 case (batch_packed)
690 if (this%host_buffer_count == 1 .or. force_) then
691 if (this%own_memory) call this%allocate_unpacked_host()
692 ! unpack from packed_host to unpacked_host
693 if (copy_ .or. this%own_memory) then
694 if (this%type() == type_float) then
695 call dbatch_unpack_copy(this)
696 else if (this%type() == type_cmplx) then
697 call zbatch_unpack_copy(this)
698 end if
699 end if
700 call this%deallocate_packed_host()
701 this%status_host = target
702 this%status_of = target
703 this%host_buffer_count = 1
704 end if
705 this%host_buffer_count = this%host_buffer_count - 1
707 if (this%device_buffer_count == 1 .or. force_) then
708 if (copy_) then
709 select case (target)
710 ! unpack from packed_device to unpacked_host
711 case (batch_not_packed)
713 ! unpack from packed_device to packed_host
714 case (batch_packed)
715 call batch_read_device_to_packed(this, async_)
716 end select
717 end if
718 if (async_) then
719 this%needs_finish_unpack = .true.
720 else
721 call this%deallocate_packed_device()
722 end if
723 this%status_of = target
724 this%device_buffer_count = 1
725 end if
726 this%device_buffer_count = this%device_buffer_count - 1
727 end select
728 end if
729
730 call profiling_out("BATCH_DO_UNPACK")
731
732 pop_sub(batch_do_unpack)
733 end subroutine batch_do_unpack
734
735 ! ----------------------------------------------------
737 subroutine batch_finish_unpack(this)
738 class(batch_t), intent(inout) :: this
739
740 push_sub(batch_finish_unpack)
741 if (this%needs_finish_unpack) then
742 call accel_finish()
743 call this%deallocate_packed_device()
744 this%needs_finish_unpack = .false.
745 end if
746 pop_sub(batch_finish_unpack)
747 end subroutine batch_finish_unpack
749 ! ----------------------------------------------------
750
751 subroutine batch_write_unpacked_to_device(this)
752 class(batch_t), intent(inout) :: this
753
754 integer :: ist, ist2
755 integer(int64) :: unroll
756 type(accel_mem_t) :: tmp
757 type(accel_kernel_t), pointer :: kernel
758
760
761 call profiling_in("BATCH_WRT_UNPACK_ACCEL")
762 if (this%nst_linear == 1) then
763 ! we can copy directly
764 if (this%type() == type_float) then
765 call accel_write_buffer(this%ff_device, ubound(this%dff_linear, dim=1), this%dff_linear(:, 1))
766 else if (this%type() == type_cmplx) then
767 call accel_write_buffer(this%ff_device, ubound(this%zff_linear, dim=1), this%zff_linear(:, 1))
768 else
769 assert(.false.)
770 end if
771
772 else
773 ! we copy to a temporary array and then we re-arrange data
774
775 if (this%type() == type_float) then
776 kernel => dpack
777 else
778 kernel => zpack
779 end if
780
781 unroll = min(cl_pack_max_buffer_size, this%pack_size(1))
782
783 call accel_create_buffer(tmp, accel_mem_read_only, this%type(), unroll*this%pack_size(2))
784
785 do ist = 1, this%nst_linear, int(unroll, int32)
786
787 ! copy a number 'unroll' of states to the buffer
788 do ist2 = ist, min(ist + int(unroll, int32) - 1, this%nst_linear)
789
790 if (this%type() == type_float) then
791 call accel_write_buffer(tmp, ubound(this%dff_linear, dim=1, kind=int64), this%dff_linear(:, ist2), &
792 offset = (ist2 - ist)*this%pack_size(2))
793 else
794 call accel_write_buffer(tmp, ubound(this%zff_linear, dim=1, kind=int64), this%zff_linear(:, ist2), &
795 offset = (ist2 - ist)*this%pack_size(2))
796 end if
797 end do
798
799 ! now call an opencl kernel to rearrange the data
800 call accel_set_kernel_arg(kernel, 0, int(this%pack_size(1), int32))
801 call accel_set_kernel_arg(kernel, 1, int(this%pack_size(2), int32))
802 call accel_set_kernel_arg(kernel, 2, ist - 1)
803 call accel_set_kernel_arg(kernel, 3, tmp)
804 call accel_set_kernel_arg(kernel, 4, this%ff_device)
805
806 call profiling_in("CL_PACK")
807 call accel_kernel_run(kernel, (/this%pack_size(2), unroll/), (/accel_max_workgroup_size()/unroll, unroll/))
808
809 if (this%type() == type_float) then
810 call profiling_count_transfers(unroll*this%pack_size(2), m_one)
811 else
812 call profiling_count_transfers(unroll*this%pack_size(2), m_zi)
813 end if
814
815 call accel_finish()
816 call profiling_out("CL_PACK")
817
818 end do
819
820 call accel_release_buffer(tmp)
821
822 end if
823
824 call profiling_out("BATCH_WRT_UNPACK_ACCEL")
826 end subroutine batch_write_unpacked_to_device
827
828 ! ------------------------------------------------------------------
829
831 class(batch_t), intent(inout) :: this
832
833 integer :: ist, ist2
834 integer(int64) :: unroll
835 type(accel_mem_t) :: tmp
836 type(accel_kernel_t), pointer :: kernel
837
839 call profiling_in("BATCH_READ_UNPACKED_ACCEL")
840
841 if (this%nst_linear == 1) then
842 ! we can copy directly
843 if (this%type() == type_float) then
844 call accel_read_buffer(this%ff_device, ubound(this%dff_linear, dim=1), this%dff_linear(:, 1))
845 else
846 call accel_read_buffer(this%ff_device, ubound(this%zff_linear, dim=1), this%zff_linear(:, 1))
847 end if
848 else
849
850 unroll = min(cl_pack_max_buffer_size, this%pack_size(1))
851
852 ! we use a kernel to move to a temporary array and then we read
853 call accel_create_buffer(tmp, accel_mem_write_only, this%type(), unroll*this%pack_size(2))
854
855 if (this%type() == type_float) then
856 kernel => dunpack
857 else
858 kernel => zunpack
859 end if
860
861 do ist = 1, this%nst_linear, int(unroll, int32)
862 call accel_set_kernel_arg(kernel, 0, int(this%pack_size(1), int32))
863 call accel_set_kernel_arg(kernel, 1, int(this%pack_size(2), int32))
864 call accel_set_kernel_arg(kernel, 2, ist - 1)
865 call accel_set_kernel_arg(kernel, 3, this%ff_device)
866 call accel_set_kernel_arg(kernel, 4, tmp)
867
868 call profiling_in("CL_UNPACK")
869 call accel_kernel_run(kernel, (/unroll, this%pack_size(2)/), (/unroll, accel_max_workgroup_size()/unroll/))
870
871 if (this%type() == type_float) then
872 call profiling_count_transfers(unroll*this%pack_size(2), m_one)
873 else
874 call profiling_count_transfers(unroll*this%pack_size(2), m_zi)
875 end if
876
877 call accel_finish()
878 call profiling_out("CL_UNPACK")
879
880 ! copy a number 'unroll' of states from the buffer
881 do ist2 = ist, min(ist + int(unroll, int32) - 1, this%nst_linear)
882
883 if (this%type() == type_float) then
884 call accel_read_buffer(tmp, ubound(this%dff_linear, dim=1, kind=int64), this%dff_linear(:, ist2), &
885 offset = (ist2 - ist)*this%pack_size(2))
886 else
887 call accel_read_buffer(tmp, ubound(this%zff_linear, dim=1, kind=int64), this%zff_linear(:, ist2), &
888 offset = (ist2 - ist)*this%pack_size(2))
889 end if
890 end do
891
892 end do
893
894 call accel_release_buffer(tmp)
895 end if
896
897 call profiling_out("BATCH_READ_UNPACKED_ACCEL")
899 end subroutine batch_read_device_to_unpacked
900
901 ! ------------------------------------------------------------------
902 subroutine batch_write_packed_to_device(this, async)
903 class(batch_t), intent(inout) :: this
904 logical, optional, intent(in) :: async
905
906
908
909 call profiling_in("BATCH_WRITE_PACKED_ACCEL")
910 if (this%type() == type_float) then
911 call accel_write_buffer(this%ff_device, product(this%pack_size), this%dff_pack, async=async)
912 else
913 call accel_write_buffer(this%ff_device, product(this%pack_size), this%zff_pack, async=async)
914 end if
915 call profiling_out("BATCH_WRITE_PACKED_ACCEL")
916
918 end subroutine batch_write_packed_to_device
919
920 ! ------------------------------------------------------------------
921 subroutine batch_read_device_to_packed(this, async)
922 class(batch_t), intent(inout) :: this
923 logical, optional, intent(in) :: async
924
925
927
928 call profiling_in("BATCH_READ_PACKED_ACCEL")
929 if (this%type() == type_float) then
930 call accel_read_buffer(this%ff_device, product(this%pack_size), this%dff_pack, async=async)
931 else
932 call accel_read_buffer(this%ff_device, product(this%pack_size), this%zff_pack, async=async)
933 end if
934 call profiling_out("BATCH_READ_PACKED_ACCEL")
935
937 end subroutine batch_read_device_to_packed
938
939 ! ------------------------------------------------------
944 integer function batch_inv_index(this, cind) result(index)
945 class(batch_t), intent(in) :: this
946 integer, intent(in) :: cind(:)
947
948 do index = 1, this%nst_linear
949 if (all(cind(1:this%ndims) == this%ist_idim_index(index, 1:this%ndims))) exit
950 end do
951
952 assert(index <= this%nst_linear)
953
954 end function batch_inv_index
955
956 ! ------------------------------------------------------
960 !
961 integer pure function batch_ist_idim_to_linear(this, cind) result(index)
962 class(batch_t), intent(in) :: this
963 integer, intent(in) :: cind(:)
964
965 if (ubound(cind, dim = 1) == 1) then
966 index = cind(1)
967 else
968 index = (cind(1) - 1)*this%dim + cind(2)
969 end if
970
971 end function batch_ist_idim_to_linear
972
973 ! ------------------------------------------------------
978 !
979 integer pure function batch_linear_to_ist(this, linear_index) result(ist)
980 class(batch_t), intent(in) :: this
981 integer, intent(in) :: linear_index
982
983 ist = this%ist_idim_index(linear_index, 1)
984
985 end function batch_linear_to_ist
986
987 ! ------------------------------------------------------
989 !
990 integer pure function batch_linear_to_idim(this, linear_index) result(idim)
991 class(batch_t), intent(in) :: this
992 integer, intent(in) :: linear_index
993
994 idim = this%ist_idim_index(linear_index, 2)
996 end function batch_linear_to_idim
997
998 ! ------------------------------------------------------
1007 !
1008 subroutine batch_remote_access_start(this, mpi_grp, rma_win)
1009 class(batch_t), intent(inout) :: this
1010 type(mpi_grp_t), intent(in) :: mpi_grp
1011 type(mpi_win), intent(out) :: rma_win
1012
1015 assert(.not. accel_is_enabled())
1016
1017 if (mpi_grp%size > 1) then
1018 call this%do_pack()
1019
1020 if (this%type() == type_cmplx) then
1021#ifdef HAVE_MPI
1022 call mpi_win_create(this%zff_pack(1, 1), int(product(this%pack_size)*types_get_size(this%type()), mpi_address_kind), &
1023 types_get_size(this%type()), mpi_info_null, mpi_grp%comm, rma_win, mpi_err)
1024#endif
1025 else if (this%type() == type_float) then
1026#ifdef HAVE_MPI
1027 call mpi_win_create(this%dff_pack(1, 1), int(product(this%pack_size)*types_get_size(this%type()), mpi_address_kind), &
1028 types_get_size(this%type()), mpi_info_null, mpi_grp%comm, rma_win, mpi_err)
1029#endif
1030 else
1031 message(1) = "Internal error: unknown batch type in batch_remote_access_start."
1032 call messages_fatal(1)
1033 end if
1034
1035 else
1036 rma_win = mpi_win_null
1037 end if
1038
1040 end subroutine batch_remote_access_start
1041
1042 ! ------------------------------------------------------
1048 !
1049 subroutine batch_remote_access_stop(this, rma_win)
1050 class(batch_t), intent(inout) :: this
1051 type(mpi_win), intent(inout) :: rma_win
1052
1053 push_sub(batch_remote_access_stop)
1055 if (rma_win /= mpi_win_null) then
1056#ifdef HAVE_MPI
1057 call mpi_win_free(rma_win, mpi_err)
1058#endif
1059 call this%do_unpack()
1060 end if
1061
1063 end subroutine batch_remote_access_stop
1064
1065 ! --------------------------------------------------------------
1067 !
1068 subroutine batch_copy_data_to(this, np, dest, async)
1069 class(batch_t), intent(in) :: this
1070 integer, intent(in) :: np
1071 class(batch_t), intent(inout) :: dest
1072 logical, optional, intent(in) :: async
1073
1074 integer(int64) :: localsize, dim2, dim3
1075 integer :: ist, ip
1076
1077 push_sub(batch_copy_data_to)
1078 call profiling_in("BATCH_COPY_DATA_TO")
1079
1080 call this%check_compatibility_with(dest)
1081
1082 select case (this%status())
1084 call accel_set_kernel_arg(kernel_copy, 0, np)
1085 call accel_set_kernel_arg(kernel_copy, 1, this%ff_device)
1086 call accel_set_kernel_arg(kernel_copy, 2, log2(int(this%pack_size_real(1), int32)))
1087 call accel_set_kernel_arg(kernel_copy, 3, dest%ff_device)
1088 call accel_set_kernel_arg(kernel_copy, 4, log2(int(dest%pack_size_real(1), int32)))
1089
1090 localsize = accel_kernel_workgroup_size(kernel_copy)/dest%pack_size_real(1)
1091
1092 dim3 = np/(accel_max_size_per_dim(2)*localsize) + 1
1093 dim2 = min(accel_max_size_per_dim(2)*localsize, pad(int(np, int64), localsize))
1094
1095 call accel_kernel_run(kernel_copy, (/dest%pack_size_real(1), dim2, dim3/), (/dest%pack_size_real(1), localsize, 1_int64/))
1096
1097 if(.not. optional_default(async, .false.)) call accel_finish()
1098
1099 case (batch_packed)
1100 if (np*this%pack_size(1) > huge(0_int32)) then
1101 ! BLAS cannot handle 8-byte integers, so we need a special version here
1102 do ip = 1, np
1103 if (dest%type() == type_float) then
1104 call blas_copy(int(this%pack_size(1), int32), this%dff_pack(1, ip), 1, dest%dff_pack(1, ip), 1)
1105 else
1106 call blas_copy(int(this%pack_size(1), int32), this%zff_pack(1, ip), 1, dest%zff_pack(1, ip), 1)
1107 end if
1108 end do
1109 else
1110 if (dest%type() == type_float) then
1111 call blas_copy(int(this%pack_size(1)*np, int32), this%dff_pack(1, 1), 1, dest%dff_pack(1, 1), 1)
1112 else
1113 call blas_copy(int(this%pack_size(1)*np, int32), this%zff_pack(1, 1), 1, dest%zff_pack(1, 1), 1)
1114 end if
1115 end if
1116
1117 case (batch_not_packed)
1118 do ist = 1, dest%nst_linear
1119 if (dest%type() == type_cmplx) then
1120 call blas_copy(np, this%zff_linear(1, ist), 1, dest%zff_linear(1, ist), 1)
1121 else
1122 call blas_copy(np, this%dff_linear(1, ist), 1, dest%dff_linear(1, ist), 1)
1123 end if
1124 end do
1125
1126 end select
1127
1128 call profiling_out("BATCH_COPY_DATA_TO")
1129 pop_sub(batch_copy_data_to)
1130 end subroutine batch_copy_data_to
1131
1132 ! --------------------------------------------------------------
1134 !
1135 subroutine batch_check_compatibility_with(this, target, only_check_dim)
1136 class(batch_t), intent(in) :: this
1137 class(batch_t), intent(in) :: target
1138 logical, optional, intent(in) :: only_check_dim
1139
1141
1142 assert(this%type() == target%type())
1143 if (.not. optional_default(only_check_dim, .false.)) then
1144 assert(this%nst_linear == target%nst_linear)
1145 end if
1146 assert(this%status() == target%status())
1147 assert(this%dim == target%dim)
1148
1150
1151 end subroutine batch_check_compatibility_with
1152
1153!--------------------------------------------------------------
1156 subroutine batch_build_indices(this, st_start, st_end)
1157 class(batch_t), intent(inout) :: this
1158 integer, intent(in) :: st_start
1159 integer, intent(in) :: st_end
1160
1161 integer :: idim, ii, ist
1162
1163 push_sub(batch_build_indices)
1164
1165 do ist = st_start, st_end
1166 ! now we also populate the linear array
1167 do idim = 1, this%dim
1168 ii = this%dim*(ist - st_start) + idim
1169 this%ist_idim_index(ii, 1) = ist
1170 this%ist_idim_index(ii, 2) = idim
1171 end do
1172 this%ist(ist - st_start + 1) = ist
1173 end do
1174
1175 ! compute packed sizes
1176 this%pack_size(1) = pad_pow2(this%nst_linear)
1177 this%pack_size(2) = this%np
1178 if (accel_is_enabled()) this%pack_size(2) = accel_padded_size(this%pack_size(2))
1179
1180 this%pack_size_real = this%pack_size
1181 if (type_is_complex(this%type())) this%pack_size_real(1) = 2*this%pack_size_real(1)
1182
1183 pop_sub(batch_build_indices)
1184 end subroutine batch_build_indices
1185
1186
1187#include "real.F90"
1188#include "batch_inc.F90"
1189#include "undef.F90"
1190
1191#include "complex.F90"
1192#include "batch_inc.F90"
1193#include "undef.F90"
1194
1195end module batch_oct_m
1196
1197!! Local Variables:
1198!! mode: f90
1199!! coding: utf-8
1200!! End:
initialize a batch with existing memory
Definition: batch.F90:267
double log2(double __x) __attribute__((__nothrow__
integer, parameter, public accel_mem_read_write
Definition: accel.F90:183
subroutine, public accel_release_buffer(this)
Definition: accel.F90:1246
This module contains interfaces for routines in allocate_hardware_aware.c.
subroutine, public deallocate_hardware_aware(array, size)
This module implements batches of mesh functions.
Definition: batch.F90:133
subroutine batch_check_compatibility_with(this, target, only_check_dim)
check whether two batches have compatible dimensions (and type)
Definition: batch.F90:1229
type(type_t) pure function batch_type(this)
return the type of a batch
Definition: batch.F90:612
subroutine zbatch_pack_copy(this)
copy data from the unpacked to the packed arrays
Definition: batch.F90:1813
integer, parameter, public batch_not_packed
functions are stored in CPU memory, unpacked order
Definition: batch.F90:276
integer, parameter, public batch_device_packed
functions are stored in device memory in packed order
Definition: batch.F90:276
subroutine zbatch_init_with_memory_3(this, dim, st_start, st_end, psi)
initialize a batch with an rank-3 array of TYPE_CMPLX valued mesh functions psi.
Definition: batch.F90:1637
integer, parameter cl_pack_max_buffer_size
this value controls the size (in number of wave-functions) of the buffer used to copy states to the o...
Definition: batch.F90:281
logical pure function batch_is_packed(this)
Definition: batch.F90:646
subroutine dbatch_unpack_copy(this)
copy data from the packed to the unpacked arrays
Definition: batch.F90:1538
subroutine dbatch_init_with_memory_1(this, psi)
initialize a batch with an rank-1 array of TYPE_FLOAT valued mesh functions psi.
Definition: batch.F90:1398
subroutine batch_write_unpacked_to_device(this)
Definition: batch.F90:845
subroutine batch_do_unpack(this, copy, force, async)
unpack a batch
Definition: batch.F90:749
subroutine batch_finish_unpack(this)
finish the unpacking if do_unpack() was called with async=.true.
Definition: batch.F90:831
subroutine zbatch_allocate_unpacked_host(this)
allocate host (CPU) memory for unpacked data of type TYPE_CMPLX
Definition: batch.F90:1724
subroutine batch_deallocate_packed_device(this)
release packed device memory
Definition: batch.F90:389
integer pure function batch_type_as_integer(this)
For debuging purpose only.
Definition: batch.F90:621
integer function batch_inv_index(this, cind)
inverse index lookup
Definition: batch.F90:1038
subroutine dbatch_init_with_memory_2(this, dim, st_start, st_end, psi)
initialize a batch with an rank-2 array of TYPE_FLOAT valued mesh functions psi.
Definition: batch.F90:1369
subroutine batch_allocate_packed_host(this)
allocate host (CPU) memory for packed data
Definition: batch.F90:423
subroutine, public zbatch_init(this, dim, st_start, st_end, np, special, packed)
initialize a TYPE_CMPLX valued batch to given size without providing external memory
Definition: batch.F90:1774
subroutine zbatch_allocate_packed_host(this)
allocate host (CPU) memory for packed data of type TYPE_CMPLX
Definition: batch.F90:1750
subroutine batch_remote_access_stop(this, rma_win)
stop the remote access to the batch
Definition: batch.F90:1143
subroutine batch_read_device_to_unpacked(this)
Definition: batch.F90:924
subroutine zbatch_init_with_memory_1(this, psi)
initialize a batch with an rank-1 array of TYPE_CMPLX valued mesh functions psi.
Definition: batch.F90:1703
subroutine dbatch_allocate_packed_host(this)
allocate host (CPU) memory for packed data of type TYPE_FLOAT
Definition: batch.F90:1445
subroutine batch_allocate_packed_device(this)
allocate device (GPU) memory for packed data
Definition: batch.F90:442
subroutine batch_build_indices(this, st_start, st_end)
build the index ist(:) and ist_idim_index(:,:) and set pack_size
Definition: batch.F90:1250
subroutine batch_copy_to(this, dest, pack, copy_data, new_np, special)
make a copy of a batch
Definition: batch.F90:557
integer pure function batch_ist_idim_to_linear(this, cind)
direct index lookup
Definition: batch.F90:1055
integer pure function batch_linear_to_ist(this, linear_index)
get state index ist from linear (combined dim and nst) index
Definition: batch.F90:1073
subroutine batch_clone_to_array(this, dest, n_batches, pack, copy_data)
Definition: batch.F90:525
subroutine, public batch_read_device_to_packed(this, async)
Definition: batch.F90:1015
subroutine batch_write_packed_to_device(this, async)
Definition: batch.F90:996
subroutine dbatch_init_with_memory_3(this, dim, st_start, st_end, psi)
initialize a batch with an rank-3 array of TYPE_FLOAT valued mesh functions psi.
Definition: batch.F90:1332
subroutine batch_allocate_unpacked_host(this)
allocate host (CPU) memory for unpacked data
Definition: batch.F90:404
subroutine batch_init_empty(this, dim, nst, np)
initialize an empty batch
Definition: batch.F90:460
subroutine, public dbatch_init(this, dim, st_start, st_end, np, special, packed)
initialize a TYPE_FLOAT valued batch to given size without providing external memory
Definition: batch.F90:1469
subroutine batch_clone_to(this, dest, pack, copy_data, new_np)
clone a batch to a new batch
Definition: batch.F90:500
subroutine zbatch_init_with_memory_2(this, dim, st_start, st_end, psi)
initialize a batch with an rank-2 array of TYPE_CMPLX valued mesh functions psi.
Definition: batch.F90:1674
integer pure function batch_linear_to_idim(this, linear_index)
extract idim from linear index
Definition: batch.F90:1084
subroutine batch_remote_access_start(this, mpi_grp, rma_win)
start remote access to a batch on another node
Definition: batch.F90:1102
subroutine batch_copy_data_to(this, np, dest, async)
copy data to another batch.
Definition: batch.F90:1162
subroutine batch_do_pack(this, copy, async)
pack the data in a batch
Definition: batch.F90:670
subroutine dbatch_allocate_unpacked_host(this)
allocate host (CPU) memory for unpacked data of type TYPE_FLOAT
Definition: batch.F90:1419
subroutine dbatch_pack_copy(this)
copy data from the unpacked to the packed arrays
Definition: batch.F90:1508
integer pure function batch_status(this)
return the status of a batch
Definition: batch.F90:638
subroutine batch_deallocate_unpacked_host(this)
release unpacked host memory
Definition: batch.F90:333
integer, parameter, public batch_packed
functions are stored in CPU memory, in transposed (packed) order
Definition: batch.F90:276
subroutine batch_deallocate_packed_host(this)
release packed host memory
Definition: batch.F90:364
integer(int64) function batch_pack_total_size(this)
Definition: batch.F90:654
subroutine batch_end(this, copy)
finalize a batch and release allocated memory, if necessary
Definition: batch.F90:295
subroutine zbatch_unpack_copy(this)
copy data from the packed to the unpacked arrays
Definition: batch.F90:1843
This module contains interfaces for BLAS routines You should not use these routines directly....
Definition: blas.F90:118
This module is intended to contain "only mathematical" functions and procedures.
Definition: math.F90:115
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
Definition: messages.F90:160
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
Definition: messages.F90:420
type(type_t), public type_float
Definition: types.F90:133
type(type_t), public type_cmplx
Definition: types.F90:134
type(type_t), public type_none
Definition: types.F90:132
Class defining batches of mesh functions.
Definition: batch.F90:159
int true(void)