Batches
In many situations, we need to perform the same operations over many mesh functions, such as the electronic wave functions. It is therefore advantageous to group those functions into one object. This can ensure that different mesh functions are contiguous in memory.
Due to the nature of stencil operations, which constitute a large part of the low-level operations on mesh functions, it is often more efficient to perform the same stencil operation over different mesh functions (i.e. using the state index as the fast index) than to loop first over the mesh index, which would, in general, require a different stencil for each mesh point. This is, in particular, the case for calculations utilizing GPUs.
Therefore, we store mesh functions in linear or in so-called packed form. The former refers to the ‘natural’ ordering where the mesh index is the fastest moving, while the latter is transposed, so that the state index is the fastest moving.
The abstract class batch_t
is the parent class for batches, such as electronic wave functions.
!> @brief Class defining batches of mesh functions
!!
!! A batch groups nst functions (nst_linear = nst * dim components) of np points each.
!! Besides the dimensions, it holds the index maps between global and local state
!! indices, book-keeping of the packing status on host and device, and pointers to the
!! data in unpacked (np, dim, nst), linear (np, nst_linear) and packed (nst_linear, np)
!! layout.
type batch_t
private
integer, public :: nst !< number of functions in the batch
integer, public :: dim !< Spinor dimension of the state (one, or two for spinors)
integer :: np !< number of points in each function (this can be np or np_part)
integer :: ndims !< The second dimension of ist_idim_index(:,:). Currently always set to 2.
integer, allocatable :: ist_idim_index(:, :)
!< @brief index mapping from global (ist,idim) to local ist.
!!
!! This maps ist and idim into one linear array.
!! This index is constructed in batch_oct_m::batch_build_indices
integer, allocatable, public :: ist(:) !< @brief map from a global to a local index
!!
!! The global index does not need to start at 1, while
!! the local index is always in the range 1:nst.
!!
!! This index is constructed in batch_oct_m::batch_build_indices
logical :: is_allocated !< indicate allocation status
logical :: own_memory !< does the batch own the memory or is it foreign memory?
! We also need a linear array with the states in order to calculate derivatives, etc.
integer, public :: nst_linear !< nst_linear = nst * st%d%dim
integer :: status_of !< @brief packing status of the batch
!!
!! possible values are:
!! BATCH_NOT_PACKED, BATCH_PACKED, BATCH_DEVICE_PACKED
integer :: status_host !< @brief packing status in CPU memory
!!
!! If Octopus runs on GPU, this indicates the status on the CPU.
!! It can only be BATCH_NOT_PACKED and BATCH_PACKED.
!! This makes transfers more efficient: usually we allocate a
!! batch as packed on the CPU, then call do_pack to copy it to the GPU.
!! In this case, it is really a copy.
!! If the batch is unpacked on the CPU, we need to transpose in
!! addition which makes it much slower.
type(type_t) :: type_of !< either TYPE_FLOAT or TYPE_CMPLX
integer :: device_buffer_count !< keep track of pack operations performed on the device
integer :: host_buffer_count !< keep track of pack operations performed on the host
logical :: special_memory !< are we using hardware-aware memory?
logical :: needs_finish_unpack !< if .true., async unpacking has started and needs to be finished
! unpacked variables; linear variables are pointers with different shapes
FLOAT, pointer, contiguous, public :: dff(:, :, :) !< pointer to real mesh functions: indices are (1:np, 1:dim, 1:nst)
CMPLX, pointer, contiguous, public :: zff(:, :, :) !< pointer to complex mesh functions: indices are (1:np, 1:dim, 1:nst)
FLOAT, pointer, contiguous, public :: dff_linear(:, :) !< pointer to real mesh functions: indices are (1:np, 1:nst_linear)
CMPLX, pointer, contiguous, public :: zff_linear(:, :) !< pointer to complex mesh functions: indices are (1:np, 1:nst_linear)
! packed variables; only rank-2 arrays due to padding to powers of 2
FLOAT, pointer, contiguous, public :: dff_pack(:, :) !< pointer to real mesh functions: indices are (1:nst_linear, 1:np)
CMPLX, pointer, contiguous, public :: zff_pack(:, :) !< pointer to complex mesh functions: indices are (1:nst_linear, 1:np)
integer(int64), public :: pack_size(1:2) !< pack_size = [pad_pow2(nst_linear), np]
!! (see math_oct_m::pad_pow2)
integer(int64), public :: pack_size_real(1:2) !< pack_size_real = pack_size;
!! if batch type is complex, then
!! pack_size_real(1) = 2*pack_size(1)
type(accel_mem_t), public :: ff_device !< pointer to device memory
contains
! public type-bound procedures
procedure :: check_compatibility_with => batch_check_compatibility_with !< @copydoc batch_oct_m::batch_check_compatibility_with
procedure :: clone_to => batch_clone_to !< @copydoc batch_oct_m::batch_clone_to
procedure :: clone_to_array => batch_clone_to_array !< @copydoc batch_oct_m::batch_clone_to_array
procedure :: copy_to => batch_copy_to !< @copydoc batch_oct_m::batch_copy_to
procedure :: copy_data_to => batch_copy_data_to !< @copydoc batch_oct_m::batch_copy_data_to
procedure :: do_pack => batch_do_pack !< @copydoc batch_oct_m::batch_do_pack
procedure :: do_unpack => batch_do_unpack !< @copydoc batch_oct_m::batch_do_unpack
procedure :: finish_unpack => batch_finish_unpack !< @copydoc batch_oct_m::batch_finish_unpack
procedure :: end => batch_end !< @copydoc batch_oct_m::batch_end
procedure :: inv_index => batch_inv_index !< @copydoc batch_oct_m::batch_inv_index
procedure :: is_packed => batch_is_packed !< @copydoc batch_oct_m::batch_is_packed
procedure :: ist_idim_to_linear => batch_ist_idim_to_linear !< @copydoc batch_oct_m::batch_ist_idim_to_linear
procedure :: linear_to_idim => batch_linear_to_idim !< @copydoc batch_oct_m::batch_linear_to_idim
procedure :: linear_to_ist => batch_linear_to_ist !< @copydoc batch_oct_m::batch_linear_to_ist
procedure :: pack_total_size => batch_pack_total_size !< @copydoc batch_oct_m::batch_pack_total_size
procedure :: remote_access_start => batch_remote_access_start !< @copydoc batch_oct_m::batch_remote_access_start
procedure :: remote_access_stop => batch_remote_access_stop !< @copydoc batch_oct_m::batch_remote_access_stop
procedure :: status => batch_status !< @copydoc batch_oct_m::batch_status
procedure :: type => batch_type !< @copydoc batch_oct_m::batch_type
procedure :: type_as_int => batch_type_as_integer !< @copydoc batch_oct_m::batch_type_as_integer
! private type-bound procedures: allocation and deallocation of the host/device buffers;
! the d/z prefixed bindings are the real/complex specific variants
procedure, private :: dallocate_unpacked_host => dbatch_allocate_unpacked_host
!< @copydoc batch_oct_m::dbatch_allocate_unpacked_host
procedure, private :: zallocate_unpacked_host => zbatch_allocate_unpacked_host
!< @copydoc batch_oct_m::zbatch_allocate_unpacked_host
procedure, private :: allocate_unpacked_host => batch_allocate_unpacked_host
!< @copydoc batch_oct_m::batch_allocate_unpacked_host
procedure, private :: dallocate_packed_host => dbatch_allocate_packed_host
!< @copydoc batch_oct_m::dbatch_allocate_packed_host
procedure, private :: zallocate_packed_host => zbatch_allocate_packed_host
!< @copydoc batch_oct_m::zbatch_allocate_packed_host
procedure, private :: allocate_packed_host => batch_allocate_packed_host
!< @copydoc batch_oct_m::batch_allocate_packed_host
procedure, private :: allocate_packed_device => batch_allocate_packed_device
!< @copydoc batch_oct_m::batch_allocate_packed_device
procedure, private :: deallocate_unpacked_host => batch_deallocate_unpacked_host
!< @copydoc batch_oct_m::batch_deallocate_unpacked_host
procedure, private :: deallocate_packed_host => batch_deallocate_packed_host
!< @copydoc batch_oct_m::batch_deallocate_packed_host
procedure, private :: deallocate_packed_device => batch_deallocate_packed_device
!< @copydoc batch_oct_m::batch_deallocate_packed_device
end type batch_t
This class includes information about the dimensions of the functions (number of states, spinor dimension and number of mesh points), but also internal book-keeping variables,
keeping track of the status of the batch. Furthermore, the batch_t
data type contains pointers to the actual data arrays, and defines the methods for interacting with a batch.
Empty batches can be initialized with:
!> @brief Initialize a batch for the states st_start:st_end and allocate its host storage
!!
!! The batch is first set up empty for (st_end - st_start + 1) states, then typed with
!! R_TYPE_VAL and indexed via batch_build_indices. Depending on the packed flag, either
!! the packed or the unpacked host allocation routine is called. The batch is marked as
!! owning its memory.
subroutine X(batch_init)(this, dim, st_start, st_end, np, special, packed)
  class(batch_t),    intent(inout) :: this     !< the batch to initialize
  integer,           intent(in)    :: dim      !< Spinor dimension of the state (one, or two for spinors)
  integer,           intent(in)    :: st_start !< index of first state of the batch
  integer,           intent(in)    :: st_end   !< index of last state of the batch
  integer,           intent(in)    :: np       !< number of points in each function
  !!                                              (this can be np or np_part)
  logical, optional, intent(in)    :: special  !< If .true., the allocation will be handled in C
  !!                                              (to use pinned memory for GPUs). Default = .false.
  logical, optional, intent(in)    :: packed   !< If .true. the batch will be initialized in packed form.
  !!                                              Default = .false.

  logical :: want_packed

  PUSH_SUB(X(batch_init))

  ! resolve the optional flag once
  want_packed = optional_default(packed, .false.)

  call batch_init_empty(this, dim, st_end - st_start + 1, np)
  this%type_of        = R_TYPE_VAL
  this%special_memory = optional_default(special, .false.)
  call batch_build_indices(this, st_start, st_end)

  if (.not. want_packed) then
    call this%X(allocate_unpacked_host)()
  else
    call this%X(allocate_packed_host)()
    ! record that the batch is packed on the host and count the host buffer
    this%status_host       = BATCH_PACKED
    this%status_of         = BATCH_PACKED
    this%host_buffer_count = this%host_buffer_count + 1
  end if

  this%own_memory = .true.

  POP_SUB(X(batch_init))
end subroutine X(batch_init)
!> @brief Initialize a batch on top of an existing rank-1 array (a single function)
!!
!! The array is viewed as a (np, 1, 1) rank-3 array through a pointer with remapped
!! bounds and handed to batch_init_with_memory_3 with dim = 1 and state range 1:1.
subroutine X(batch_init_with_memory_1)(this, psi)
  class(batch_t),             intent(out) :: this   !< the batch to initialize
  R_TYPE, target, contiguous, intent(in)  :: psi(:) !< the mesh function to be referenced by the batch

  integer :: npoints
  R_TYPE, pointer, contiguous :: psi_3d(:, :, :)

  PUSH_SUB(X(batch_init_with_memory_1))

  npoints = ubound(psi, dim=1)
  ! rank-remapping: same memory, seen as one state with one spinor component
  psi_3d(1:npoints, 1:1, 1:1) => psi(:)

  call X(batch_init_with_memory_3)(this, 1, 1, 1, psi_3d)

  POP_SUB(X(batch_init_with_memory_1))
end subroutine X(batch_init_with_memory_1)
!> @brief Initialize a batch on top of an existing rank-2 array
!!
!! The second index of psi is either the spinor index (a single state, st_start == st_end)
!! or the state index (dim == 1); both at the same time are not supported.
!! The array is viewed as (np, dim, st_start:st_end) through a pointer with remapped
!! bounds and handed to batch_init_with_memory_3.
subroutine X(batch_init_with_memory_2)(this, dim, st_start, st_end, psi)
  class(batch_t),             intent(out) :: this      !< the batch to initialize
  integer,                    intent(in)  :: dim       !< Spinor dimension of the state (one, or two for spinors)
  integer,                    intent(in)  :: st_start  !< index of first state of the batch
  integer,                    intent(in)  :: st_end    !< index of last state of the batch
  R_TYPE, target, contiguous, intent(in)  :: psi(:, :) !< the mesh functions to be referenced by the batch

  R_TYPE, pointer, contiguous :: psip(:, :, :)

  PUSH_SUB(X(batch_init_with_memory_2))

  ASSERT(st_end == st_start .or. dim == 1)
  ! The remapped pointer below spans np*dim*nst elements; the target must not be
  ! smaller than that, otherwise the pointer assignment is not standard-conforming.
  ASSERT(ubound(psi, dim=2) >= dim*(st_end - st_start + 1))

  psip(1:ubound(psi, dim=1), 1:dim, st_start:st_end) => psi(:, :)
  call X(batch_init_with_memory_3)(this, dim, st_start, st_end, psip)

  POP_SUB(X(batch_init_with_memory_2))
end subroutine X(batch_init_with_memory_2)
!> @brief Initialize a batch on top of an existing rank-3 array (np, dim, st_start:)
!!
!! No data is copied: the unpacked pointer of the batch is associated with psi, and
!! the linear pointer is a (np, nst_linear) view of the same memory.
!! The shape of psi is validated before any pointer is associated with it.
subroutine X(batch_init_with_memory_3)(this, dim, st_start, st_end, psi)
  class(batch_t),             intent(out) :: this                 !< the batch to initialize
  integer,                    intent(in)  :: dim                  !< Spinor dimension of the state (one, or two for spinors)
  integer,                    intent(in)  :: st_start             !< index of first state of the batch
  integer,                    intent(in)  :: st_end               !< index of last state of the batch
  R_TYPE, target, contiguous, intent(in)  :: psi(:, :, st_start:) !< the mesh functions to be referenced by the batch

  PUSH_SUB(X(batch_init_with_memory_3))

  ! Validate the arguments first: the pointer remapping below relies on psi holding
  ! dim components for each of the states st_start:st_end.
  ASSERT(st_end >= st_start)
  ASSERT(ubound(psi, dim=2) == dim)
  ASSERT(ubound(psi, dim=3) >= st_end)

  call batch_init_empty(this, dim, st_end - st_start + 1, ubound(psi, dim=1))
  this%type_of = R_TYPE_VAL

  this%X(ff) => psi(:, :, st_start:)
  ! linear view: (ist, idim) collapsed into a single index
  this%X(ff_linear)(1:this%np, 1:this%nst_linear) => this%X(ff)

  call batch_build_indices(this, st_start, st_end)

  POP_SUB(X(batch_init_with_memory_3))
end subroutine X(batch_init_with_memory_3)
!> @brief batches of electronic states
!!
!! This type extends batch_t by the k-point index and a flag indicating whether
!! the stored wave functions include the phase. It overrides the clone, copy,
!! compatibility check and end procedures of the parent type.
type, extends(batch_t) :: wfs_elec_t
private
integer, public :: ik !< @brief index of the k-point for this set of wave functions
!!
!! The kpoints themselves are stored as kpoints_oct_m::kpoints_t in
!! hamiltonian_elec_oct_m::hamiltonian_elec_t::kpoints
logical, public :: has_phase !< @brief the stored wave functions include the phase
!!
!! A more detailed description of the phases is given in
!! hamiltonian_elec_base_oct_m::hamiltonian_elec_base_t
contains
procedure :: clone_to => wfs_elec_clone_to !< @copydoc wfs_elec_oct_m::wfs_elec_clone_to
procedure :: clone_to_array => wfs_elec_clone_to_array !< @copydoc wfs_elec_oct_m::wfs_elec_clone_to_array
procedure :: copy_to => wfs_elec_copy_to !< @copydoc wfs_elec_oct_m::wfs_elec_copy_to
procedure :: check_compatibility_with => wfs_elec_check_compatibility_with !< @copydoc wfs_elec_oct_m::wfs_elec_check_compatibility_with
procedure :: end => wfs_elec_end !< @copydoc wfs_elec_oct_m::wfs_elec_end
end type wfs_elec_t