Octopus
batch_ops.F90
Go to the documentation of this file.
1!! Copyright (C) 2008 X. Andrade
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17!!
18
19#include "global.h"
20
23module batch_ops_oct_m
24 use accel_oct_m
25 use batch_oct_m
26 use blas_oct_m
27 use debug_oct_m
28 use iso_c_binding
29 use global_oct_m
31 use math_oct_m
34 use types_oct_m
35
36 implicit none
37
38 private
39 public :: &
41 batch_axpy, &
43 batch_scal, &
45 batch_xpay, &
62
64 interface batch_axpy
65 module procedure dbatch_axpy_const
66 module procedure zbatch_axpy_const
67 module procedure dbatch_axpy_vec
68 module procedure zbatch_axpy_vec
69 end interface batch_axpy
70
72 interface batch_scal
73 module procedure dbatch_scal_const
74 module procedure zbatch_scal_const
75 module procedure dbatch_scal_vec
76 module procedure zbatch_scal_vec
77 end interface batch_scal
78
80 interface batch_axpby
81 module procedure dbatch_axpby
82 module procedure zbatch_axpby
83 end interface batch_axpby
84
86 interface batch_scal2v
87 module procedure dbatch_scal2v
88 module procedure zbatch_scal2v
89 end interface batch_scal2v
90
92 interface batch_xpay
93 module procedure dbatch_xpay_vec
94 module procedure zbatch_xpay_vec
95 module procedure dbatch_xpay_const
96 module procedure zbatch_xpay_const
97 end interface batch_xpay
98
99 interface batch_add_with_map
100 module procedure batch_add_with_map_cpu
101 module procedure batch_add_with_map_accel
102 end interface batch_add_with_map
103
104 interface batch_copy_with_map
105 module procedure batch_copy_with_map_cpu
106 module procedure batch_copy_with_map_accel
107 end interface batch_copy_with_map
108
123 interface batch_set_state
124 module procedure dbatch_set_state1
125 module procedure zbatch_set_state1
126 module procedure dbatch_set_state2
127 module procedure zbatch_set_state2
128 module procedure dbatch_set_state3
129 module procedure zbatch_set_state3
130 end interface batch_set_state
131
132 interface batch_get_state
133 module procedure dbatch_get_state1
134 module procedure zbatch_get_state1
135 module procedure dbatch_get_state2
136 module procedure zbatch_get_state2
137 module procedure dbatch_get_state3
138 module procedure zbatch_get_state3
139 end interface batch_get_state
140
141 interface batch_get_points
142 module procedure dbatch_get_points
143 module procedure zbatch_get_points
144 module procedure batch_get_points_accel
145 end interface batch_get_points
146
147 interface batch_set_points
148 module procedure dbatch_set_points
149 module procedure zbatch_set_points
150 module procedure batch_set_points_accel
151 end interface batch_set_points
152
153 interface batch_pointwise_mul
154 module procedure batch_mul_cj
155 end interface batch_pointwise_mul
156
159 interface batch_mul_mf
160 module procedure dbatch_mul_mf
161 module procedure zbatch_mul_mf
162 end interface batch_mul_mf
163
164
165contains
166
167 !--------------------------------------------------------------
!> @brief Fill all mesh functions of the batch with zero.
!!
!! The routine dispatches on the storage status of the batch:
!!  - device-packed: the device buffer is cleared with accel_set_buffer_to_zero,
!!  - packed:        dff_pack / zff_pack are zeroed on the host,
!!  - not packed:    dff_linear / zff_linear are zeroed on the host.
!! Any other status is a fatal error.
!!
!! @param[in,out] this   batch to be zeroed
!! @param[in]     np     optional number of points to zero; defaults to the full
!!                       point extent of the active storage (pack_size(2) for the
!!                       packed layouts, the upper bound of dimension 1 of
!!                       ff_linear otherwise)
!! @param[in]     async  optional flag forwarded to accel_set_buffer_to_zero;
!!                       only the device-packed branch uses it
subroutine batch_set_zero(this, np, async)
  class(batch_t),    intent(inout) :: this
  integer, optional, intent(in)    :: np
  logical, optional, intent(in)    :: async

  integer :: ist_linear, ist, ip, np_

  push_sub(batch_set_zero)

  ! the host branches open their own OpenMP parallel regions
  assert(not_in_openmp())

  call profiling_in("BATCH_SET_ZERO")

  select case (this%status())
  case (batch_device_packed)
    ! This label was missing in the listing; without it the statements below
    ! would sit directly under "select case", which is not valid Fortran.
    np_ = optional_default(np, int(this%pack_size(2), int32))
    assert(np_ <= int(this%pack_size(2), int32))
    call accel_set_buffer_to_zero(this%ff_device, this%type(), &
      (int(this%pack_size(1), int32) * np_), async=async)

  case (batch_packed)
    np_ = optional_default(np, int(this%pack_size(2), int32))
    assert(np_ <= int(this%pack_size(2), int32))
    if (this%type() == type_float) then
      !$omp parallel do private(ist) schedule(static)
      do ip = 1, np_
        !$omp simd
        do ist = 1, int(this%pack_size(1), int32)
          this%dff_pack(ist, ip) = m_zero
        end do
      end do
    else
      !$omp parallel do private(ist) schedule(static)
      do ip = 1, np_
        !$omp simd
        do ist = 1, int(this%pack_size(1), int32)
          this%zff_pack(ist, ip) = m_z0
        end do
      end do
    end if

  case (batch_not_packed)
    if (this%type() == type_float) then
      np_ = optional_default(np, ubound(this%dff_linear, dim=1))
      assert(np_ <= ubound(this%dff_linear, dim=1))
      do ist_linear = 1, this%nst_linear
        !$omp parallel do schedule(static)
        do ip = 1, np_
          this%dff_linear(ip, ist_linear) = m_zero
        end do
      end do
    else
      np_ = optional_default(np, ubound(this%zff_linear, dim=1))
      assert(np_ <= ubound(this%zff_linear, dim=1))
      do ist_linear = 1, this%nst_linear
        !$omp parallel do schedule(static)
        do ip = 1, np_
          this%zff_linear(ip, ist_linear) = m_z0
        end do
      end do
    end if

  case default
    message(1) = "batch_set_zero: unknown batch status."
    call messages_fatal(1)

  end select

  call profiling_out("BATCH_SET_ZERO")

  pop_sub(batch_set_zero)
end subroutine batch_set_zero
241
242 ! --------------------------------------------------------------
244 !
!> @brief GPU version of batch_get_points.
!!
!! Launches the 'get_points' kernel from points.cu for the points sp..ep of a
!! device-packed batch. Host-resident batches (packed or not packed) are not
!! supported and are reported through messages_not_implemented.
!!
!! @param[in]     this    source batch; must be device-packed
!! @param[in]     sp      first point, passed to the kernel as argument 0
!! @param[in]     ep      last point, passed to the kernel as argument 1
!! @param[in,out] psi     device buffer handed to the kernel as argument 7
!! @param[in]     ldpsi1  extent passed to the kernel as ldpsi1*tsize
!!                        (presumably the first leading dimension of psi - TODO confirm)
!! @param[in]     ldpsi2  extent passed to the kernel unchanged
!!                        (presumably the second leading dimension of psi - TODO confirm)
subroutine batch_get_points_accel(this, sp, ep, psi, ldpsi1, ldpsi2)
  class(batch_t),    intent(in)    :: this
  integer,           intent(in)    :: sp
  integer,           intent(in)    :: ep
  type(accel_mem_t), intent(inout) :: psi
  integer,           intent(in)    :: ldpsi1
  integer,           intent(in)    :: ldpsi2

  integer :: tsize, ii, it
  type(accel_kernel_t), save :: kernel
  integer, allocatable :: linear_to_ist(:), linear_to_idim(:)
  type(accel_mem_t) :: buff_linear_to_ist, buff_linear_to_idim

  push_sub(batch_get_points_accel)
  call profiling_in("GET_POINTS")

  select case (this%status())
  case (batch_not_packed, batch_packed)
    ! label restored: the listing dropped both case statements of this construct
    call messages_not_implemented('batch_get_points_accel for non-CL batches')

  case (batch_device_packed)

    ! ratio of the batch element size to the size of a real
    tsize = types_get_size(this%type())/types_get_size(type_float)
    safe_allocate(linear_to_ist(1:this%nst_linear*tsize))
    safe_allocate(linear_to_idim(1:this%nst_linear*tsize))
    ! expand the per-function maps to one entry per real component,
    ! shifted to zero-based indices
    do ii = 1, this%nst_linear
      do it = 1, tsize
        linear_to_ist(tsize*(ii-1)+it) = tsize*(this%linear_to_ist(ii) - 1) + it - 1
        linear_to_idim(tsize*(ii-1)+it) = this%linear_to_idim(ii) - 1
      end do
    end do

    call accel_create_buffer(buff_linear_to_ist, accel_mem_read_only, type_integer, this%nst_linear*tsize)
    call accel_write_buffer(buff_linear_to_ist, this%nst_linear*tsize, linear_to_ist)
    call accel_create_buffer(buff_linear_to_idim, accel_mem_read_only, type_integer, this%nst_linear*tsize)
    call accel_write_buffer(buff_linear_to_idim, this%nst_linear*tsize, linear_to_idim)

    call accel_kernel_start_call(kernel, 'points.cu', 'get_points')

    call accel_set_kernel_arg(kernel, 0, sp)
    call accel_set_kernel_arg(kernel, 1, ep)
    call accel_set_kernel_arg(kernel, 2, buff_linear_to_ist)
    call accel_set_kernel_arg(kernel, 3, buff_linear_to_idim)
    call accel_set_kernel_arg(kernel, 4, this%nst_linear*tsize)
    call accel_set_kernel_arg(kernel, 5, this%ff_device)
    call accel_set_kernel_arg(kernel, 6, int(this%pack_size_real(1), int32))
    call accel_set_kernel_arg(kernel, 7, psi)
    call accel_set_kernel_arg(kernel, 8, ldpsi1*tsize)
    call accel_set_kernel_arg(kernel, 9, ldpsi2)

    ! second launch dimension covers the ep - sp + 1 requested points
    call accel_kernel_run(kernel, (/1_int64, int(ep - sp + 1, int64)/), (/this%pack_size_real(1), 1_int64/))

    call accel_free_buffer(buff_linear_to_ist)
    call accel_free_buffer(buff_linear_to_idim)
    safe_deallocate_a(linear_to_ist)
    safe_deallocate_a(linear_to_idim)

  end select

  call profiling_out("GET_POINTS")

  ! restored: push_sub at entry had no matching pop_sub
  pop_sub(batch_get_points_accel)
end subroutine batch_get_points_accel
308
309 ! --------------------------------------------------------------
311 !
!> @brief GPU version of batch_set_points.
!!
!! Launches the 'set_points' kernel from points.cu for the points sp..ep of a
!! device-packed batch. Host-resident batches (packed or not packed) are not
!! supported and are reported through messages_not_implemented.
!!
!! @param[in,out] this    destination batch; must be device-packed
!! @param[in]     sp      first point, passed to the kernel as argument 0
!! @param[in]     ep      last point, passed to the kernel as argument 1
!! @param[in]     psi     device buffer handed to the kernel as argument 5
!! @param[in]     ldpsi1  extent passed to the kernel as ldpsi1*tsize
!!                        (presumably the first leading dimension of psi - TODO confirm)
!! @param[in]     ldpsi2  extent passed to the kernel unchanged
!!                        (presumably the second leading dimension of psi - TODO confirm)
subroutine batch_set_points_accel(this, sp, ep, psi, ldpsi1, ldpsi2)
  class(batch_t),    intent(inout) :: this
  integer,           intent(in)    :: sp
  integer,           intent(in)    :: ep
  type(accel_mem_t), intent(in)    :: psi
  integer,           intent(in)    :: ldpsi1
  integer,           intent(in)    :: ldpsi2

  integer :: tsize, ii, it
  type(accel_kernel_t), save :: kernel
  integer, allocatable :: linear_to_ist(:), linear_to_idim(:)
  type(accel_mem_t) :: buff_linear_to_ist, buff_linear_to_idim

  push_sub(batch_set_points_accel)
  call profiling_in("SET_POINTS")

  select case (this%status())
  case (batch_not_packed, batch_packed)
    ! label restored: the listing dropped both case statements of this construct
    call messages_not_implemented('batch_set_points_accel for non-CL batches')

  case (batch_device_packed)

    ! ratio of the batch element size to the size of a real
    tsize = types_get_size(this%type())/types_get_size(type_float)
    safe_allocate(linear_to_ist(1:this%nst_linear*tsize))
    safe_allocate(linear_to_idim(1:this%nst_linear*tsize))
    ! expand the per-function maps to one entry per real component,
    ! shifted to zero-based indices
    do ii = 1, this%nst_linear
      do it = 1, tsize
        linear_to_ist(tsize*(ii-1)+it) = tsize*(this%linear_to_ist(ii) - 1) + it - 1
        linear_to_idim(tsize*(ii-1)+it) = this%linear_to_idim(ii) - 1
      end do
    end do

    call accel_create_buffer(buff_linear_to_ist, accel_mem_read_only, type_integer, this%nst_linear*tsize)
    call accel_write_buffer(buff_linear_to_ist, this%nst_linear*tsize, linear_to_ist)
    call accel_create_buffer(buff_linear_to_idim, accel_mem_read_only, type_integer, this%nst_linear*tsize)
    call accel_write_buffer(buff_linear_to_idim, this%nst_linear*tsize, linear_to_idim)

    call accel_kernel_start_call(kernel, 'points.cu', 'set_points')

    call accel_set_kernel_arg(kernel, 0, sp)
    call accel_set_kernel_arg(kernel, 1, ep)
    call accel_set_kernel_arg(kernel, 2, buff_linear_to_ist)
    call accel_set_kernel_arg(kernel, 3, buff_linear_to_idim)
    call accel_set_kernel_arg(kernel, 4, this%nst_linear*tsize)
    call accel_set_kernel_arg(kernel, 5, psi)
    call accel_set_kernel_arg(kernel, 6, ldpsi1*tsize)
    call accel_set_kernel_arg(kernel, 7, ldpsi2)
    call accel_set_kernel_arg(kernel, 8, this%ff_device)
    call accel_set_kernel_arg(kernel, 9, int(this%pack_size_real(1), int32))

    ! second launch dimension covers the ep - sp + 1 requested points
    call accel_kernel_run(kernel, (/1_int64, int(ep - sp + 1, int64)/), (/this%pack_size_real(1), 1_int64/))

    call accel_free_buffer(buff_linear_to_ist)
    call accel_free_buffer(buff_linear_to_idim)
    safe_deallocate_a(linear_to_ist)
    safe_deallocate_a(linear_to_idim)

  end select

  call profiling_out("SET_POINTS")

  ! restored: push_sub at entry had no matching pop_sub
  pop_sub(batch_set_points_accel)
end subroutine batch_set_points_accel
375
376 ! -------------------------
380 !
!> @brief Determine the device block size.
!!
!! @return block_size  the fixed block size, currently 61440
integer pure function batch_points_block_size() result(block_size)
  ! single fixed value; named so the literal appears in one place only
  integer, parameter :: fixed_block_size = 61440

  block_size = fixed_block_size
end function batch_points_block_size
386
387! -------------------------
!> @brief Point-wise multiply two batches with optional conjugation on yy:
!!        zz_i = xx_i * CJ(yy_i)
!!
!! The storage status and type of xx select the implementation; yy and zz are
!! checked for compatibility with xx on entry. For real batches the conjugation
!! flag has no effect.
!!
!! @param[in]     np            number of points to process
!! @param[in]     xx            first factor
!! @param[in]     yy            second factor (conjugated when requested)
!! @param[in,out] zz            batch receiving the product
!! @param[in]     conjugate_yy  optional; conjugate yy for complex batches
!!                              (default .true.)
subroutine batch_mul_cj(np, xx, yy, zz, conjugate_yy)
  integer,           intent(in)    :: np
  class(batch_t),    intent(in)    :: xx
  class(batch_t),    intent(in)    :: yy
  class(batch_t),    intent(inout) :: zz
  logical, optional, intent(in)    :: conjugate_yy

  integer :: ii, ip
  logical :: conj_yy
  ! One saved handle per device kernel. A single shared handle was previously
  ! started under three different kernel names; since the handle is saved in
  ! order to be reused between calls, it is presumably bound to whichever kernel
  ! it was first started with - TODO confirm against accel_kernel_start_call.
  ! Separate handles are correct either way.
  type(accel_kernel_t), save :: kernel_zmul_conj, kernel_zmul, kernel_dmul

  push_sub(batch_mul_cj)

  call xx%check_compatibility_with(yy)
  call xx%check_compatibility_with(zz)

  conj_yy = optional_default(conjugate_yy, .true.)

  select case (xx%status())
  case (batch_not_packed)
    if (xx%type() == type_cmplx) then
      if (conj_yy) then
        !$omp parallel private(ii, ip)
        do ii = 1, xx%nst_linear
          !$omp do schedule(static)
          do ip = 1, np
            zz%zff_linear(ip, ii) = xx%zff_linear(ip, ii)*conjg(yy%zff_linear(ip, ii))
          end do
          !$omp end do
        end do
        !$omp end parallel
      else
        !$omp parallel private(ii, ip)
        do ii = 1, xx%nst_linear
          !$omp do schedule(static)
          do ip = 1, np
            zz%zff_linear(ip, ii) = xx%zff_linear(ip, ii)*yy%zff_linear(ip, ii)
          end do
          !$omp end do
        end do
        !$omp end parallel
      end if
    else
      !$omp parallel private(ii, ip)
      do ii = 1, xx%nst_linear
        !$omp do schedule(static)
        do ip = 1, np
          zz%dff_linear(ip, ii) = xx%dff_linear(ip, ii)*yy%dff_linear(ip, ii)
        end do
        !$omp end do
      end do
      !$omp end parallel
    end if

  case (batch_packed)
    if (xx%type() == type_cmplx) then
      if (conj_yy) then
        !$omp parallel do private(ii)
        do ip = 1, np
          !$omp simd
          do ii = 1, xx%nst_linear
            zz%zff_pack(ii, ip) = xx%zff_pack(ii, ip)*conjg(yy%zff_pack(ii, ip))
          end do
        end do
      else
        !$omp parallel do private(ii)
        do ip = 1, np
          !$omp simd
          do ii = 1, xx%nst_linear
            zz%zff_pack(ii, ip) = xx%zff_pack(ii, ip)*yy%zff_pack(ii, ip)
          end do
        end do
      end if
    else
      !$omp parallel do private(ii)
      do ip = 1, np
        !$omp simd
        do ii = 1, xx%nst_linear
          zz%dff_pack(ii, ip) = xx%dff_pack(ii, ip)*yy%dff_pack(ii, ip)
        end do
      end do
    end if

  case (batch_device_packed)
    if (xx%type() == type_cmplx) then
      ! complex kernels are addressed with pack_size(1)
      if (conj_yy) then
        call accel_kernel_start_call(kernel_zmul_conj, 'batch_mul.cu', 'zmul_conj')
        call run_device_mul(kernel_zmul_conj, xx%pack_size(1), yy%pack_size(1), zz%pack_size(1))
      else
        call accel_kernel_start_call(kernel_zmul, 'batch_mul.cu', 'zmul')
        call run_device_mul(kernel_zmul, xx%pack_size(1), yy%pack_size(1), zz%pack_size(1))
      end if
    else
      ! the real kernel is addressed with pack_size_real(1)
      call accel_kernel_start_call(kernel_dmul, 'batch_mul.cu', 'dmul')
      call run_device_mul(kernel_dmul, xx%pack_size_real(1), yy%pack_size_real(1), zz%pack_size_real(1))
    end if
  end select

  pop_sub(batch_mul_cj)

contains

  !> Set the seven kernel arguments, size the launch grid and run the kernel.
  !! ldx, ldy and ldz are the first pack dimensions of xx, yy and zz; the
  !! kernel receives their base-2 logarithm.
  subroutine run_device_mul(kern, ldx, ldy, ldz)
    type(accel_kernel_t), intent(inout) :: kern
    integer(int64),       intent(in)    :: ldx, ldy, ldz

    integer(int64), dimension(3) :: gsizes, bsizes

    call accel_set_kernel_arg(kern, 0, np)
    call accel_set_kernel_arg(kern, 1, xx%ff_device)
    call accel_set_kernel_arg(kern, 2, log2(int(ldx, int32)))
    call accel_set_kernel_arg(kern, 3, yy%ff_device)
    call accel_set_kernel_arg(kern, 4, log2(int(ldy, int32)))
    call accel_set_kernel_arg(kern, 5, zz%ff_device)
    call accel_set_kernel_arg(kern, 6, log2(int(ldz, int32)))

    call accel_grid_size_extend_dim(int(np, int64), ldx, gsizes, bsizes, kern)

    call accel_kernel_run(kern, gsizes, bsizes)
  end subroutine run_device_mul

end subroutine batch_mul_cj
510
511! -------------------------
!> @brief Variant of batch_add_with_map taking the map as a host array.
!!
!! Dispatches on the status of xx: host-resident batches go to the typed
!! routines dbatch_add_with_map / zbatch_add_with_map; device-packed batches go
!! to batch_add_with_map_accel after the map has been uploaded.
!! NOTE(review): the operation itself lives in the called routines; judging by
!! the names it is zz = xx + yy on the mapped points - confirm there.
!!
!! @param[in]     np   number of map entries used
!! @param[in]     map  host array with the map
!! @param[in]     xx   first input batch; its status and type select the code path
!! @param[in]     yy   second input batch
!! @param[in,out] zz   output batch
subroutine batch_add_with_map_cpu(np, map, xx, yy, zz)
  integer,        intent(in)    :: np
  integer,        intent(in)    :: map(:)
  class(batch_t), intent(in)    :: xx
  class(batch_t), intent(in)    :: yy
  class(batch_t), intent(inout) :: zz

  type(accel_mem_t) :: buff_map

  push_sub(batch_add_with_map_cpu)

  if (xx%status() /= batch_device_packed) then
    if (xx%type() == type_float) then
      call dbatch_add_with_map(np, map, xx, yy, zz)
    else
      call zbatch_add_with_map(np, map, xx, yy, zz)
    end if
  else
    ! upload the first np map entries into a temporary read-only device buffer
    ! that lives only for the duration of this call
    call accel_create_buffer(buff_map, accel_mem_read_only, type_integer, np)
    call accel_write_buffer(buff_map, np, map)
    call batch_add_with_map_accel(np, buff_map, xx, yy, zz)
    call accel_free_buffer(buff_map)
  end if

  ! restored: push_sub at entry had no matching pop_sub
  pop_sub(batch_add_with_map_cpu)
end subroutine batch_add_with_map_cpu
538
539! -------------------------
!> @brief Variant of batch_add_with_map taking the map as a device buffer.
!!
!! Launches the 'add_with_map' kernel from copy.cu on the device buffers of xx,
!! yy and zz. No check of the batch status is made here.
!! NOTE(review): the kernel is not visible from this file; presumably the
!! batches must be device-packed - confirm with callers.
!!
!! @param[in]     np   number of map entries; nothing is launched for np <= 0
!! @param[in]     map  device buffer holding the map
!! @param[in]     xx   first input batch
!! @param[in]     yy   second input batch
!! @param[in,out] zz   output batch
subroutine batch_add_with_map_accel(np, map, xx, yy, zz)
  integer,            intent(in)    :: np
  class(accel_mem_t), intent(in)    :: map
  class(batch_t),     intent(in)    :: xx
  class(batch_t),     intent(in)    :: yy
  class(batch_t),     intent(inout) :: zz

  type(accel_kernel_t), save :: kernel
  integer(int64), dimension(3) :: gsizes, bsizes

  push_sub(batch_add_with_map_accel)

  call accel_kernel_start_call(kernel, 'copy.cu', 'add_with_map')

  ! execute only if map has at least one element, as batch_copy_with_map_accel
  ! does, so that no launch with an empty grid is attempted
  if (np > 0) then
    call accel_set_kernel_arg(kernel, 0, np)
    call accel_set_kernel_arg(kernel, 1, map)
    call accel_set_kernel_arg(kernel, 2, xx%ff_device)
    call accel_set_kernel_arg(kernel, 3, log2(int(xx%pack_size_real(1), int32)))
    call accel_set_kernel_arg(kernel, 4, yy%ff_device)
    call accel_set_kernel_arg(kernel, 5, log2(int(yy%pack_size_real(1), int32)))
    call accel_set_kernel_arg(kernel, 6, zz%ff_device)
    call accel_set_kernel_arg(kernel, 7, log2(int(zz%pack_size_real(1), int32)))

    ! Compute the grid (extend to another dimensions if the size of the problem is too big)
    call accel_grid_size_extend_dim(int(np, int64), xx%pack_size_real(1), gsizes, bsizes, kernel)

    call accel_kernel_run(kernel, gsizes, bsizes)
  end if

  pop_sub(batch_add_with_map_accel)
end subroutine batch_add_with_map_accel
570
571! -------------------------
!> @brief Variant of batch_copy_with_map taking the map as a host array.
!!
!! Dispatches on the status of xx: host-resident batches go to the typed
!! routines dbatch_copy_with_map / zbatch_copy_with_map; device-packed batches
!! go to batch_copy_with_map_accel after the map has been uploaded.
!! NOTE(review): the operation itself lives in the called routines; judging by
!! the names it copies the mapped points of xx into yy - confirm there.
!!
!! @param[in]     np   number of map entries used
!! @param[in]     map  host array with the map
!! @param[in]     xx   source batch; its status and type select the code path
!! @param[in,out] yy   destination batch
subroutine batch_copy_with_map_cpu(np, map, xx, yy)
  integer,        intent(in)    :: np
  integer,        intent(in)    :: map(:)
  class(batch_t), intent(in)    :: xx
  class(batch_t), intent(inout) :: yy

  type(accel_mem_t) :: buff_map

  ! debug call-stack bracket, as in batch_add_with_map_cpu
  push_sub(batch_copy_with_map_cpu)

  if (xx%status() /= batch_device_packed) then
    if (xx%type() == type_float) then
      call dbatch_copy_with_map(np, map, xx, yy)
    else
      call zbatch_copy_with_map(np, map, xx, yy)
    end if
  else
    ! upload the first np map entries into a temporary read-only device buffer
    ! that lives only for the duration of this call
    call accel_create_buffer(buff_map, accel_mem_read_only, type_integer, np)
    call accel_write_buffer(buff_map, np, map)
    call batch_copy_with_map_accel(np, buff_map, xx, yy)
    call accel_free_buffer(buff_map)
  end if

  pop_sub(batch_copy_with_map_cpu)
end subroutine batch_copy_with_map_cpu
597
598! -------------------------
!> @brief Variant of batch_copy_with_map taking the map as a device buffer.
!!
!! Launches the 'copy_with_map' kernel from copy.cu on the device buffers of xx
!! and yy. No check of the batch status is made here.
!! NOTE(review): the kernel is not visible from this file; presumably the
!! batches must be device-packed - confirm with callers.
!!
!! @param[in]     np   number of map entries; nothing is launched for np <= 0
!! @param[in]     map  device buffer holding the map
!! @param[in]     xx   source batch
!! @param[in,out] yy   destination batch
subroutine batch_copy_with_map_accel(np, map, xx, yy)
  integer,            intent(in)    :: np
  class(accel_mem_t), intent(in)    :: map
  class(batch_t),     intent(in)    :: xx
  class(batch_t),     intent(inout) :: yy

  type(accel_kernel_t), save :: kernel
  integer(int64), dimension(3) :: gsizes, bsizes

  ! debug call-stack bracket, as in the other routines of this module
  push_sub(batch_copy_with_map_accel)

  call accel_kernel_start_call(kernel, 'copy.cu', 'copy_with_map')

  ! execute only if map has at least one element
  if (np > 0) then
    call accel_set_kernel_arg(kernel, 0, np)
    call accel_set_kernel_arg(kernel, 1, map)
    call accel_set_kernel_arg(kernel, 2, xx%ff_device)
    call accel_set_kernel_arg(kernel, 3, log2(int(xx%pack_size_real(1), int32)))
    call accel_set_kernel_arg(kernel, 4, yy%ff_device)
    call accel_set_kernel_arg(kernel, 5, log2(int(yy%pack_size_real(1), int32)))

    ! Compute the grid (extend to another dimensions if the size of the problem is too big)
    call accel_grid_size_extend_dim(int(np, int64), xx%pack_size_real(1), gsizes, bsizes, kernel)

    call accel_kernel_run(kernel, gsizes, bsizes)
  end if

  pop_sub(batch_copy_with_map_accel)
end subroutine batch_copy_with_map_accel
629
630 ! -------------------------
634 !
!> @brief Extract the real and imaginary parts of a complex batch.
!!
!! The real part of xx is written to yy and the imaginary part to zz. On the
!! host, the first np points of each of the nst_linear functions are processed.
!! On the device, the 'split_complex' kernel from split.cu is launched on a grid
!! sized from np, while its first argument is the point extent pack_size(2) of xx.
!!
!! @param[in]     np  number of points
!! @param[in]     xx  complex source batch
!! @param[in,out] yy  real batch receiving the real part
!! @param[in,out] zz  real batch receiving the imaginary part
subroutine batch_split_complex(np, xx, yy, zz)
  integer,        intent(in)    :: np
  class(batch_t), intent(in)    :: xx
  class(batch_t), intent(inout) :: yy
  class(batch_t), intent(inout) :: zz

  type(accel_kernel_t), save :: kernel
  integer(int64) :: grid_dims(3), block_dims(3)
  integer :: ipt, ifn

  push_sub(batch_split_complex)

  ! xx has to be complex, both outputs real, and all three batches must be in
  ! the same storage status
  assert(xx%type() == type_cmplx)
  assert(yy%type() == type_float)
  assert(zz%type() == type_float)
  assert(xx%status() == yy%status())
  assert(xx%status() == zz%status())

  select case (xx%status())

  case (batch_not_packed)
    ! unpacked layout: point index first, one parallel loop per function
    do ifn = 1, xx%nst_linear
      !$omp parallel do schedule(static)
      do ipt = 1, np
        yy%dff_linear(ipt, ifn) = real(xx%zff_linear(ipt, ifn), real64)
        zz%dff_linear(ipt, ifn) = aimag(xx%zff_linear(ipt, ifn))
      end do
    end do

  case (batch_packed)
    ! packed layout: function index first, parallel over points
    !$omp parallel do private(ifn) schedule(static)
    do ipt = 1, np
      do ifn = 1, xx%nst_linear
        yy%dff_pack(ifn, ipt) = real(xx%zff_pack(ifn, ipt), real64)
        zz%dff_pack(ifn, ipt) = aimag(xx%zff_pack(ifn, ipt))
      end do
    end do

  case (batch_device_packed)
    call accel_kernel_start_call(kernel, 'split.cu', 'split_complex')

    call accel_set_kernel_arg(kernel, 0, int(xx%pack_size(2), int32))
    call accel_set_kernel_arg(kernel, 1, xx%ff_device)
    call accel_set_kernel_arg(kernel, 2, log2(int(xx%pack_size(1), int32)))
    call accel_set_kernel_arg(kernel, 3, yy%ff_device)
    call accel_set_kernel_arg(kernel, 4, log2(int(yy%pack_size(1), int32)))
    call accel_set_kernel_arg(kernel, 5, zz%ff_device)
    call accel_set_kernel_arg(kernel, 6, log2(int(zz%pack_size(1), int32)))

    ! size the launch grid; extra grid dimensions are used when the problem is
    ! too large for a single one
    call accel_grid_size_extend_dim(int(np, int64), xx%pack_size(1), grid_dims, block_dims, kernel)

    call accel_kernel_run(kernel, grid_dims, block_dims)

  end select

  pop_sub(batch_split_complex)
end subroutine batch_split_complex
689
690#include "real.F90"
691#include "batch_ops_inc.F90"
692#include "undef.F90"
693
694#include "complex.F90"
695#include "batch_ops_inc.F90"
696#include "undef.F90"
697
698end module batch_ops_oct_m
699
700!! Local Variables:
701!! mode: f90
702!! coding: utf-8
703!! End:
batchified version of the BLAS axpy routine: \( y = a\,x + y \)
Definition: batch_ops.F90:159
batchified multiplication by mesh function with optional conjugation: \( yy(ist,:) = ff(:) \cdot \mathrm{CJ}(xx(ist,:)) \)
Definition: batch_ops.F90:254
batchified scale with optional conjugation: \( yy(ist,:) = aa \cdot \mathrm{CJ}(xx(ist,:)) \)
Definition: batch_ops.F90:181
scale a batch by a constant or vector
Definition: batch_ops.F90:167
There are several ways how to call batch_set_state and batch_get_state:
Definition: batch_ops.F90:218
batchified version of \( y = x + a\,y \) (batch_xpay)
Definition: batch_ops.F90:187
double log2(double __x) __attribute__((__nothrow__
subroutine, public accel_free_buffer(this, async)
Definition: accel.F90:1005
subroutine, public accel_kernel_start_call(this, file_name, kernel_name, flags)
Definition: accel.F90:1413
integer, parameter, public accel_mem_read_only
Definition: accel.F90:185
This module implements batches of mesh functions.
Definition: batch.F90:135
integer, parameter, public batch_not_packed
functions are stored in CPU memory, unpacked order
Definition: batch.F90:286
integer, parameter, public batch_device_packed
functions are stored in device memory in packed order
Definition: batch.F90:286
integer, parameter, public batch_packed
functions are stored in CPU memory, in transposed (packed) order
Definition: batch.F90:286
This module implements common operations on batches of mesh functions.
Definition: batch_ops.F90:118
subroutine zbatch_get_state3(this, ii, np, psi, async)
Definition: batch_ops.F90:4016
subroutine dbatch_get_state3(this, ii, np, psi, async)
Definition: batch_ops.F90:2306
subroutine batch_copy_with_map_accel(np, map, xx, yy)
Definition: batch_ops.F90:695
subroutine zbatch_get_state1(this, ist, np, psi, async)
Write a get of state with np points from a batch.
Definition: batch_ops.F90:3864
subroutine, public zbatch_copy_with_map_to_array(np, map, xx, array)
Transfer a batch from the mesh to an array on the submesh (defined by a map)
Definition: batch_ops.F90:4288
subroutine, public dbatch_ax_function_py(np, aa, psi, yy)
This routine performs a set of axpy operations adding the same function psi to all functions of a bat...
Definition: batch_ops.F90:1173
subroutine dbatch_set_state1(this, ist, np, psi)
Write a single state with np points into a batch at position ist.
Definition: batch_ops.F90:2016
subroutine dbatch_axpy_const(np, aa, xx, yy)
This routine applies a 'pair-wise' axpy operation to all functions of the batches xx and yy,...
Definition: batch_ops.F90:834
subroutine zbatch_axpby(np, aa, xx, bb, yy)
calculate yy(ist,:) = aa*xx(ist,:) + bb*yy(ist,:) for a batch
Definition: batch_ops.F90:3407
subroutine zbatch_get_points(this, sp, ep, psi)
copy a set of points into a mesh function
Definition: batch_ops.F90:4037
subroutine dbatch_xpay_vec(np, xx, aa, yy, a_start, a_full)
calculate yy(ist,:) = xx(ist,:) + aa(ist)*yy(ist,:) for a batch
Definition: batch_ops.F90:1427
subroutine dbatch_scal_vec(np, aa, xx, a_start, a_full)
scale all functions in a batch by state dependent constant
Definition: batch_ops.F90:1293
subroutine dbatch_set_state2(this, index, np, psi)
Write a single state with np points into a batch at position defined by index.
Definition: batch_ops.F90:2129
subroutine dbatch_copy_with_map(np, map, xx, yy)
Definition: batch_ops.F90:2534
subroutine, public dbatch_copy_with_map_to_array(np, map, xx, array)
Transfer a batch from the mesh to an array on the submesh (defined by a map)
Definition: batch_ops.F90:2577
subroutine dbatch_scal2v(np, aa, xx, yy, conjugate_xx)
calculate yy(ist,:) = aa*CJ(xx(ist,:)) for a batch
Definition: batch_ops.F90:1720
subroutine dbatch_get_state2(this, index, np, psi, async)
Definition: batch_ops.F90:2288
subroutine dbatch_axpy_vec(np, aa, xx, yy, a_start, a_full)
This routine applies an 'pair-wise' axpy operation to all functions of the batches xx and yy,...
Definition: batch_ops.F90:919
subroutine dbatch_set_state3(this, ii, np, psi)
Write a set of state with np points into a batch.
Definition: batch_ops.F90:2147
subroutine zbatch_axpy_const(np, aa, xx, yy)
This routine applies a 'pair-wise' axpy operation to all functions of the batches xx and yy,...
Definition: batch_ops.F90:2699
subroutine zbatch_scal_const(np, aa, xx)
scale all functions in a batch by constant aa
Definition: batch_ops.F90:3105
subroutine zbatch_get_state2(this, index, np, psi, async)
Definition: batch_ops.F90:3998
subroutine zbatch_xpay_const(np, xx, aa, yy)
calculate yy(ist) = xx(ist) + aa*yy(ist) for a batch
Definition: batch_ops.F90:3376
subroutine, public batch_split_complex(np, xx, yy, zz)
extract the real and imaginary parts of a complex batch
Definition: batch_ops.F90:731
subroutine batch_add_with_map_accel(np, map, xx, yy, zz)
Definition: batch_ops.F90:636
subroutine dbatch_set_points(this, sp, ep, psi)
copy a set of points into a mesh function
Definition: batch_ops.F90:2407
subroutine dbatch_get_state1(this, ist, np, psi, async)
Write a get of state with np points from a batch.
Definition: batch_ops.F90:2167
subroutine zbatch_copy_with_map(np, map, xx, yy)
Definition: batch_ops.F90:4245
subroutine dbatch_mul_mf(np, ff, xx, yy, conjugate_xx)
calculate yy(ist,:) = ff(:) * CJ(xx(ist,:)) for a batch
Definition: batch_ops.F90:1864
subroutine, public batch_set_zero(this, np, async)
fill all mesh functions of the batch with zero
Definition: batch_ops.F90:265
subroutine, public zbatch_ax_function_py(np, aa, psi, yy)
This routine performs a set of axpy operations adding the same function psi to all functions of a bat...
Definition: batch_ops.F90:3021
subroutine zbatch_axpy_vec(np, aa, xx, yy, a_start, a_full)
This routine applies an 'pair-wise' axpy operation to all functions of the batches xx and yy,...
Definition: batch_ops.F90:2784
subroutine batch_set_points_accel(this, sp, ep, psi, ldpsi1, ldpsi2)
GPU version of batch_set_points.
Definition: batch_ops.F90:408
subroutine dbatch_xpay_const(np, xx, aa, yy)
calculate yy(ist) = xx(ist) + aa*yy(ist) for a batch
Definition: batch_ops.F90:1570
subroutine dbatch_axpby(np, aa, xx, bb, yy)
calculate yy(ist,:) = aa*xx(ist,:) + bb*yy(ist,:) for a batch
Definition: batch_ops.F90:1601
subroutine dbatch_get_points(this, sp, ep, psi)
copy a set of points into a mesh function
Definition: batch_ops.F90:2327
subroutine zbatch_scal2v(np, aa, xx, yy, conjugate_xx)
calculate yy(ist,:) = aa*CJ(xx(ist,:)) for a batch
Definition: batch_ops.F90:3494
integer pure function, public batch_points_block_size()
determine the device block size
Definition: batch_ops.F90:477
subroutine dbatch_add_with_map(np, map, xx, yy, zz)
Definition: batch_ops.F90:2492
subroutine zbatch_mul_mf(np, ff, xx, yy, conjugate_xx)
calculate yy(ist,:) = ff(:) * CJ(xx(ist,:)) for a batch
Definition: batch_ops.F90:3607
subroutine zbatch_xpay_vec(np, xx, aa, yy, a_start, a_full)
calculate yy(ist,:) = xx(ist,:) + aa(ist)*yy(ist,:) for a batch
Definition: batch_ops.F90:3258
subroutine batch_mul_cj(np, xx, yy, zz, conjugate_yy)
Point-wise multiply two batches with optional conjugation on yy: zz_i = xx_i * CJ(yy_i)
Definition: batch_ops.F90:486
subroutine zbatch_set_state1(this, ist, np, psi)
Write a single state with np points into a batch at position ist.
Definition: batch_ops.F90:3726
subroutine batch_add_with_map_cpu(np, map, xx, yy, zz)
Definition: batch_ops.F90:608
subroutine zbatch_scal_vec(np, aa, xx, a_start, a_full)
scale all functions in a batch by state dependent constant
Definition: batch_ops.F90:3141
subroutine zbatch_set_state2(this, index, np, psi)
Write a single state with np points into a batch at position defined by index.
Definition: batch_ops.F90:3826
subroutine, public zbatch_axpy_function(np, aa, xx, psi, nst)
This routine performs a set of axpy operations for each function x of a batch (xx),...
Definition: batch_ops.F90:2915
subroutine zbatch_set_points(this, sp, ep, psi)
copy a set of points into a mesh function
Definition: batch_ops.F90:4129
subroutine zbatch_add_with_map(np, map, xx, yy, zz)
Definition: batch_ops.F90:4203
subroutine, public dbatch_axpy_function(np, aa, xx, psi, nst)
This routine performs a set of axpy operations for each function x of a batch (xx),...
Definition: batch_ops.F90:1067
subroutine batch_get_points_accel(this, sp, ep, psi, ldpsi1, ldpsi2)
GPU version of batch_get_points.
Definition: batch_ops.F90:341
subroutine zbatch_set_state3(this, ii, np, psi)
Write a set of state with np points into a batch.
Definition: batch_ops.F90:3844
subroutine batch_copy_with_map_cpu(np, map, xx, yy)
Definition: batch_ops.F90:668
subroutine dbatch_scal_const(np, aa, xx)
scale all functions in a batch by constant aa
Definition: batch_ops.F90:1257
This module contains interfaces for BLAS routines You should not use these routines directly....
Definition: blas.F90:120
real(real64), parameter, public m_zero
Definition: global.F90:200
logical pure function, public not_in_openmp()
Definition: global.F90:566
complex(real64), parameter, public m_z0
Definition: global.F90:210
This module is intended to contain "only mathematical" functions and procedures.
Definition: math.F90:117
subroutine, public messages_not_implemented(feature, namespace)
Definition: messages.F90:1068
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
Definition: messages.F90:162
subroutine, public messages_fatal(no_lines, only_root_writes, namespace)
Definition: messages.F90:410
subroutine, public profiling_out(label)
Increment out counter and sum up difference between entry and exit time.
Definition: profiling.F90:631
subroutine, public profiling_in(label, exclude)
Increment in counter and save entry time.
Definition: profiling.F90:554
type(type_t), public type_float
Definition: types.F90:135
type(type_t), public type_integer
Definition: types.F90:137
integer pure function, public types_get_size(this)
Definition: types.F90:154
Class defining batches of mesh functions.
Definition: batch.F90:161
int true(void)