main/doxygen_doc/operate__antisym__inc_8c_source.html

/*

 Copyright (C) 2006 X. Andrade

 Copyright (C) 2025 N. Tancogne-Dejean


 This program is free software; you can redistribute it and/or modify

 it under the terms of the GNU General Public License as published by

 the Free Software Foundation; either version 2, or (at your option)

 any later version.


 This program is distributed in the hope that it will be useful,

 but WITHOUT ANY WARRANTY; without even the implied warranty of

 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

 GNU General Public License for more details.


 You should have received a copy of the GNU General Public License

 along with this program; if not, write to the Free Software

 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA

 02110-1301, USA.


*/


/*
 * Pick the vector load/store macros used by the kernel body below.
 * Without ALIGNED the VEC_LDU / VEC_STU pair is used; with ALIGNED the
 * VEC_LD / VEC_ST pair is used.  (Presumably the "U" variants are the
 * unaligned forms -- the macros themselves are defined elsewhere.)
 */
#if !defined(ALIGNED)
#define LOAD VEC_LDU
#define STORE VEC_STU
#else
#define LOAD VEC_LD
#define STORE VEC_ST
#endif


{
  /*
   * Antisymmetric stencil kernel body (meant to be #included inside a
   * function; every name not declared here -- opn, opnri, opri_pos,
   * opri_neg, rimap_inv, rimap_inv_max, wpair, fi, fo, ldf, max1, DEPTH
   * and the VEC_* macros -- must come from the including context).
   *
   * For each range l in [0, nri) and each point i in
   * [rimap_inv[l], rimap_inv_max[l]) the code accumulates, for every
   * k in [0, 1 << ldf):
   *
   *   fo[(i << ldf) + k] = sum over j in [0, npairs) of
   *       wpair[j] * (fi[index_pos[j] + (i << ldf) + k]
   *                 - fi[index_neg[j] + (i << ldf) + k])
   *
   * This is stated literally by the scalar tail at the bottom; the vector
   * tiers compute the same thing several lanes at a time, assuming
   * VEC_FMA(a, b, c) evaluates a * b + c and VEC_SUB(a, b) evaluates a - b
   * (TODO confirm against the macro definitions).
   *
   * The tiers (16, 8, 4, 2, 1 vectors per step) are enabled by DEPTH and
   * all advance the same index i, so each tier continues where the
   * previous one stopped.
   */

  const ptrdiff_t npairs = opn[0];

  const ptrdiff_t nri = opnri[0];

  /*
   * unrollN = number of consecutive i values consumed by one step of the
   * N-vector tier: N * VEC_SIZE elements divided by the 1 << ldf elements
   * per point.  Presumably max1(x) clamps the result to at least 1, which
   * is the case where a single point is wider than N vectors -- confirm
   * against the definition of max1.
   */
#if DEPTH >= 16

  const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf);

#endif

#if DEPTH >= 8

  const ptrdiff_t unroll8 = max1(8 * VEC_SIZE >> ldf);

#endif

#if DEPTH >= 4

  const ptrdiff_t unroll4 = max1(4 * VEC_SIZE >> ldf);

#endif

#if DEPTH >= 2

  const ptrdiff_t unroll2 = max1(2 * VEC_SIZE >> ldf);

#endif

#if DEPTH >= 1

  const ptrdiff_t unroll1 = max1(1 * VEC_SIZE >> ldf);

#endif


  ptrdiff_t l, i, j;

  const int *restrict index_pos;

  const int *restrict index_neg;


  for (l = 0; l < nri; l++) {

    /* Row l of the positive/negative offset tables (npairs entries each). */
    index_pos = opri_pos + npairs * l;

    index_neg = opri_neg + npairs * l;

    /* First point of range l; the tiers below advance i up to rimap_inv_max[l]. */
    i = rimap_inv[l];


#if DEPTH >= 16

    /* 16-vector tier: runs while at least unroll16 points remain. */
    for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) {

      ptrdiff_t k;

      /*
       * NOTE(review): the bound 1 << ldf is computed in int here, while
       * the scalar tail uses a ptrdiff_t shift; equivalent as long as
       * ldf stays below the width of int.
       */
      for (k = 0; k < (1 << ldf); k += 16 * VEC_SIZE) {

        register VEC_TYPE a0, a1, a2, a3;

        register VEC_TYPE a4, a5, a6, a7;

        register VEC_TYPE a8, a9, aa, ab;

        register VEC_TYPE ac, ad, ae, af;


        a0 = a1 = a2 = a3 = VEC_ZERO;

        a4 = a5 = a6 = a7 = VEC_ZERO;

        a8 = a9 = aa = ab = VEC_ZERO;

        ac = ad = ae = af = VEC_ZERO;


        /* Element offset of this tile in both fi (before stencil shift) and fo. */
        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) +  k;


        for (j = 0; j < npairs; j++) {

          /* Weight of pair j, passed through VEC_SCAL (presumably a lane broadcast). */
          register VEC_TYPE wj = VEC_SCAL(wpair[j]);

          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;

          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;

          register VEC_TYPE pos = LOAD(fi + indexj_pos);

          register VEC_TYPE neg = LOAD(fi + indexj_neg);

          a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0);

          pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);

          a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1);

          pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE);

          a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2);

          pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE);

          a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3);

          pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE);

          a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4);

          pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE);

          a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5);

          pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE);

          a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6);

          pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE);

          a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7);

          pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE);

          a8 = VEC_FMA(wj, VEC_SUB(pos, neg), a8);

          pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE);

          a9 = VEC_FMA(wj, VEC_SUB(pos, neg), a9);

          pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE);

          aa = VEC_FMA(wj, VEC_SUB(pos, neg), aa);

          pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE);

          ab = VEC_FMA(wj, VEC_SUB(pos, neg), ab);

          pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE);

          ac = VEC_FMA(wj, VEC_SUB(pos, neg), ac);

          pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE);

          ad = VEC_FMA(wj, VEC_SUB(pos, neg), ad);

          pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE);

          ae = VEC_FMA(wj, VEC_SUB(pos, neg), ae);

          pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE);

          af = VEC_FMA(wj, VEC_SUB(pos, neg), af);


        }

        /* Write the 16 accumulated vectors to consecutive slots of fo. */
        STORE(fo + base_ik, a0);

        STORE(fo + base_ik + 1 * VEC_SIZE, a1);

        STORE(fo + base_ik + 2 * VEC_SIZE, a2);

        STORE(fo + base_ik + 3 * VEC_SIZE, a3);

        STORE(fo + base_ik + 4 * VEC_SIZE, a4);

        STORE(fo + base_ik + 5 * VEC_SIZE, a5);

        STORE(fo + base_ik + 6 * VEC_SIZE, a6);

        STORE(fo + base_ik + 7 * VEC_SIZE, a7);

        STORE(fo + base_ik + 8 * VEC_SIZE, a8);

        STORE(fo + base_ik + 9 * VEC_SIZE, a9);

        STORE(fo + base_ik + 10 * VEC_SIZE, aa);

        STORE(fo + base_ik + 11 * VEC_SIZE, ab);

        STORE(fo + base_ik + 12 * VEC_SIZE, ac);

        STORE(fo + base_ik + 13 * VEC_SIZE, ad);

        STORE(fo + base_ik + 14 * VEC_SIZE, ae);

        STORE(fo + base_ik + 15 * VEC_SIZE, af);

      }

    }

#endif


#if DEPTH >= 8

    /* 8-vector tier: same scheme as above with eight accumulators. */
    for (; i < (rimap_inv_max[l] - unroll8 + 1); i += unroll8) {

      ptrdiff_t k;

      for (k = 0; k < (1 << ldf); k += 8 * VEC_SIZE) {

        register VEC_TYPE a0, a1, a2, a3;

        register VEC_TYPE a4, a5, a6, a7;


        a0 = a1 = a2 = a3 = VEC_ZERO;

        a4 = a5 = a6 = a7 = VEC_ZERO;


        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) +  k;


        for (j = 0; j < npairs; j++) {

          register VEC_TYPE wj = VEC_SCAL(wpair[j]);

          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;

          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;

          register VEC_TYPE pos = LOAD(fi + indexj_pos);

          register VEC_TYPE neg = LOAD(fi + indexj_neg);

          a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0);

          pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);

          a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1);

          pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE);

          a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2);

          pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE);

          a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3);

          pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE);

          a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4);

          pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE);

          a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5);

          pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE);

          a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6);

          pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE);

          a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7);

        }

        STORE(fo + base_ik, a0);

        STORE(fo + base_ik + 1 * VEC_SIZE, a1);

        STORE(fo + base_ik + 2 * VEC_SIZE, a2);

        STORE(fo + base_ik + 3 * VEC_SIZE, a3);

        STORE(fo + base_ik + 4 * VEC_SIZE, a4);

        STORE(fo + base_ik + 5 * VEC_SIZE, a5);

        STORE(fo + base_ik + 6 * VEC_SIZE, a6);

        STORE(fo + base_ik + 7 * VEC_SIZE, a7);

      }

    }

#endif


#if DEPTH >= 4

    /* 4-vector tier. */
    for (; i < (rimap_inv_max[l] - unroll4 + 1); i += unroll4) {

      ptrdiff_t k;

      for (k = 0; k < (1 << ldf); k += 4 * VEC_SIZE) {

        register VEC_TYPE a0, a1, a2, a3;


        a0 = a1 = a2 = a3 = VEC_ZERO;


        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) +  k;


        for (j = 0; j < npairs; j++) {

          register VEC_TYPE wj = VEC_SCAL(wpair[j]);

          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;

          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;

          register VEC_TYPE pos = LOAD(fi + indexj_pos);

          register VEC_TYPE neg = LOAD(fi + indexj_neg);

          a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0);

          pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);

          a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1);

          pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE);

          a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2);

          pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE);

          a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3);

        }

        STORE(fo + base_ik, a0);

        STORE(fo + base_ik + 1 * VEC_SIZE, a1);

        STORE(fo + base_ik + 2 * VEC_SIZE, a2);

        STORE(fo + base_ik + 3 * VEC_SIZE, a3);

      }

    }

#endif


#if DEPTH >= 2

    /* 2-vector tier. */
    for (; i < (rimap_inv_max[l] - unroll2 + 1); i += unroll2) {

      ptrdiff_t k;

      for (k = 0; k < (1 << ldf); k += 2 * VEC_SIZE) {

        register VEC_TYPE a0, a1;


        a0 = a1 = VEC_ZERO;


        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) +  k;


        for (j = 0; j < npairs; j++) {

          register VEC_TYPE wj = VEC_SCAL(wpair[j]);

          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;

          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;

          register VEC_TYPE pos = LOAD(fi + indexj_pos);

          register VEC_TYPE neg = LOAD(fi + indexj_neg);

          a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0);

          pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);

          neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);

          a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1);

        }

        STORE(fo + base_ik, a0);

        STORE(fo + base_ik + 1 * VEC_SIZE, a1);

      }

    }

#endif


#if DEPTH >= 1

    /* Single-vector tier: one accumulator per step. */
    for (; i < (rimap_inv_max[l] - unroll1 + 1); i += unroll1) {

      ptrdiff_t k;

      for (k = 0; k < (1 << ldf); k += VEC_SIZE) {

        register VEC_TYPE a0 = VEC_ZERO;

        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) +  k;

        for (j = 0; j < npairs; j++) {

          register VEC_TYPE wj = VEC_SCAL(wpair[j]);

          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;

          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;

          register VEC_TYPE pos = LOAD(fi + indexj_pos);

          register VEC_TYPE neg = LOAD(fi + indexj_neg);

          a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0);

        }

        STORE(fo + base_ik, a0);

      }

    }

#endif


    /*
     * Scalar tail: finishes whatever points of range l the vector tiers
     * left unprocessed, one element at a time.  Only compiled when
     * VEC_SIZE > 1.
     * NOTE(review): with VEC_SIZE == 1 there is no tail here, so the
     * range is only fully covered if the DEPTH >= 1 tier is compiled in
     * -- confirm that the build never combines VEC_SIZE == 1 with
     * DEPTH < 1.
     */
#if VEC_SIZE > 1


    /* Number of elements per point. */
    const ptrdiff_t size =  (ptrdiff_t) 1 << ldf;

    double a;


    for (; i < rimap_inv_max[l]; i++) {

      for (ptrdiff_t k = 0; k < size; k++) {

        a = 0.0;

        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) +  k;

        for (j = 0; j < npairs; j++) {

          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;

          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;

          /* Antisymmetric contribution of pair j: weight times (positive - negative) sample. */
          const double diff = fi[indexj_pos] - fi[indexj_neg];

          a += wpair[j] * diff;

        }

        fo[base_ik] = a;

      }

    }

#endif


  } /* l */


  // this fence instruction is needed to ensure correctness when using

  // non-temporal stores

#if defined(ALIGNED) && defined(FENCE)

  FENCE;

#endif

}


/* Drop the per-inclusion aliases so this fragment can be included again
 * with a different ALIGNED setting. */
#undef STORE
#undef LOAD

ptrdiff_t
long int ptrdiff_t
Definition: operate.c:15

index_neg
const int *restrict index_neg
Definition: operate_antisym_inc.c:14

nri
const ptrdiff_t nri
Definition: operate_antisym_inc.c:10

l
ptrdiff_t l
Definition: operate_antisym_inc.c:12

i
ptrdiff_t i
Definition: operate_antisym_inc.c:12

j
ptrdiff_t j
Definition: operate_antisym_inc.c:12

index_pos
const int *restrict index_pos
Definition: operate_antisym_inc.c:13