  const ptrdiff_t unroll32 = max1(32 * VEC_SIZE >> ldf);
  const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf);
  const ptrdiff_t unroll8  = max1(8 * VEC_SIZE >> ldf);
  const ptrdiff_t unroll4  = max1(4 * VEC_SIZE >> ldf);
  const ptrdiff_t unroll2  = max1(2 * VEC_SIZE >> ldf);
  const ptrdiff_t unroll1  = max1(1 * VEC_SIZE >> ldf);
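
  /* Each unrolled pass below advances i by max1(N * VEC_SIZE >> ldf) grid
     points, so that N vector registers cover one (1 << ldf)-wide block of
     functions per step; max1 presumably clamps the stride to at least one. */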
 
  const int *restrict index;
 
  for (l = 0; l < nri; l++) {
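
    /* Per-run setup is elided in this excerpt: the full source points `index`
       at the n neighbor offsets of run l and resets `i` to the first point of
       the run before the unrolled passes below. */

    /* Widest pass: 32 vector accumulators (a0..af, b0..bf); each output point
       is a weighted sum over the n neighbor points given by index[]. */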
 
    for (; i < (rimap_inv_max[l] - unroll32 + 1); i += unroll32) {
 
      for (k = 0; k < (1 << ldf); k += 32 * VEC_SIZE) {
 
        register VEC_TYPE a0, a1, a2, a3;
        register VEC_TYPE a4, a5, a6, a7;
        register VEC_TYPE a8, a9, aa, ab;
        register VEC_TYPE ac, ad, ae, af;
        register VEC_TYPE b0, b1, b2, b3;
        register VEC_TYPE b4, b5, b6, b7;
        register VEC_TYPE b8, b9, ba, bb;
        register VEC_TYPE bc, bd, be, bf;

        a0 = a1 = a2 = a3 = VEC_ZERO;
        a4 = a5 = a6 = a7 = VEC_ZERO;
        a8 = a9 = aa = ab = VEC_ZERO;
        ac = ad = ae = af = VEC_ZERO;
        b0 = b1 = b2 = b3 = VEC_ZERO;
        b4 = b5 = b6 = b7 = VEC_ZERO;
        b8 = b9 = ba = bb = VEC_ZERO;
        bc = bd = be = bf = VEC_ZERO;
 
        for (j = 0; j < n; j++) {
#ifdef VEC_SCAL_LD
          register VEC_TYPE wj = VEC_SCAL_LD(w + j);
#else
          register VEC_TYPE wj = VEC_SCAL(w[j]);
#endif
          /* offset of neighbor index[j] for point i, as in the scalar tail */
          register ptrdiff_t indexj = (index[j] + i) << ldf;
 
          a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
          a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
          a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2);
          a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3);
          a4 = VEC_FMA(wj, LOAD(fi + indexj + 4 * VEC_SIZE + k), a4);
          a5 = VEC_FMA(wj, LOAD(fi + indexj + 5 * VEC_SIZE + k), a5);
          a6 = VEC_FMA(wj, LOAD(fi + indexj + 6 * VEC_SIZE + k), a6);
          a7 = VEC_FMA(wj, LOAD(fi + indexj + 7 * VEC_SIZE + k), a7);
          a8 = VEC_FMA(wj, LOAD(fi + indexj + 8 * VEC_SIZE + k), a8);
          a9 = VEC_FMA(wj, LOAD(fi + indexj + 9 * VEC_SIZE + k), a9);
          aa = VEC_FMA(wj, LOAD(fi + indexj + 10 * VEC_SIZE + k), aa);
          ab = VEC_FMA(wj, LOAD(fi + indexj + 11 * VEC_SIZE + k), ab);
          ac = VEC_FMA(wj, LOAD(fi + indexj + 12 * VEC_SIZE + k), ac);
          ad = VEC_FMA(wj, LOAD(fi + indexj + 13 * VEC_SIZE + k), ad);
          ae = VEC_FMA(wj, LOAD(fi + indexj + 14 * VEC_SIZE + k), ae);
          af = VEC_FMA(wj, LOAD(fi + indexj + 15 * VEC_SIZE + k), af);
          b0 = VEC_FMA(wj, LOAD(fi + indexj + 16 * VEC_SIZE + k), b0);
          b1 = VEC_FMA(wj, LOAD(fi + indexj + 17 * VEC_SIZE + k), b1);
          b2 = VEC_FMA(wj, LOAD(fi + indexj + 18 * VEC_SIZE + k), b2);
          b3 = VEC_FMA(wj, LOAD(fi + indexj + 19 * VEC_SIZE + k), b3);
          b4 = VEC_FMA(wj, LOAD(fi + indexj + 20 * VEC_SIZE + k), b4);
          b5 = VEC_FMA(wj, LOAD(fi + indexj + 21 * VEC_SIZE + k), b5);
          b6 = VEC_FMA(wj, LOAD(fi + indexj + 22 * VEC_SIZE + k), b6);
          b7 = VEC_FMA(wj, LOAD(fi + indexj + 23 * VEC_SIZE + k), b7);
          b8 = VEC_FMA(wj, LOAD(fi + indexj + 24 * VEC_SIZE + k), b8);
          b9 = VEC_FMA(wj, LOAD(fi + indexj + 25 * VEC_SIZE + k), b9);
          ba = VEC_FMA(wj, LOAD(fi + indexj + 26 * VEC_SIZE + k), ba);
          bb = VEC_FMA(wj, LOAD(fi + indexj + 27 * VEC_SIZE + k), bb);
          bc = VEC_FMA(wj, LOAD(fi + indexj + 28 * VEC_SIZE + k), bc);
          bd = VEC_FMA(wj, LOAD(fi + indexj + 29 * VEC_SIZE + k), bd);
          be = VEC_FMA(wj, LOAD(fi + indexj + 30 * VEC_SIZE + k), be);
          bf = VEC_FMA(wj, LOAD(fi + indexj + 31 * VEC_SIZE + k), bf);
        }
 
        STORE(fo + (i << ldf) + k, a0);
        STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
        STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
        STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
        STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4);
        STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5);
        STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6);
        STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7);
        STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8);
        STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9);
        STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa);
        STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab);
        STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac);
        STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad);
        STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae);
        STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af);
        STORE(fo + (i << ldf) + 16 * VEC_SIZE + k, b0);
        STORE(fo + (i << ldf) + 17 * VEC_SIZE + k, b1);
        STORE(fo + (i << ldf) + 18 * VEC_SIZE + k, b2);
        STORE(fo + (i << ldf) + 19 * VEC_SIZE + k, b3);
        STORE(fo + (i << ldf) + 20 * VEC_SIZE + k, b4);
        STORE(fo + (i << ldf) + 21 * VEC_SIZE + k, b5);
        STORE(fo + (i << ldf) + 22 * VEC_SIZE + k, b6);
        STORE(fo + (i << ldf) + 23 * VEC_SIZE + k, b7);
        STORE(fo + (i << ldf) + 24 * VEC_SIZE + k, b8);
        STORE(fo + (i << ldf) + 25 * VEC_SIZE + k, b9);
        STORE(fo + (i << ldf) + 26 * VEC_SIZE + k, ba);
        STORE(fo + (i << ldf) + 27 * VEC_SIZE + k, bb);
        STORE(fo + (i << ldf) + 28 * VEC_SIZE + k, bc);
        STORE(fo + (i << ldf) + 29 * VEC_SIZE + k, bd);
        STORE(fo + (i << ldf) + 30 * VEC_SIZE + k, be);
        STORE(fo + (i << ldf) + 31 * VEC_SIZE + k, bf);
      }
    }
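
    /* Same weighted sum with 16 accumulators, for runs too short to feed the
       32-register pass. */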
 
    for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) {
 
      for (k = 0; k < (1 << ldf); k += 16 * VEC_SIZE) {
 
        register VEC_TYPE a0, a1, a2, a3;
        register VEC_TYPE a4, a5, a6, a7;
        register VEC_TYPE a8, a9, aa, ab;
        register VEC_TYPE ac, ad, ae, af;

        a0 = a1 = a2 = a3 = VEC_ZERO;
        a4 = a5 = a6 = a7 = VEC_ZERO;
        a8 = a9 = aa = ab = VEC_ZERO;
        ac = ad = ae = af = VEC_ZERO;
 
        for (j = 0; j < n; j++) {
#ifdef VEC_SCAL_LD
          register VEC_TYPE wj = VEC_SCAL_LD(w + j);
#else
          register VEC_TYPE wj = VEC_SCAL(w[j]);
#endif
          register ptrdiff_t indexj = (index[j] + i) << ldf;
 
          a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
          a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
          a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2);
          a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3);
          a4 = VEC_FMA(wj, LOAD(fi + indexj + 4 * VEC_SIZE + k), a4);
          a5 = VEC_FMA(wj, LOAD(fi + indexj + 5 * VEC_SIZE + k), a5);
          a6 = VEC_FMA(wj, LOAD(fi + indexj + 6 * VEC_SIZE + k), a6);
          a7 = VEC_FMA(wj, LOAD(fi + indexj + 7 * VEC_SIZE + k), a7);
          a8 = VEC_FMA(wj, LOAD(fi + indexj + 8 * VEC_SIZE + k), a8);
          a9 = VEC_FMA(wj, LOAD(fi + indexj + 9 * VEC_SIZE + k), a9);
          aa = VEC_FMA(wj, LOAD(fi + indexj + 10 * VEC_SIZE + k), aa);
          ab = VEC_FMA(wj, LOAD(fi + indexj + 11 * VEC_SIZE + k), ab);
          ac = VEC_FMA(wj, LOAD(fi + indexj + 12 * VEC_SIZE + k), ac);
          ad = VEC_FMA(wj, LOAD(fi + indexj + 13 * VEC_SIZE + k), ad);
          ae = VEC_FMA(wj, LOAD(fi + indexj + 14 * VEC_SIZE + k), ae);
          af = VEC_FMA(wj, LOAD(fi + indexj + 15 * VEC_SIZE + k), af);
        }
 
        STORE(fo + (i << ldf) + k, a0);
        STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
        STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
        STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
        STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4);
        STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5);
        STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6);
        STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7);
        STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8);
        STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9);
        STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa);
        STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab);
        STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac);
        STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad);
        STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae);
        STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af);
      }
    }
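
    /* 8-accumulator pass. */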
 
    for (; i < (rimap_inv_max[l] - unroll8 + 1); i += unroll8) {
 
      for (k = 0; k < (1 << ldf); k += 8 * VEC_SIZE) {
 
        register VEC_TYPE a0, a1, a2, a3;
        register VEC_TYPE a4, a5, a6, a7;

        a0 = a1 = a2 = a3 = VEC_ZERO;
        a4 = a5 = a6 = a7 = VEC_ZERO;
 
        for (j = 0; j < n; j++) {
#ifdef VEC_SCAL_LD
          register VEC_TYPE wj = VEC_SCAL_LD(w + j);
#else
          register VEC_TYPE wj = VEC_SCAL(w[j]);
#endif
          register ptrdiff_t indexj = (index[j] + i) << ldf;
 
          a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
          a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
          a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2);
          a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3);
          a4 = VEC_FMA(wj, LOAD(fi + indexj + 4 * VEC_SIZE + k), a4);
          a5 = VEC_FMA(wj, LOAD(fi + indexj + 5 * VEC_SIZE + k), a5);
          a6 = VEC_FMA(wj, LOAD(fi + indexj + 6 * VEC_SIZE + k), a6);
          a7 = VEC_FMA(wj, LOAD(fi + indexj + 7 * VEC_SIZE + k), a7);
        }
 
        STORE(fo + (i << ldf) + k, a0);
        STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
        STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
        STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
        STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4);
        STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5);
        STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6);
        STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7);
      }
    }
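
    /* 4-accumulator pass. */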
 
    for (; i < (rimap_inv_max[l] - unroll4 + 1); i += unroll4) {
 
      for (k = 0; k < (1 << ldf); k += 4 * VEC_SIZE) {
 
        register VEC_TYPE a0, a1, a2, a3;

        a0 = a1 = a2 = a3 = VEC_ZERO;
 
        for (j = 0; j < n; j++) {
#ifdef VEC_SCAL_LD
          register VEC_TYPE wj = VEC_SCAL_LD(w + j);
#else
          register VEC_TYPE wj = VEC_SCAL(w[j]);
#endif
          register ptrdiff_t indexj = (index[j] + i) << ldf;
 
          a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
          a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
          a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2);
          a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3);
        }
 
        STORE(fo + (i << ldf) + k, a0);
        STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
        STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
        STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
      }
    }
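
    /* 2-accumulator pass. */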
 
    for (; i < (rimap_inv_max[l] - unroll2 + 1); i += unroll2) {
 
      for (k = 0; k < (1 << ldf); k += 2 * VEC_SIZE) {
 
        register VEC_TYPE a0, a1;

        a0 = a1 = VEC_ZERO;
 
        for (j = 0; j < n; j++) {
#ifdef VEC_SCAL_LD
          register VEC_TYPE wj = VEC_SCAL_LD(w + j);
#else
          register VEC_TYPE wj = VEC_SCAL(w[j]);
#endif
          register ptrdiff_t indexj = (index[j] + i) << ldf;
 
          a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
          a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
        }
 
        STORE(fo + (i << ldf) + k, a0);
        STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
      }
    }
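
    /* Single-vector pass. */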
 
    for (; i < (rimap_inv_max[l] - unroll1 + 1); i += unroll1) {
 
      for (k = 0; k < (1 << ldf); k += VEC_SIZE) {
 
        register VEC_TYPE a0 = VEC_ZERO;
 
        for (j = 0; j < n; j++) {
#ifdef VEC_SCAL_LD
          register VEC_TYPE wj = VEC_SCAL_LD(w + j);
#else
          register VEC_TYPE wj = VEC_SCAL(w[j]);
#endif
          register ptrdiff_t indexj = (index[j] + i) << ldf;
 
          a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
        }
 
        STORE(fo + (i << ldf) + k, a0);
      }
    }
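
    /* Scalar tail for the remaining points. */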
 
    for (; i < rimap_inv_max[l]; i++) {
 
      for (k = 0; k < (1 << ldf); k++) {
        double a = 0.0; /* scalar accumulator; element type assumed double */
 
        for (j = 0; j < n; j++)
          a += w[j] * fi[((index[j] + i) << ldf) + k];
 
        fo[(i << ldf) + k] = a;
      }
    }
  }
 
#if defined(ALIGNED) && defined(FENCE)
  const int *restrict index