const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf);
const ptrdiff_t unroll8 = max1(8 * VEC_SIZE >> ldf);
const ptrdiff_t unroll4 = max1(4 * VEC_SIZE >> ldf);
const ptrdiff_t unroll2 = max1(2 * VEC_SIZE >> ldf);
const ptrdiff_t unroll1 = max1(1 * VEC_SIZE >> ldf);
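
/* Each grid point occupies (1 << ldf) contiguous elements, so a run of
   N * VEC_SIZE consecutive elements covers max1(N * VEC_SIZE >> ldf) points.
   The unrollN constants above are therefore the point strides of the 16-,
   8-, 4-, 2- and 1-vector loops below. */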

const int *restrict index;

for (l = 0; l < nri; l++) {
  /* The per-row setup of 'index' and 'i' is missing from this copy. The two
     assignments below are a restoration that assumes 'opri' packs the n
     stencil offsets of each row contiguously and that 'rimap_inv[l]' (the
     counterpart of rimap_inv_max[l] used below) gives the first point of
     row l. */
  index = opri + n * l;
  i = rimap_inv[l];
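
  /* Widest path: 16 vector accumulators in flight for each k slice of
     unroll16 points. */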
  for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) {
    for (k = 0; k < (1 << ldf); k += 16 * VEC_SIZE) {
      register VEC_TYPE a0, a1, a2, a3;
      register VEC_TYPE a4, a5, a6, a7;
      register VEC_TYPE a8, a9, aa, ab;
      register VEC_TYPE ac, ad, ae, af;

      a0 = a1 = a2 = a3 = VEC_ZERO;
      a4 = a5 = a6 = a7 = VEC_ZERO;
      a8 = a9 = aa = ab = VEC_ZERO;
      ac = ad = ae = af = VEC_ZERO;

      for (j = 0; j < n; j++) {
        register VEC_TYPE wj = VEC_SCAL(w[j]);
        /* Offset of neighbour point index[j] + i. This declaration was lost
           in this copy; it is restored from the index arithmetic of the
           scalar tail loop at the end of the row. */
        const ptrdiff_t indexj = (index[j] + i) << ldf;

        a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
        a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
        a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2);
        a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3);
        a4 = VEC_FMA(wj, LOAD(fi + indexj + 4 * VEC_SIZE + k), a4);
        a5 = VEC_FMA(wj, LOAD(fi + indexj + 5 * VEC_SIZE + k), a5);
        a6 = VEC_FMA(wj, LOAD(fi + indexj + 6 * VEC_SIZE + k), a6);
        a7 = VEC_FMA(wj, LOAD(fi + indexj + 7 * VEC_SIZE + k), a7);
        a8 = VEC_FMA(wj, LOAD(fi + indexj + 8 * VEC_SIZE + k), a8);
        a9 = VEC_FMA(wj, LOAD(fi + indexj + 9 * VEC_SIZE + k), a9);
        aa = VEC_FMA(wj, LOAD(fi + indexj + 10 * VEC_SIZE + k), aa);
        ab = VEC_FMA(wj, LOAD(fi + indexj + 11 * VEC_SIZE + k), ab);
        ac = VEC_FMA(wj, LOAD(fi + indexj + 12 * VEC_SIZE + k), ac);
        ad = VEC_FMA(wj, LOAD(fi + indexj + 13 * VEC_SIZE + k), ad);
        ae = VEC_FMA(wj, LOAD(fi + indexj + 14 * VEC_SIZE + k), ae);
        af = VEC_FMA(wj, LOAD(fi + indexj + 15 * VEC_SIZE + k), af);
      }

      STORE(fo + (i << ldf) + k, a0);
      STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
      STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
      STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
      STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4);
      STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5);
      STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6);
      STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7);
      STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8);
      STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9);
      STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa);
      STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab);
      STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac);
      STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad);
      STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae);
      STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af);
    }
  }
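
  /* 8-accumulator version of the same pattern, for what remains of the
     row. */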
  for (; i < (rimap_inv_max[l] - unroll8 + 1); i += unroll8) {
    for (k = 0; k < (1 << ldf); k += 8 * VEC_SIZE) {
      register VEC_TYPE a0, a1, a2, a3;
      register VEC_TYPE a4, a5, a6, a7;

      a0 = a1 = a2 = a3 = VEC_ZERO;
      a4 = a5 = a6 = a7 = VEC_ZERO;

      for (j = 0; j < n; j++) {
        register VEC_TYPE wj = VEC_SCAL(w[j]);
        const ptrdiff_t indexj = (index[j] + i) << ldf; /* restored, as above */

        a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
        a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
        a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2);
        a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3);
        a4 = VEC_FMA(wj, LOAD(fi + indexj + 4 * VEC_SIZE + k), a4);
        a5 = VEC_FMA(wj, LOAD(fi + indexj + 5 * VEC_SIZE + k), a5);
        a6 = VEC_FMA(wj, LOAD(fi + indexj + 6 * VEC_SIZE + k), a6);
        a7 = VEC_FMA(wj, LOAD(fi + indexj + 7 * VEC_SIZE + k), a7);
      }

      STORE(fo + (i << ldf) + k, a0);
      STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
      STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
      STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
      STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4);
      STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5);
      STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6);
      STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7);
    }
  }
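
  /* 4-accumulator remainder loop. */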
  for (; i < (rimap_inv_max[l] - unroll4 + 1); i += unroll4) {
    for (k = 0; k < (1 << ldf); k += 4 * VEC_SIZE) {
      register VEC_TYPE a0, a1, a2, a3;

      a0 = a1 = a2 = a3 = VEC_ZERO;

      for (j = 0; j < n; j++) {
        register VEC_TYPE wj = VEC_SCAL(w[j]);
        const ptrdiff_t indexj = (index[j] + i) << ldf; /* restored, as above */

        a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
        a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
        a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2);
        a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3);
      }

      STORE(fo + (i << ldf) + k, a0);
      STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
      STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
      STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
    }
  }
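
  /* 2-accumulator remainder loop. */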
  for (; i < (rimap_inv_max[l] - unroll2 + 1); i += unroll2) {
    for (k = 0; k < (1 << ldf); k += 2 * VEC_SIZE) {
      register VEC_TYPE a0, a1;

      /* The zero initialisation of a0 and a1 was missing in this copy;
         without it the accumulators would be read uninitialised. */
      a0 = a1 = VEC_ZERO;

      for (j = 0; j < n; j++) {
        register VEC_TYPE wj = VEC_SCAL(w[j]);
        const ptrdiff_t indexj = (index[j] + i) << ldf; /* restored, as above */

        a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
        a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
      }

      STORE(fo + (i << ldf) + k, a0);
      STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
    }
  }
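
  /* Single-vector remainder loop. */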
  for (; i < (rimap_inv_max[l] - unroll1 + 1); i += unroll1) {
    for (k = 0; k < (1 << ldf); k += VEC_SIZE) {
      register VEC_TYPE a0 = VEC_ZERO;

      for (j = 0; j < n; j++) {
        register VEC_TYPE wj = VEC_SCAL(w[j]);
        const ptrdiff_t indexj = (index[j] + i) << ldf; /* restored, as above */
        a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
      }

      STORE(fo + (i << ldf) + k, a0);
    }
  }
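
  /* Scalar tail: finishes the last points of the row one element at a time,
     without vector loads or stores. */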
  for (; i < rimap_inv_max[l]; i++) {
    for (k = 0; k < (1 << ldf); k++) {
      /* The accumulator declaration was missing in this copy; 'double'
         assumes the real-valued variant of this kernel. */
      double a = 0.0;
      for (j = 0; j < n; j++)
        a += w[j] * fi[((index[j] + i) << ldf) + k];
      fo[(i << ldf) + k] = a;
    }
  }
}

#if defined(ALIGNED) && defined(FENCE)
const int *restrict index