  /* Unroll factors: number of grid points advanced per pass at each
     unrolling depth, i.e. (N * VEC_SIZE) / (1 << ldf), clamped to at
     least 1 by max1 so the loop index always moves forward. */
  const ptrdiff_t unroll32 = max1(32 * VEC_SIZE >> ldf);
  const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf);
  const ptrdiff_t unroll8 = max1(8 * VEC_SIZE >> ldf);
  const ptrdiff_t unroll4 = max1(4 * VEC_SIZE >> ldf);
  const ptrdiff_t unroll2 = max1(2 * VEC_SIZE >> ldf);
  const ptrdiff_t unroll1 = max1(1 * VEC_SIZE >> ldf);
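  /* Worked example (illustrative assumption, not fixed by this file):
     with VEC_SIZE = 4 doubles (AVX) and ldf = 2, each point carries
     1 << 2 = 4 values, so unroll32 = max1((32 * 4) >> 2) = 32 points per
     pass; with ldf = 8 (256 values per point), unroll32 = max1(128 >> 8)
     = max1(0) = 1, and one pass covers a single point. */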
  ptrdiff_t l, i, j;
  const int *restrict index;

  for (l = 0; l < nri; l++) {
    /* Per-run setup, reconstructed from the surrounding loops: index
       points at the n stencil offsets of run l, and i starts at the run's
       first point. The names opri and rimap_inv are assumptions chosen to
       match the rimap_inv_max bounds used below. */
    index = opri + n * l;
    i = rimap_inv[l];

    /* 32-fold unrolled body: 32 vector accumulators per pass. */
    for (; i < (rimap_inv_max[l] - unroll32 + 1); i += unroll32) {
      ptrdiff_t k;
      for (k = 0; k < (1 << ldf); k += 32 * VEC_SIZE) {
        register VEC_TYPE a0, a1, a2, a3;
        register VEC_TYPE a4, a5, a6, a7;
        register VEC_TYPE a8, a9, aa, ab;
        register VEC_TYPE ac, ad, ae, af;
        register VEC_TYPE b0, b1, b2, b3;
        register VEC_TYPE b4, b5, b6, b7;
        register VEC_TYPE b8, b9, ba, bb;
        register VEC_TYPE bc, bd, be, bf;

        a0 = a1 = a2 = a3 = VEC_ZERO;
        a4 = a5 = a6 = a7 = VEC_ZERO;
        a8 = a9 = aa = ab = VEC_ZERO;
        ac = ad = ae = af = VEC_ZERO;
        b0 = b1 = b2 = b3 = VEC_ZERO;
        b4 = b5 = b6 = b7 = VEC_ZERO;
        b8 = b9 = ba = bb = VEC_ZERO;
        bc = bd = be = bf = VEC_ZERO;
        for (j = 0; j < n; j++) {
          /* The two alternative wj initializations can only coexist under
             conditional compilation; the guard below is reconstructed. */
#ifdef VEC_SCAL_LD
          register VEC_TYPE wj = VEC_SCAL_LD(w + j);
#else
          register VEC_TYPE wj = VEC_SCAL(w[j]);
#endif
          /* Base offset of stencil neighbour j of point i, recovered from
             the scalar tail loop at the end of this section. */
          ptrdiff_t indexj = (index[j] + i) << ldf;
          a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
          a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
          a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2);
          a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3);
          a4 = VEC_FMA(wj, LOAD(fi + indexj + 4 * VEC_SIZE + k), a4);
          a5 = VEC_FMA(wj, LOAD(fi + indexj + 5 * VEC_SIZE + k), a5);
          a6 = VEC_FMA(wj, LOAD(fi + indexj + 6 * VEC_SIZE + k), a6);
          a7 = VEC_FMA(wj, LOAD(fi + indexj + 7 * VEC_SIZE + k), a7);
          a8 = VEC_FMA(wj, LOAD(fi + indexj + 8 * VEC_SIZE + k), a8);
          a9 = VEC_FMA(wj, LOAD(fi + indexj + 9 * VEC_SIZE + k), a9);
          aa = VEC_FMA(wj, LOAD(fi + indexj + 10 * VEC_SIZE + k), aa);
          ab = VEC_FMA(wj, LOAD(fi + indexj + 11 * VEC_SIZE + k), ab);
          ac = VEC_FMA(wj, LOAD(fi + indexj + 12 * VEC_SIZE + k), ac);
          ad = VEC_FMA(wj, LOAD(fi + indexj + 13 * VEC_SIZE + k), ad);
          ae = VEC_FMA(wj, LOAD(fi + indexj + 14 * VEC_SIZE + k), ae);
          af = VEC_FMA(wj, LOAD(fi + indexj + 15 * VEC_SIZE + k), af);
          b0 = VEC_FMA(wj, LOAD(fi + indexj + 16 * VEC_SIZE + k), b0);
          b1 = VEC_FMA(wj, LOAD(fi + indexj + 17 * VEC_SIZE + k), b1);
          b2 = VEC_FMA(wj, LOAD(fi + indexj + 18 * VEC_SIZE + k), b2);
          b3 = VEC_FMA(wj, LOAD(fi + indexj + 19 * VEC_SIZE + k), b3);
          b4 = VEC_FMA(wj, LOAD(fi + indexj + 20 * VEC_SIZE + k), b4);
          b5 = VEC_FMA(wj, LOAD(fi + indexj + 21 * VEC_SIZE + k), b5);
          b6 = VEC_FMA(wj, LOAD(fi + indexj + 22 * VEC_SIZE + k), b6);
          b7 = VEC_FMA(wj, LOAD(fi + indexj + 23 * VEC_SIZE + k), b7);
          b8 = VEC_FMA(wj, LOAD(fi + indexj + 24 * VEC_SIZE + k), b8);
          b9 = VEC_FMA(wj, LOAD(fi + indexj + 25 * VEC_SIZE + k), b9);
          ba = VEC_FMA(wj, LOAD(fi + indexj + 26 * VEC_SIZE + k), ba);
          bb = VEC_FMA(wj, LOAD(fi + indexj + 27 * VEC_SIZE + k), bb);
          bc = VEC_FMA(wj, LOAD(fi + indexj + 28 * VEC_SIZE + k), bc);
          bd = VEC_FMA(wj, LOAD(fi + indexj + 29 * VEC_SIZE + k), bd);
          be = VEC_FMA(wj, LOAD(fi + indexj + 30 * VEC_SIZE + k), be);
          bf = VEC_FMA(wj, LOAD(fi + indexj + 31 * VEC_SIZE + k), bf);
        }

        STORE(fo + (i << ldf) + k, a0);
        STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
        STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
        STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
        STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4);
        STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5);
        STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6);
        STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7);
        STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8);
        STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9);
        STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa);
        STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab);
        STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac);
        STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad);
        STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae);
        STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af);
        STORE(fo + (i << ldf) + 16 * VEC_SIZE + k, b0);
        STORE(fo + (i << ldf) + 17 * VEC_SIZE + k, b1);
        STORE(fo + (i << ldf) + 18 * VEC_SIZE + k, b2);
        STORE(fo + (i << ldf) + 19 * VEC_SIZE + k, b3);
        STORE(fo + (i << ldf) + 20 * VEC_SIZE + k, b4);
        STORE(fo + (i << ldf) + 21 * VEC_SIZE + k, b5);
        STORE(fo + (i << ldf) + 22 * VEC_SIZE + k, b6);
        STORE(fo + (i << ldf) + 23 * VEC_SIZE + k, b7);
        STORE(fo + (i << ldf) + 24 * VEC_SIZE + k, b8);
        STORE(fo + (i << ldf) + 25 * VEC_SIZE + k, b9);
        STORE(fo + (i << ldf) + 26 * VEC_SIZE + k, ba);
        STORE(fo + (i << ldf) + 27 * VEC_SIZE + k, bb);
        STORE(fo + (i << ldf) + 28 * VEC_SIZE + k, bc);
        STORE(fo + (i << ldf) + 29 * VEC_SIZE + k, bd);
        STORE(fo + (i << ldf) + 30 * VEC_SIZE + k, be);
        STORE(fo + (i << ldf) + 31 * VEC_SIZE + k, bf);
      }
    }
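    /* Note (assumption): the 32-accumulator body above presumes an ISA
       with a large vector register file (e.g. the 32 registers of AVX-512
       or AArch64 NEON); on narrower register files the compiler spills. */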

    /* 16-fold unrolled body. */
    for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) {
      ptrdiff_t k;
      for (k = 0; k < (1 << ldf); k += 16 * VEC_SIZE) {
        register VEC_TYPE a0, a1, a2, a3;
        register VEC_TYPE a4, a5, a6, a7;
        register VEC_TYPE a8, a9, aa, ab;
        register VEC_TYPE ac, ad, ae, af;

        a0 = a1 = a2 = a3 = VEC_ZERO;
        a4 = a5 = a6 = a7 = VEC_ZERO;
        a8 = a9 = aa = ab = VEC_ZERO;
        ac = ad = ae = af = VEC_ZERO;
        for (j = 0; j < n; j++) {
#ifdef VEC_SCAL_LD
          register VEC_TYPE wj = VEC_SCAL_LD(w + j);
#else
          register VEC_TYPE wj = VEC_SCAL(w[j]);
#endif
          ptrdiff_t indexj = (index[j] + i) << ldf;
          a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
          a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
          a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2);
          a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3);
          a4 = VEC_FMA(wj, LOAD(fi + indexj + 4 * VEC_SIZE + k), a4);
          a5 = VEC_FMA(wj, LOAD(fi + indexj + 5 * VEC_SIZE + k), a5);
          a6 = VEC_FMA(wj, LOAD(fi + indexj + 6 * VEC_SIZE + k), a6);
          a7 = VEC_FMA(wj, LOAD(fi + indexj + 7 * VEC_SIZE + k), a7);
          a8 = VEC_FMA(wj, LOAD(fi + indexj + 8 * VEC_SIZE + k), a8);
          a9 = VEC_FMA(wj, LOAD(fi + indexj + 9 * VEC_SIZE + k), a9);
          aa = VEC_FMA(wj, LOAD(fi + indexj + 10 * VEC_SIZE + k), aa);
          ab = VEC_FMA(wj, LOAD(fi + indexj + 11 * VEC_SIZE + k), ab);
          ac = VEC_FMA(wj, LOAD(fi + indexj + 12 * VEC_SIZE + k), ac);
          ad = VEC_FMA(wj, LOAD(fi + indexj + 13 * VEC_SIZE + k), ad);
          ae = VEC_FMA(wj, LOAD(fi + indexj + 14 * VEC_SIZE + k), ae);
          af = VEC_FMA(wj, LOAD(fi + indexj + 15 * VEC_SIZE + k), af);
        }

        STORE(fo + (i << ldf) + k, a0);
        STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
        STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
        STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
        STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4);
        STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5);
        STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6);
        STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7);
        STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8);
        STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9);
        STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa);
        STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab);
        STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac);
        STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad);
        STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae);
        STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af);
      }
    }

    /* 8-fold unrolled body. */
    for (; i < (rimap_inv_max[l] - unroll8 + 1); i += unroll8) {
      ptrdiff_t k;
      for (k = 0; k < (1 << ldf); k += 8 * VEC_SIZE) {
        register VEC_TYPE a0, a1, a2, a3;
        register VEC_TYPE a4, a5, a6, a7;

        a0 = a1 = a2 = a3 = VEC_ZERO;
        a4 = a5 = a6 = a7 = VEC_ZERO;
        for (j = 0; j < n; j++) {
#ifdef VEC_SCAL_LD
          register VEC_TYPE wj = VEC_SCAL_LD(w + j);
#else
          register VEC_TYPE wj = VEC_SCAL(w[j]);
#endif
          ptrdiff_t indexj = (index[j] + i) << ldf;
          a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
          a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
          a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2);
          a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3);
          a4 = VEC_FMA(wj, LOAD(fi + indexj + 4 * VEC_SIZE + k), a4);
          a5 = VEC_FMA(wj, LOAD(fi + indexj + 5 * VEC_SIZE + k), a5);
          a6 = VEC_FMA(wj, LOAD(fi + indexj + 6 * VEC_SIZE + k), a6);
          a7 = VEC_FMA(wj, LOAD(fi + indexj + 7 * VEC_SIZE + k), a7);
        }

        STORE(fo + (i << ldf) + k, a0);
        STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
        STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
        STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
        STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4);
        STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5);
        STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6);
        STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7);
      }
    }

    /* 4-fold unrolled body. */
    for (; i < (rimap_inv_max[l] - unroll4 + 1); i += unroll4) {
      ptrdiff_t k;
      for (k = 0; k < (1 << ldf); k += 4 * VEC_SIZE) {
        register VEC_TYPE a0, a1, a2, a3;

        a0 = a1 = a2 = a3 = VEC_ZERO;
        for (j = 0; j < n; j++) {
#ifdef VEC_SCAL_LD
          register VEC_TYPE wj = VEC_SCAL_LD(w + j);
#else
          register VEC_TYPE wj = VEC_SCAL(w[j]);
#endif
          ptrdiff_t indexj = (index[j] + i) << ldf;
          a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
          a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
          a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2);
          a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3);
        }

        STORE(fo + (i << ldf) + k, a0);
        STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
        STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
        STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
      }
    }

    /* 2-fold unrolled body. */
    for (; i < (rimap_inv_max[l] - unroll2 + 1); i += unroll2) {
      ptrdiff_t k;
      for (k = 0; k < (1 << ldf); k += 2 * VEC_SIZE) {
        register VEC_TYPE a0, a1;

        /* Zero initialization restored: it is present in every other
           unrolled body but was missing from this one. */
        a0 = a1 = VEC_ZERO;
        for (j = 0; j < n; j++) {
#ifdef VEC_SCAL_LD
          register VEC_TYPE wj = VEC_SCAL_LD(w + j);
#else
          register VEC_TYPE wj = VEC_SCAL(w[j]);
#endif
          ptrdiff_t indexj = (index[j] + i) << ldf;
          a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
          a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
        }

        STORE(fo + (i << ldf) + k, a0);
        STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
      }
    }

    /* Single-vector body. */
    for (; i < (rimap_inv_max[l] - unroll1 + 1); i += unroll1) {
      ptrdiff_t k;
      for (k = 0; k < (1 << ldf); k += VEC_SIZE) {
        register VEC_TYPE a0 = VEC_ZERO;
        for (j = 0; j < n; j++) {
#ifdef VEC_SCAL_LD
          register VEC_TYPE wj = VEC_SCAL_LD(w + j);
#else
          register VEC_TYPE wj = VEC_SCAL(w[j]);
#endif
          ptrdiff_t indexj = (index[j] + i) << ldf;
          a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
        }

        STORE(fo + (i << ldf) + k, a0);
      }
    }

    /* Scalar tail: remaining points that do not fill a whole vector. */
    for (; i < rimap_inv_max[l]; i++) {
      ptrdiff_t k;
      for (k = 0; k < (1 << ldf); k++) {
        /* Scalar accumulator; the element type is assumed to be double,
           consistent with the VEC_* double vectors above. */
        double a = 0.0;
        for (j = 0; j < n; j++)
          a += w[j] * fi[((index[j] + i) << ldf) + k];
        fo[(i << ldf) + k] = a;
      }
    }
  }
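
  /* For orientation only: one plausible way the vector macros used above
     could be instantiated for AVX2 + FMA doubles. The project's real
     definitions live elsewhere in the build; every width and intrinsic
     choice here is an assumption, not this file's actual configuration.

       #include <immintrin.h>
       #define VEC_SIZE       4                         // doubles per vector
       #define VEC_TYPE       __m256d
       #define VEC_ZERO       _mm256_setzero_pd()
       #define VEC_SCAL(x)    _mm256_set1_pd(x)         // broadcast scalar
       #define VEC_FMA(a,b,c) _mm256_fmadd_pd(a, b, c)  // a*b + c
       #ifdef ALIGNED
       #define LOAD  _mm256_load_pd                     // 32-byte aligned
       #define STORE _mm256_store_pd
       #else
       #define LOAD  _mm256_loadu_pd                    // unaligned
       #define STORE _mm256_storeu_pd
       #endif
  */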
#if defined(ALIGNED) && defined(FENCE)
  const int *restrict index;