/* Octopus — operate_inc.c (see the Octopus documentation for this file) */
/*
 Copyright (C) 2006 X. Andrade

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2, or (at your option)
 any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 02110-1301, USA.
*/
20
21#ifdef ALIGNED
22#define LOAD VEC_LD
23#define STORE VEC_ST
24#else
25#define LOAD VEC_LDU
26#define STORE VEC_STU
27#endif
28
29{
30 const ptrdiff_t n = opn[0];
31 const ptrdiff_t nri = opnri[0];
32#if DEPTH >= 32
33 const ptrdiff_t unroll32 = max1(32 * VEC_SIZE >> ldf);
34#endif
35#if DEPTH >= 16
36 const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf);
37#endif
38#if DEPTH >= 8
39 const ptrdiff_t unroll8 = max1(8 * VEC_SIZE >> ldf);
40#endif
41#if DEPTH >= 4
42 const ptrdiff_t unroll4 = max1(4 * VEC_SIZE >> ldf);
43#endif
44#if DEPTH >= 2
45 const ptrdiff_t unroll2 = max1(2 * VEC_SIZE >> ldf);
46#endif
47#if DEPTH >= 1
48 const ptrdiff_t unroll1 = max1(1 * VEC_SIZE >> ldf);
49#endif
50
51 ptrdiff_t l, i, j;
52 const int *restrict index;
53
54 for (l = 0; l < nri; l++) {
55 index = opri + n * l;
56 i = rimap_inv[l];
57
58#if DEPTH >= 32
59 for (; i < (rimap_inv_max[l] - unroll32 + 1); i += unroll32) {
60 ptrdiff_t k;
61 for (k = 0; k < (1 << ldf); k += 32 * VEC_SIZE) {
62 register VEC_TYPE a0, a1, a2, a3;
63 register VEC_TYPE a4, a5, a6, a7;
64 register VEC_TYPE a8, a9, aa, ab;
65 register VEC_TYPE ac, ad, ae, af;
66 register VEC_TYPE b0, b1, b2, b3;
67 register VEC_TYPE b4, b5, b6, b7;
68 register VEC_TYPE b8, b9, ba, bb;
69 register VEC_TYPE bc, bd, be, bf;
70
71 a0 = a1 = a2 = a3 = VEC_ZERO;
72 a4 = a5 = a6 = a7 = VEC_ZERO;
73 a8 = a9 = aa = ab = VEC_ZERO;
74 ac = ad = ae = af = VEC_ZERO;
75 b0 = b1 = b2 = b3 = VEC_ZERO;
76 b4 = b5 = b6 = b7 = VEC_ZERO;
77 b8 = b9 = ba = bb = VEC_ZERO;
78 bc = bd = be = bf = VEC_ZERO;
79
80 for (j = 0; j < n; j++) {
81#ifdef VEC_SCAL_LD
82 register VEC_TYPE wj = VEC_SCAL_LD(w + j);
83#else
84 register VEC_TYPE wj = VEC_SCAL(w[j]);
85#endif
86 ptrdiff_t indexj = (index[j] + i) << ldf;
87 a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
88 a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
89 a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2);
90 a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3);
91 a4 = VEC_FMA(wj, LOAD(fi + indexj + 4 * VEC_SIZE + k), a4);
92 a5 = VEC_FMA(wj, LOAD(fi + indexj + 5 * VEC_SIZE + k), a5);
93 a6 = VEC_FMA(wj, LOAD(fi + indexj + 6 * VEC_SIZE + k), a6);
94 a7 = VEC_FMA(wj, LOAD(fi + indexj + 7 * VEC_SIZE + k), a7);
95 a8 = VEC_FMA(wj, LOAD(fi + indexj + 8 * VEC_SIZE + k), a8);
96 a9 = VEC_FMA(wj, LOAD(fi + indexj + 9 * VEC_SIZE + k), a9);
97 aa = VEC_FMA(wj, LOAD(fi + indexj + 10 * VEC_SIZE + k), aa);
98 ab = VEC_FMA(wj, LOAD(fi + indexj + 11 * VEC_SIZE + k), ab);
99 ac = VEC_FMA(wj, LOAD(fi + indexj + 12 * VEC_SIZE + k), ac);
100 ad = VEC_FMA(wj, LOAD(fi + indexj + 13 * VEC_SIZE + k), ad);
101 ae = VEC_FMA(wj, LOAD(fi + indexj + 14 * VEC_SIZE + k), ae);
102 af = VEC_FMA(wj, LOAD(fi + indexj + 15 * VEC_SIZE + k), af);
103 b0 = VEC_FMA(wj, LOAD(fi + indexj + 16 * VEC_SIZE + k), b0);
104 b1 = VEC_FMA(wj, LOAD(fi + indexj + 17 * VEC_SIZE + k), b1);
105 b2 = VEC_FMA(wj, LOAD(fi + indexj + 18 * VEC_SIZE + k), b2);
106 b3 = VEC_FMA(wj, LOAD(fi + indexj + 19 * VEC_SIZE + k), b3);
107 b4 = VEC_FMA(wj, LOAD(fi + indexj + 20 * VEC_SIZE + k), b4);
108 b5 = VEC_FMA(wj, LOAD(fi + indexj + 21 * VEC_SIZE + k), b5);
109 b6 = VEC_FMA(wj, LOAD(fi + indexj + 22 * VEC_SIZE + k), b6);
110 b7 = VEC_FMA(wj, LOAD(fi + indexj + 23 * VEC_SIZE + k), b7);
111 b8 = VEC_FMA(wj, LOAD(fi + indexj + 24 * VEC_SIZE + k), b8);
112 b9 = VEC_FMA(wj, LOAD(fi + indexj + 25 * VEC_SIZE + k), b9);
113 ba = VEC_FMA(wj, LOAD(fi + indexj + 26 * VEC_SIZE + k), ba);
114 bb = VEC_FMA(wj, LOAD(fi + indexj + 27 * VEC_SIZE + k), bb);
115 bc = VEC_FMA(wj, LOAD(fi + indexj + 28 * VEC_SIZE + k), bc);
116 bd = VEC_FMA(wj, LOAD(fi + indexj + 29 * VEC_SIZE + k), bd);
117 be = VEC_FMA(wj, LOAD(fi + indexj + 30 * VEC_SIZE + k), be);
118 bf = VEC_FMA(wj, LOAD(fi + indexj + 31 * VEC_SIZE + k), bf);
119 }
120 STORE(fo + (i << ldf) + k, a0);
121 STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
122 STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
123 STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
124 STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4);
125 STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5);
126 STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6);
127 STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7);
128 STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8);
129 STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9);
130 STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa);
131 STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab);
132 STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac);
133 STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad);
134 STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae);
135 STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af);
136 STORE(fo + (i << ldf) + 16 * VEC_SIZE + k, b0);
137 STORE(fo + (i << ldf) + 17 * VEC_SIZE + k, b1);
138 STORE(fo + (i << ldf) + 18 * VEC_SIZE + k, b2);
139 STORE(fo + (i << ldf) + 19 * VEC_SIZE + k, b3);
140 STORE(fo + (i << ldf) + 20 * VEC_SIZE + k, b4);
141 STORE(fo + (i << ldf) + 21 * VEC_SIZE + k, b5);
142 STORE(fo + (i << ldf) + 22 * VEC_SIZE + k, b6);
143 STORE(fo + (i << ldf) + 23 * VEC_SIZE + k, b7);
144 STORE(fo + (i << ldf) + 24 * VEC_SIZE + k, b8);
145 STORE(fo + (i << ldf) + 25 * VEC_SIZE + k, b9);
146 STORE(fo + (i << ldf) + 26 * VEC_SIZE + k, ba);
147 STORE(fo + (i << ldf) + 27 * VEC_SIZE + k, bb);
148 STORE(fo + (i << ldf) + 28 * VEC_SIZE + k, bc);
149 STORE(fo + (i << ldf) + 29 * VEC_SIZE + k, bd);
150 STORE(fo + (i << ldf) + 30 * VEC_SIZE + k, be);
151 STORE(fo + (i << ldf) + 31 * VEC_SIZE + k, bf);
152 }
153 }
154#endif
155
156#if DEPTH >= 16
157 for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) {
158 ptrdiff_t k;
159 for (k = 0; k < (1 << ldf); k += 16 * VEC_SIZE) {
160 register VEC_TYPE a0, a1, a2, a3;
161 register VEC_TYPE a4, a5, a6, a7;
162 register VEC_TYPE a8, a9, aa, ab;
163 register VEC_TYPE ac, ad, ae, af;
164
165 a0 = a1 = a2 = a3 = VEC_ZERO;
166 a4 = a5 = a6 = a7 = VEC_ZERO;
167 a8 = a9 = aa = ab = VEC_ZERO;
168 ac = ad = ae = af = VEC_ZERO;
169
170 for (j = 0; j < n; j++) {
171#ifdef VEC_SCAL_LD
172 register VEC_TYPE wj = VEC_SCAL_LD(w + j);
173#else
174 register VEC_TYPE wj = VEC_SCAL(w[j]);
175#endif
176 ptrdiff_t indexj = (index[j] + i) << ldf;
177 a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
178 a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
179 a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2);
180 a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3);
181 a4 = VEC_FMA(wj, LOAD(fi + indexj + 4 * VEC_SIZE + k), a4);
182 a5 = VEC_FMA(wj, LOAD(fi + indexj + 5 * VEC_SIZE + k), a5);
183 a6 = VEC_FMA(wj, LOAD(fi + indexj + 6 * VEC_SIZE + k), a6);
184 a7 = VEC_FMA(wj, LOAD(fi + indexj + 7 * VEC_SIZE + k), a7);
185 a8 = VEC_FMA(wj, LOAD(fi + indexj + 8 * VEC_SIZE + k), a8);
186 a9 = VEC_FMA(wj, LOAD(fi + indexj + 9 * VEC_SIZE + k), a9);
187 aa = VEC_FMA(wj, LOAD(fi + indexj + 10 * VEC_SIZE + k), aa);
188 ab = VEC_FMA(wj, LOAD(fi + indexj + 11 * VEC_SIZE + k), ab);
189 ac = VEC_FMA(wj, LOAD(fi + indexj + 12 * VEC_SIZE + k), ac);
190 ad = VEC_FMA(wj, LOAD(fi + indexj + 13 * VEC_SIZE + k), ad);
191 ae = VEC_FMA(wj, LOAD(fi + indexj + 14 * VEC_SIZE + k), ae);
192 af = VEC_FMA(wj, LOAD(fi + indexj + 15 * VEC_SIZE + k), af);
193 }
194 STORE(fo + (i << ldf) + k, a0);
195 STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
196 STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
197 STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
198 STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4);
199 STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5);
200 STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6);
201 STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7);
202 STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8);
203 STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9);
204 STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa);
205 STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab);
206 STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac);
207 STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad);
208 STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae);
209 STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af);
210 }
211 }
212#endif
213
214#if DEPTH >= 8
215 for (; i < (rimap_inv_max[l] - unroll8 + 1); i += unroll8) {
216 ptrdiff_t k;
217 for (k = 0; k < (1 << ldf); k += 8 * VEC_SIZE) {
218 register VEC_TYPE a0, a1, a2, a3;
219 register VEC_TYPE a4, a5, a6, a7;
220
221 a0 = a1 = a2 = a3 = VEC_ZERO;
222 a4 = a5 = a6 = a7 = VEC_ZERO;
223
224 for (j = 0; j < n; j++) {
225#ifdef VEC_SCAL_LD
226 register VEC_TYPE wj = VEC_SCAL_LD(w + j);
227#else
228 register VEC_TYPE wj = VEC_SCAL(w[j]);
229#endif
230 ptrdiff_t indexj = (index[j] + i) << ldf;
231 a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
232 a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
233 a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2);
234 a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3);
235 a4 = VEC_FMA(wj, LOAD(fi + indexj + 4 * VEC_SIZE + k), a4);
236 a5 = VEC_FMA(wj, LOAD(fi + indexj + 5 * VEC_SIZE + k), a5);
237 a6 = VEC_FMA(wj, LOAD(fi + indexj + 6 * VEC_SIZE + k), a6);
238 a7 = VEC_FMA(wj, LOAD(fi + indexj + 7 * VEC_SIZE + k), a7);
239 }
240 STORE(fo + (i << ldf) + k, a0);
241 STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
242 STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
243 STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
244 STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4);
245 STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5);
246 STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6);
247 STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7);
248 }
249 }
250#endif
251
252#if DEPTH >= 4
253 for (; i < (rimap_inv_max[l] - unroll4 + 1); i += unroll4) {
254 ptrdiff_t k;
255 for (k = 0; k < (1 << ldf); k += 4 * VEC_SIZE) {
256 register VEC_TYPE a0, a1, a2, a3;
257
258 a0 = a1 = a2 = a3 = VEC_ZERO;
259
260 for (j = 0; j < n; j++) {
261#ifdef VEC_SCAL_LD
262 register VEC_TYPE wj = VEC_SCAL_LD(w + j);
263#else
264 register VEC_TYPE wj = VEC_SCAL(w[j]);
265#endif
266 ptrdiff_t indexj = (index[j] + i) << ldf;
267 a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
268 a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
269 a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2);
270 a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3);
271 }
272 STORE(fo + (i << ldf) + k, a0);
273 STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
274 STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
275 STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
276 }
277 }
278#endif
279
280#if DEPTH >= 2
281 for (; i < (rimap_inv_max[l] - unroll2 + 1); i += unroll2) {
282 ptrdiff_t k;
283 for (k = 0; k < (1 << ldf); k += 2 * VEC_SIZE) {
284 register VEC_TYPE a0, a1;
285
286 a0 = a1 = VEC_ZERO;
287
288 for (j = 0; j < n; j++) {
289#ifdef VEC_SCAL_LD
290 register VEC_TYPE wj = VEC_SCAL_LD(w + j);
291#else
292 register VEC_TYPE wj = VEC_SCAL(w[j]);
293#endif
294 ptrdiff_t indexj = (index[j] + i) << ldf;
295 a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
296 a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
297 }
298 STORE(fo + (i << ldf) + k, a0);
299 STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
300 }
301 }
302#endif
303
304#if DEPTH >= 1
305 for (; i < (rimap_inv_max[l] - unroll1 + 1); i += unroll1) {
306 ptrdiff_t k;
307 for (k = 0; k < (1 << ldf); k += VEC_SIZE) {
308 register VEC_TYPE a0 = VEC_ZERO;
309 for (j = 0; j < n; j++) {
310#ifdef VEC_SCAL_LD
311 register VEC_TYPE wj = VEC_SCAL_LD(w + j);
312#else
313 register VEC_TYPE wj = VEC_SCAL(w[j]);
314#endif
315 ptrdiff_t indexj = (index[j] + i) << ldf;
316 a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
317 }
318 STORE(fo + (i << ldf) + k, a0);
319 }
320 }
321#endif
322
323#if VEC_SIZE > 1
324 for (; i < rimap_inv_max[l]; i++) {
325 ptrdiff_t k;
326 for (k = 0; k < (1 << ldf); k++) {
327 double a = 0.0;
328 for (j = 0; j < n; j++)
329 a += w[j] * fi[((index[j] + i) << ldf) + k];
330 fo[(i << ldf) + k] = a;
331 }
332 }
333#endif
334
335 } /* l */
336
337 // this fence instruction is needed to ensure correctness when using
338 // non-temporal stores
339#if defined(ALIGNED) && defined(FENCE)
340 FENCE;
341#endif
342}
343
344#undef LOAD
345#undef STORE
/* Doxygen cross-reference residue from the scraped documentation page:
   long int ptrdiff_t            — Definition: operate.c:15
   const ptrdiff_t nri           — Definition: operate_inc.c:10
   ptrdiff_t l                   — Definition: operate_inc.c:12
   ptrdiff_t i                   — Definition: operate_inc.c:12
   ptrdiff_t j                   — Definition: operate_inc.c:12
   const int *restrict index     — Definition: operate_inc.c:13 */