Octopus
operate_sym_inc.c
Go to the documentation of this file.
/*
 Copyright (C) 2006 X. Andrade
 Copyright (C) 2025 N. Tancogne-Dejean

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2, or (at your option)
 any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 02110-1301, USA.
*/
21
22#ifdef ALIGNED
23#define LOAD VEC_LD
24#define STORE VEC_ST
25#else
26#define LOAD VEC_LDU
27#define STORE VEC_STU
28#endif
29
30{
31 const ptrdiff_t npairs = opn[0];
32 const ptrdiff_t nri = opnri[0];
33#if DEPTH >= 16
34 const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf);
35#endif
36#if DEPTH >= 8
37 const ptrdiff_t unroll8 = max1(8 * VEC_SIZE >> ldf);
38#endif
39#if DEPTH >= 4
40 const ptrdiff_t unroll4 = max1(4 * VEC_SIZE >> ldf);
41#endif
42#if DEPTH >= 2
43 const ptrdiff_t unroll2 = max1(2 * VEC_SIZE >> ldf);
44#endif
45#if DEPTH >= 1
46 const ptrdiff_t unroll1 = max1(1 * VEC_SIZE >> ldf);
47#endif
48
49 ptrdiff_t l, i, j;
50 const int *restrict index_pos;
51 const int *restrict index_neg;
52
53 // Hoisting wcenter
54 const double w0 = wcenter[0];
55
56 for (l = 0; l < nri; l++) {
57 index_pos = opri_pos + npairs * l;
58 index_neg = opri_neg + npairs * l;
59 i = rimap_inv[l];
60
61#if DEPTH >= 16
62 for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) {
63 ptrdiff_t k;
64 for (k = 0; k < (1 << ldf); k += 16 * VEC_SIZE) {
65 register VEC_TYPE a0, a1, a2, a3;
66 register VEC_TYPE a4, a5, a6, a7;
67 register VEC_TYPE a8, a9, aa, ab;
68 register VEC_TYPE ac, ad, ae, af;
69
70 a0 = a1 = a2 = a3 = VEC_ZERO;
71 a4 = a5 = a6 = a7 = VEC_ZERO;
72 a8 = a9 = aa = ab = VEC_ZERO;
73 ac = ad = ae = af = VEC_ZERO;
74
75 const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k;
76
77 for (j = 0; j < npairs; j++) {
78 register VEC_TYPE wj = VEC_SCAL(wpair[j]);
79 const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
80 const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
81 register VEC_TYPE pos = LOAD(fi + indexj_pos);
82 register VEC_TYPE neg = LOAD(fi + indexj_neg);
83 a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0);
84 pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);
85 neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);
86 a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1);
87 pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE);
88 neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE);
89 a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2);
90 pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE);
91 neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE);
92 a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3);
93 pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE);
94 neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE);
95 a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4);
96 pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE);
97 neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE);
98 a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5);
99 pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE);
100 neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE);
101 a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6);
102 pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE);
103 neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE);
104 a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7);
105 pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE);
106 neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE);
107 a8 = VEC_FMA(wj, VEC_ADD(pos, neg), a8);
108 pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE);
109 neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE);
110 a9 = VEC_FMA(wj, VEC_ADD(pos, neg), a9);
111 pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE);
112 neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE);
113 aa = VEC_FMA(wj, VEC_ADD(pos, neg), aa);
114 pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE);
115 neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE);
116 ab = VEC_FMA(wj, VEC_ADD(pos, neg), ab);
117 pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE);
118 neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE);
119 ac = VEC_FMA(wj, VEC_ADD(pos, neg), ac);
120 pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE);
121 neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE);
122 ad = VEC_FMA(wj, VEC_ADD(pos, neg), ad);
123 pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE);
124 neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE);
125 ae = VEC_FMA(wj, VEC_ADD(pos, neg), ae);
126 pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE);
127 neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE);
128 af = VEC_FMA(wj, VEC_ADD(pos, neg), af);
129
130 }
131 const VEC_TYPE wj = VEC_SCAL(w0);
132 a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0);
133 a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1);
134 a2 = VEC_FMA(wj, LOAD(fi + base_ik + 2 * VEC_SIZE), a2);
135 a3 = VEC_FMA(wj, LOAD(fi + base_ik + 3 * VEC_SIZE), a3);
136 a4 = VEC_FMA(wj, LOAD(fi + base_ik + 4 * VEC_SIZE), a4);
137 a5 = VEC_FMA(wj, LOAD(fi + base_ik + 5 * VEC_SIZE), a5);
138 a6 = VEC_FMA(wj, LOAD(fi + base_ik + 6 * VEC_SIZE), a6);
139 a7 = VEC_FMA(wj, LOAD(fi + base_ik + 7 * VEC_SIZE), a7);
140 a8 = VEC_FMA(wj, LOAD(fi + base_ik + 8 * VEC_SIZE), a8);
141 a9 = VEC_FMA(wj, LOAD(fi + base_ik + 9 * VEC_SIZE), a9);
142 aa = VEC_FMA(wj, LOAD(fi + base_ik + 10 * VEC_SIZE), aa);
143 ab = VEC_FMA(wj, LOAD(fi + base_ik + 11 * VEC_SIZE), ab);
144 ac = VEC_FMA(wj, LOAD(fi + base_ik + 12 * VEC_SIZE), ac);
145 ad = VEC_FMA(wj, LOAD(fi + base_ik + 13 * VEC_SIZE), ad);
146 ae = VEC_FMA(wj, LOAD(fi + base_ik + 14 * VEC_SIZE), ae);
147 af = VEC_FMA(wj, LOAD(fi + base_ik + 15 * VEC_SIZE), af);
148
149 STORE(fo + base_ik, a0);
150 STORE(fo + base_ik + 1 * VEC_SIZE, a1);
151 STORE(fo + base_ik + 2 * VEC_SIZE, a2);
152 STORE(fo + base_ik + 3 * VEC_SIZE, a3);
153 STORE(fo + base_ik + 4 * VEC_SIZE, a4);
154 STORE(fo + base_ik + 5 * VEC_SIZE, a5);
155 STORE(fo + base_ik + 6 * VEC_SIZE, a6);
156 STORE(fo + base_ik + 7 * VEC_SIZE, a7);
157 STORE(fo + base_ik + 8 * VEC_SIZE, a8);
158 STORE(fo + base_ik + 9 * VEC_SIZE, a9);
159 STORE(fo + base_ik + 10 * VEC_SIZE, aa);
160 STORE(fo + base_ik + 11 * VEC_SIZE, ab);
161 STORE(fo + base_ik + 12 * VEC_SIZE, ac);
162 STORE(fo + base_ik + 13 * VEC_SIZE, ad);
163 STORE(fo + base_ik + 14 * VEC_SIZE, ae);
164 STORE(fo + base_ik + 15 * VEC_SIZE, af);
165 }
166 }
167#endif
168
169#if DEPTH >= 8
170 for (; i < (rimap_inv_max[l] - unroll8 + 1); i += unroll8) {
171 ptrdiff_t k;
172 for (k = 0; k < (1 << ldf); k += 8 * VEC_SIZE) {
173 register VEC_TYPE a0, a1, a2, a3;
174 register VEC_TYPE a4, a5, a6, a7;
175
176 a0 = a1 = a2 = a3 = VEC_ZERO;
177 a4 = a5 = a6 = a7 = VEC_ZERO;
178
179 const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k;
180
181 for (j = 0; j < npairs; j++) {
182 register VEC_TYPE wj = VEC_SCAL(wpair[j]);
183 const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
184 const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
185 register VEC_TYPE pos = LOAD(fi + indexj_pos);
186 register VEC_TYPE neg = LOAD(fi + indexj_neg);
187 a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0);
188 pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);
189 neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);
190 a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1);
191 pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE);
192 neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE);
193 a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2);
194 pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE);
195 neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE);
196 a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3);
197 pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE);
198 neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE);
199 a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4);
200 pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE);
201 neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE);
202 a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5);
203 pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE);
204 neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE);
205 a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6);
206 pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE);
207 neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE);
208 a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7);
209 }
210 const VEC_TYPE wj = VEC_SCAL(w0);
211 a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0);
212 a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1);
213 a2 = VEC_FMA(wj, LOAD(fi + base_ik + 2 * VEC_SIZE), a2);
214 a3 = VEC_FMA(wj, LOAD(fi + base_ik + 3 * VEC_SIZE), a3);
215 a4 = VEC_FMA(wj, LOAD(fi + base_ik + 4 * VEC_SIZE), a4);
216 a5 = VEC_FMA(wj, LOAD(fi + base_ik + 5 * VEC_SIZE), a5);
217 a6 = VEC_FMA(wj, LOAD(fi + base_ik + 6 * VEC_SIZE), a6);
218 a7 = VEC_FMA(wj, LOAD(fi + base_ik + 7 * VEC_SIZE), a7);
219 STORE(fo + base_ik, a0);
220 STORE(fo + base_ik + 1 * VEC_SIZE, a1);
221 STORE(fo + base_ik + 2 * VEC_SIZE, a2);
222 STORE(fo + base_ik + 3 * VEC_SIZE, a3);
223 STORE(fo + base_ik + 4 * VEC_SIZE, a4);
224 STORE(fo + base_ik + 5 * VEC_SIZE, a5);
225 STORE(fo + base_ik + 6 * VEC_SIZE, a6);
226 STORE(fo + base_ik + 7 * VEC_SIZE, a7);
227 }
228 }
229#endif
230
231#if DEPTH >= 4
232 for (; i < (rimap_inv_max[l] - unroll4 + 1); i += unroll4) {
233 ptrdiff_t k;
234 for (k = 0; k < (1 << ldf); k += 4 * VEC_SIZE) {
235 register VEC_TYPE a0, a1, a2, a3;
236
237 a0 = a1 = a2 = a3 = VEC_ZERO;
238
239 const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k;
240
241 for (j = 0; j < npairs; j++) {
242 register VEC_TYPE wj = VEC_SCAL(wpair[j]);
243 const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
244 const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
245 register VEC_TYPE pos = LOAD(fi + indexj_pos);
246 register VEC_TYPE neg = LOAD(fi + indexj_neg);
247 a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0);
248 pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);
249 neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);
250 a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1);
251 pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE);
252 neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE);
253 a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2);
254 pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE);
255 neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE);
256 a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3);
257 }
258 const VEC_TYPE wj = VEC_SCAL(w0);
259 a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0);
260 a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1);
261 a2 = VEC_FMA(wj, LOAD(fi + base_ik + 2 * VEC_SIZE), a2);
262 a3 = VEC_FMA(wj, LOAD(fi + base_ik + 3 * VEC_SIZE), a3);
263 STORE(fo + base_ik, a0);
264 STORE(fo + base_ik + 1 * VEC_SIZE, a1);
265 STORE(fo + base_ik + 2 * VEC_SIZE, a2);
266 STORE(fo + base_ik + 3 * VEC_SIZE, a3);
267 }
268 }
269#endif
270
271#if DEPTH >= 2
272 for (; i < (rimap_inv_max[l] - unroll2 + 1); i += unroll2) {
273 ptrdiff_t k;
274 for (k = 0; k < (1 << ldf); k += 2 * VEC_SIZE) {
275 register VEC_TYPE a0, a1;
276
277 a0 = a1 = VEC_ZERO;
278
279 const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k;
280
281 for (j = 0; j < npairs; j++) {
282 register VEC_TYPE wj = VEC_SCAL(wpair[j]);
283 const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
284 const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
285 register VEC_TYPE pos = LOAD(fi + indexj_pos);
286 register VEC_TYPE neg = LOAD(fi + indexj_neg);
287 a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0);
288 pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);
289 neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);
290 a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1);
291 }
292 const VEC_TYPE wj = VEC_SCAL(w0);
293 a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0);
294 a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1);
295 STORE(fo + base_ik, a0);
296 STORE(fo + base_ik + 1 * VEC_SIZE, a1);
297 }
298 }
299#endif
300
301#if DEPTH >= 1
302 for (; i < (rimap_inv_max[l] - unroll1 + 1); i += unroll1) {
303 ptrdiff_t k;
304 for (k = 0; k < (1 << ldf); k += VEC_SIZE) {
305 register VEC_TYPE a0 = VEC_ZERO;
306 const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k;
307 for (j = 0; j < npairs; j++) {
308 register VEC_TYPE wj = VEC_SCAL(wpair[j]);
309 const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
310 const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
311 register VEC_TYPE pos = LOAD(fi + indexj_pos);
312 register VEC_TYPE neg = LOAD(fi + indexj_neg);
313 a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0);
314 }
315 const VEC_TYPE wj = VEC_SCAL(w0);
316 a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0);
317 STORE(fo + base_ik, a0);
318 }
319 }
320#endif
321
322#if VEC_SIZE > 1
323
324 const ptrdiff_t size = (ptrdiff_t) 1 << ldf;
325 double a;
326
327 for (; i < rimap_inv_max[l]; i++) {
328 for (ptrdiff_t k = 0; k < size; k++) {
329 a = 0.0;
330 const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k;
331 for (j = 0; j < npairs; j++) {
332 const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
333 const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
334 const double sum = fi[indexj_pos] + fi[indexj_neg];
335 a += wpair[j] * sum;
336 }
337 fo[base_ik] = a + w0 * fi[base_ik];
338 }
339 }
340#endif
341
342 } /* l */
343
344 // this fence instruction is needed to ensure correctness when using
345 // non-temporal stores
346#if defined(ALIGNED) && defined(FENCE)
347 FENCE;
348#endif
349}
350
351#undef LOAD
352#undef STORE
long int ptrdiff_t
Definition: operate.c:15
const int *restrict index_neg
const ptrdiff_t nri
const double w0
ptrdiff_t l
ptrdiff_t i
ptrdiff_t j
const int *restrict index_pos