operate_antisym_inc.c
/*
 Copyright (C) 2006 X. Andrade
 Copyright (C) 2025 N. Tancogne-Dejean

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2, or (at your option)
 any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 02110-1301, USA.

*/

/* Select aligned or unaligned vector load/store primitives, depending on
   how the including code laid out the data. */
#ifdef ALIGNED
#define LOAD VEC_LD
#define STORE VEC_ST
#else
#define LOAD VEC_LDU
#define STORE VEC_STU
#endif
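
/* For orientation only: a hypothetical set of definitions for the vector
   primitives this file expects from the including code, sketched with
   AVX2/FMA double-precision intrinsics and kept out of the build with
   #if 0. The actual Octopus definitions may differ. */
#if 0
#include <immintrin.h>
#define VEC_SIZE 4                                 /* doubles per 256-bit register */
#define VEC_TYPE __m256d
#define VEC_LD(addr) _mm256_load_pd(addr)          /* aligned load */
#define VEC_LDU(addr) _mm256_loadu_pd(addr)        /* unaligned load */
#define VEC_ST(addr, x) _mm256_stream_pd(addr, x)  /* aligned non-temporal store */
#define VEC_STU(addr, x) _mm256_storeu_pd(addr, x) /* unaligned store */
#define VEC_SCAL(x) _mm256_set1_pd(x)              /* broadcast a scalar */
#define VEC_ZERO _mm256_setzero_pd()
#define VEC_SUB(a, b) _mm256_sub_pd(a, b)
#define VEC_FMA(a, b, c) _mm256_fmadd_pd(a, b, c)  /* a*b + c */
#define FENCE _mm_sfence()                         /* orders streaming stores */
#endif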

{
  const ptrdiff_t npairs = opn[0]; /* number of (+,-) stencil point pairs */
  const ptrdiff_t nri = opnri[0];  /* number of index runs */
  /* Grid points advanced per iteration at each unroll depth:
     DEPTH * VEC_SIZE vector lanes divided by the 2^ldf functions stored
     per point, clamped to at least one by max1. */
#if DEPTH >= 16
  const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf);
#endif
#if DEPTH >= 8
  const ptrdiff_t unroll8 = max1(8 * VEC_SIZE >> ldf);
#endif
#if DEPTH >= 4
  const ptrdiff_t unroll4 = max1(4 * VEC_SIZE >> ldf);
#endif
#if DEPTH >= 2
  const ptrdiff_t unroll2 = max1(2 * VEC_SIZE >> ldf);
#endif
#if DEPTH >= 1
  const ptrdiff_t unroll1 = max1(1 * VEC_SIZE >> ldf);
#endif
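
  /* Worked example (values assumed for illustration): with VEC_SIZE = 4
     doubles per register and ldf = 3, i.e. eight functions interleaved
     per grid point, unroll16 = max1((16 * 4) >> 3) = 8, so the 16-deep
     loop below advances eight grid points per iteration. */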

  ptrdiff_t l, i, j;
  const int *restrict index_pos;
  const int *restrict index_neg;

  /* Each run l covers consecutive points that share the same stencil
     offsets; rimap_inv[l] is the first point of the run. */
  for (l = 0; l < nri; l++) {
    index_pos = opri_pos + npairs * l;
    index_neg = opri_neg + npairs * l;
    i = rimap_inv[l];

#if DEPTH >= 16
    /* 16-way unrolled loop: each iteration accumulates sixteen vector
       registers' worth of output. */
    for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) {
      ptrdiff_t k;
      for (k = 0; k < (1 << ldf); k += 16 * VEC_SIZE) {
        register VEC_TYPE a0, a1, a2, a3;
        register VEC_TYPE a4, a5, a6, a7;
        register VEC_TYPE a8, a9, aa, ab;
        register VEC_TYPE ac, ad, ae, af;

        a0 = a1 = a2 = a3 = VEC_ZERO;
        a4 = a5 = a6 = a7 = VEC_ZERO;
        a8 = a9 = aa = ab = VEC_ZERO;
        ac = ad = ae = af = VEC_ZERO;

        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k;

        for (j = 0; j < npairs; j++) {
          register VEC_TYPE wj = VEC_SCAL(wpair[j]);
          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
          register VEC_TYPE pos = LOAD(fi + indexj_pos);
          register VEC_TYPE neg = LOAD(fi + indexj_neg);
          a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0);
          pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);
          a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1);
          pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE);
          a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2);
          pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE);
          a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3);
          pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE);
          a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4);
          pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE);
          a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5);
          pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE);
          a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6);
          pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE);
          a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7);
          pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE);
          a8 = VEC_FMA(wj, VEC_SUB(pos, neg), a8);
          pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE);
          a9 = VEC_FMA(wj, VEC_SUB(pos, neg), a9);
          pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE);
          aa = VEC_FMA(wj, VEC_SUB(pos, neg), aa);
          pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE);
          ab = VEC_FMA(wj, VEC_SUB(pos, neg), ab);
          pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE);
          ac = VEC_FMA(wj, VEC_SUB(pos, neg), ac);
          pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE);
          ad = VEC_FMA(wj, VEC_SUB(pos, neg), ad);
          pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE);
          ae = VEC_FMA(wj, VEC_SUB(pos, neg), ae);
          pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE);
          af = VEC_FMA(wj, VEC_SUB(pos, neg), af);
        }
        STORE(fo + base_ik, a0);
        STORE(fo + base_ik + 1 * VEC_SIZE, a1);
        STORE(fo + base_ik + 2 * VEC_SIZE, a2);
        STORE(fo + base_ik + 3 * VEC_SIZE, a3);
        STORE(fo + base_ik + 4 * VEC_SIZE, a4);
        STORE(fo + base_ik + 5 * VEC_SIZE, a5);
        STORE(fo + base_ik + 6 * VEC_SIZE, a6);
        STORE(fo + base_ik + 7 * VEC_SIZE, a7);
        STORE(fo + base_ik + 8 * VEC_SIZE, a8);
        STORE(fo + base_ik + 9 * VEC_SIZE, a9);
        STORE(fo + base_ik + 10 * VEC_SIZE, aa);
        STORE(fo + base_ik + 11 * VEC_SIZE, ab);
        STORE(fo + base_ik + 12 * VEC_SIZE, ac);
        STORE(fo + base_ik + 13 * VEC_SIZE, ad);
        STORE(fo + base_ik + 14 * VEC_SIZE, ae);
        STORE(fo + base_ik + 15 * VEC_SIZE, af);
      }
    }
#endif

#if DEPTH >= 8
    for (; i < (rimap_inv_max[l] - unroll8 + 1); i += unroll8) {
      ptrdiff_t k;
      for (k = 0; k < (1 << ldf); k += 8 * VEC_SIZE) {
        register VEC_TYPE a0, a1, a2, a3;
        register VEC_TYPE a4, a5, a6, a7;

        a0 = a1 = a2 = a3 = VEC_ZERO;
        a4 = a5 = a6 = a7 = VEC_ZERO;

        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k;

        for (j = 0; j < npairs; j++) {
          register VEC_TYPE wj = VEC_SCAL(wpair[j]);
          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
          register VEC_TYPE pos = LOAD(fi + indexj_pos);
          register VEC_TYPE neg = LOAD(fi + indexj_neg);
          a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0);
          pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);
          a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1);
          pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE);
          a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2);
          pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE);
          a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3);
          pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE);
          a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4);
          pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE);
          a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5);
          pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE);
          a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6);
          pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE);
          a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7);
        }
        STORE(fo + base_ik, a0);
        STORE(fo + base_ik + 1 * VEC_SIZE, a1);
        STORE(fo + base_ik + 2 * VEC_SIZE, a2);
        STORE(fo + base_ik + 3 * VEC_SIZE, a3);
        STORE(fo + base_ik + 4 * VEC_SIZE, a4);
        STORE(fo + base_ik + 5 * VEC_SIZE, a5);
        STORE(fo + base_ik + 6 * VEC_SIZE, a6);
        STORE(fo + base_ik + 7 * VEC_SIZE, a7);
      }
    }
#endif

#if DEPTH >= 4
    for (; i < (rimap_inv_max[l] - unroll4 + 1); i += unroll4) {
      ptrdiff_t k;
      for (k = 0; k < (1 << ldf); k += 4 * VEC_SIZE) {
        register VEC_TYPE a0, a1, a2, a3;

        a0 = a1 = a2 = a3 = VEC_ZERO;

        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k;

        for (j = 0; j < npairs; j++) {
          register VEC_TYPE wj = VEC_SCAL(wpair[j]);
          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
          register VEC_TYPE pos = LOAD(fi + indexj_pos);
          register VEC_TYPE neg = LOAD(fi + indexj_neg);
          a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0);
          pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);
          a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1);
          pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE);
          a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2);
          pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE);
          a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3);
        }
        STORE(fo + base_ik, a0);
        STORE(fo + base_ik + 1 * VEC_SIZE, a1);
        STORE(fo + base_ik + 2 * VEC_SIZE, a2);
        STORE(fo + base_ik + 3 * VEC_SIZE, a3);
      }
    }
#endif

#if DEPTH >= 2
    for (; i < (rimap_inv_max[l] - unroll2 + 1); i += unroll2) {
      ptrdiff_t k;
      for (k = 0; k < (1 << ldf); k += 2 * VEC_SIZE) {
        register VEC_TYPE a0, a1;

        a0 = a1 = VEC_ZERO;

        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k;

        for (j = 0; j < npairs; j++) {
          register VEC_TYPE wj = VEC_SCAL(wpair[j]);
          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
          register VEC_TYPE pos = LOAD(fi + indexj_pos);
          register VEC_TYPE neg = LOAD(fi + indexj_neg);
          a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0);
          pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);
          neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);
          a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1);
        }
        STORE(fo + base_ik, a0);
        STORE(fo + base_ik + 1 * VEC_SIZE, a1);
      }
    }
#endif

#if DEPTH >= 1
    for (; i < (rimap_inv_max[l] - unroll1 + 1); i += unroll1) {
      ptrdiff_t k;
      for (k = 0; k < (1 << ldf); k += VEC_SIZE) {
        register VEC_TYPE a0 = VEC_ZERO;
        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k;
        for (j = 0; j < npairs; j++) {
          register VEC_TYPE wj = VEC_SCAL(wpair[j]);
          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
          register VEC_TYPE pos = LOAD(fi + indexj_pos);
          register VEC_TYPE neg = LOAD(fi + indexj_neg);
          a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0);
        }
        STORE(fo + base_ik, a0);
      }
    }
#endif

#if VEC_SIZE > 1
    /* Scalar epilogue: handle the leftover points that the vector loops
       above did not cover, one element at a time. */
    const ptrdiff_t size = (ptrdiff_t) 1 << ldf;
    double a;

    for (; i < rimap_inv_max[l]; i++) {
      for (ptrdiff_t k = 0; k < size; k++) {
        a = 0.0;
        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k;
        for (j = 0; j < npairs; j++) {
          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
          const double diff = fi[indexj_pos] - fi[indexj_neg];
          a += wpair[j] * diff;
        }
        fo[base_ik] = a;
      }
    }
#endif

  } /* l */

  // this fence instruction is needed to ensure correctness when using
  // non-temporal stores
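  // (streaming stores are weakly ordered and bypass the cache, so a
  // store fence is required before the results in fo are read back;
  // on x86, FENCE would plausibly expand to _mm_sfence(), though the
  // actual definition is provided elsewhere)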
#if defined(ALIGNED) && defined(FENCE)
  FENCE;
#endif
}

#undef LOAD
#undef STORE
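
/* For reference, a minimal scalar sketch of what this include file computes
   (the function name and signature are illustrative assumptions, and the
   block is kept out of the build with #if 0): for each output element, the
   antisymmetric stencil fo = sum_j wpair[j] * (fi[pos_j] - fi[neg_j]). */
#if 0
static void operate_antisym_ref(ptrdiff_t npairs, const double *restrict wpair,
                                const int *restrict index_pos,
                                const int *restrict index_neg, ptrdiff_t np,
                                const double *restrict fi,
                                double *restrict fo) {
  for (ptrdiff_t ip = 0; ip < np; ip++) {
    double a = 0.0;
    for (ptrdiff_t j = 0; j < npairs; j++)
      a += wpair[j] * (fi[index_pos[j] + ip] - fi[index_neg[j] + ip]);
    fo[ip] = a;
  }
}
#endif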