Octopus — operate_inc.c (annotated source listing)
/*
 Copyright (C) 2006 X. Andrade

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2, or (at your option)
 any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 02110-1301, USA.
*/
20
21#ifdef ALIGNED
22#define LOAD VEC_LD
23#define STORE VEC_ST
24#else
25#define LOAD VEC_LDU
26#define STORE VEC_STU
27#endif
28
29{
30 const ptrdiff_t n = opn[0];
31 const ptrdiff_t nri = opnri[0];
32#if DEPTH >= 16
33 const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf);
34#endif
35#if DEPTH >= 8
36 const ptrdiff_t unroll8 = max1(8 * VEC_SIZE >> ldf);
37#endif
38#if DEPTH >= 4
39 const ptrdiff_t unroll4 = max1(4 * VEC_SIZE >> ldf);
40#endif
41#if DEPTH >= 2
42 const ptrdiff_t unroll2 = max1(2 * VEC_SIZE >> ldf);
43#endif
44#if DEPTH >= 1
45 const ptrdiff_t unroll1 = max1(1 * VEC_SIZE >> ldf);
46#endif
47
48 ptrdiff_t l, i, j;
49 const int *restrict index;
50
51 for (l = 0; l < nri; l++) {
52 index = opri + n * l;
53 i = rimap_inv[l];
54
55#if DEPTH >= 16
56 for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) {
57 ptrdiff_t k;
58 for (k = 0; k < (1 << ldf); k += 16 * VEC_SIZE) {
59 register VEC_TYPE a0, a1, a2, a3;
60 register VEC_TYPE a4, a5, a6, a7;
61 register VEC_TYPE a8, a9, aa, ab;
62 register VEC_TYPE ac, ad, ae, af;
63
64 a0 = a1 = a2 = a3 = VEC_ZERO;
65 a4 = a5 = a6 = a7 = VEC_ZERO;
66 a8 = a9 = aa = ab = VEC_ZERO;
67 ac = ad = ae = af = VEC_ZERO;
68
69 for (j = 0; j < n; j++) {
70 register VEC_TYPE wj = VEC_SCAL(w[j]);
71 ptrdiff_t indexj = (index[j] + i) << ldf;
72 a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
73 a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
74 a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2);
75 a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3);
76 a4 = VEC_FMA(wj, LOAD(fi + indexj + 4 * VEC_SIZE + k), a4);
77 a5 = VEC_FMA(wj, LOAD(fi + indexj + 5 * VEC_SIZE + k), a5);
78 a6 = VEC_FMA(wj, LOAD(fi + indexj + 6 * VEC_SIZE + k), a6);
79 a7 = VEC_FMA(wj, LOAD(fi + indexj + 7 * VEC_SIZE + k), a7);
80 a8 = VEC_FMA(wj, LOAD(fi + indexj + 8 * VEC_SIZE + k), a8);
81 a9 = VEC_FMA(wj, LOAD(fi + indexj + 9 * VEC_SIZE + k), a9);
82 aa = VEC_FMA(wj, LOAD(fi + indexj + 10 * VEC_SIZE + k), aa);
83 ab = VEC_FMA(wj, LOAD(fi + indexj + 11 * VEC_SIZE + k), ab);
84 ac = VEC_FMA(wj, LOAD(fi + indexj + 12 * VEC_SIZE + k), ac);
85 ad = VEC_FMA(wj, LOAD(fi + indexj + 13 * VEC_SIZE + k), ad);
86 ae = VEC_FMA(wj, LOAD(fi + indexj + 14 * VEC_SIZE + k), ae);
87 af = VEC_FMA(wj, LOAD(fi + indexj + 15 * VEC_SIZE + k), af);
88 }
89 STORE(fo + (i << ldf) + k, a0);
90 STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
91 STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
92 STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
93 STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4);
94 STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5);
95 STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6);
96 STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7);
97 STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8);
98 STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9);
99 STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa);
100 STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab);
101 STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac);
102 STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad);
103 STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae);
104 STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af);
105 }
106 }
107#endif
108
109#if DEPTH >= 8
110 for (; i < (rimap_inv_max[l] - unroll8 + 1); i += unroll8) {
111 ptrdiff_t k;
112 for (k = 0; k < (1 << ldf); k += 8 * VEC_SIZE) {
113 register VEC_TYPE a0, a1, a2, a3;
114 register VEC_TYPE a4, a5, a6, a7;
115
116 a0 = a1 = a2 = a3 = VEC_ZERO;
117 a4 = a5 = a6 = a7 = VEC_ZERO;
118
119 for (j = 0; j < n; j++) {
120 register VEC_TYPE wj = VEC_SCAL(w[j]);
121 ptrdiff_t indexj = (index[j] + i) << ldf;
122 a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
123 a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
124 a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2);
125 a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3);
126 a4 = VEC_FMA(wj, LOAD(fi + indexj + 4 * VEC_SIZE + k), a4);
127 a5 = VEC_FMA(wj, LOAD(fi + indexj + 5 * VEC_SIZE + k), a5);
128 a6 = VEC_FMA(wj, LOAD(fi + indexj + 6 * VEC_SIZE + k), a6);
129 a7 = VEC_FMA(wj, LOAD(fi + indexj + 7 * VEC_SIZE + k), a7);
130 }
131 STORE(fo + (i << ldf) + k, a0);
132 STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
133 STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
134 STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
135 STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4);
136 STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5);
137 STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6);
138 STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7);
139 }
140 }
141#endif
142
143#if DEPTH >= 4
144 for (; i < (rimap_inv_max[l] - unroll4 + 1); i += unroll4) {
145 ptrdiff_t k;
146 for (k = 0; k < (1 << ldf); k += 4 * VEC_SIZE) {
147 register VEC_TYPE a0, a1, a2, a3;
148
149 a0 = a1 = a2 = a3 = VEC_ZERO;
150
151 for (j = 0; j < n; j++) {
152 register VEC_TYPE wj = VEC_SCAL(w[j]);
153 ptrdiff_t indexj = (index[j] + i) << ldf;
154 a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
155 a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
156 a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2);
157 a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3);
158 }
159 STORE(fo + (i << ldf) + k, a0);
160 STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
161 STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
162 STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
163 }
164 }
165#endif
166
167#if DEPTH >= 2
168 for (; i < (rimap_inv_max[l] - unroll2 + 1); i += unroll2) {
169 ptrdiff_t k;
170 for (k = 0; k < (1 << ldf); k += 2 * VEC_SIZE) {
171 register VEC_TYPE a0, a1;
172
173 a0 = a1 = VEC_ZERO;
174
175 for (j = 0; j < n; j++) {
176 register VEC_TYPE wj = VEC_SCAL(w[j]);
177 ptrdiff_t indexj = (index[j] + i) << ldf;
178 a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
179 a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
180 }
181 STORE(fo + (i << ldf) + k, a0);
182 STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
183 }
184 }
185#endif
186
187#if DEPTH >= 1
188 for (; i < (rimap_inv_max[l] - unroll1 + 1); i += unroll1) {
189 ptrdiff_t k;
190 for (k = 0; k < (1 << ldf); k += VEC_SIZE) {
191 register VEC_TYPE a0 = VEC_ZERO;
192 for (j = 0; j < n; j++) {
193 register VEC_TYPE wj = VEC_SCAL(w[j]);
194 ptrdiff_t indexj = (index[j] + i) << ldf;
195 a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
196 }
197 STORE(fo + (i << ldf) + k, a0);
198 }
199 }
200#endif
201
202#if VEC_SIZE > 1
203 for (; i < rimap_inv_max[l]; i++) {
204 ptrdiff_t k;
205 for (k = 0; k < (1 << ldf); k++) {
206 double a = 0.0;
207 for (j = 0; j < n; j++)
208 a += w[j] * fi[((index[j] + i) << ldf) + k];
209 fo[(i << ldf) + k] = a;
210 }
211 }
212#endif
213
214 } /* l */
215
216 // this fence instruction is needed to ensure correctness when using
217 // non-temporal stores
218#if defined(ALIGNED) && defined(FENCE)
219 FENCE;
220#endif
221}
222
223#undef LOAD
224#undef STORE
/* Doxygen-generated cross-reference data (not part of the source):
   long int ptrdiff_t            -- defined at operate.c:15
   const ptrdiff_t nri           -- defined at operate_inc.c:10
   ptrdiff_t l                   -- defined at operate_inc.c:12
   ptrdiff_t i                   -- defined at operate_inc.c:12
   ptrdiff_t j                   -- defined at operate_inc.c:12
   const int *restrict index     -- defined at operate_inc.c:13
*/