|
Octopus
|
Go to the documentation of this file.
/*
 * AVX-512 back end: the VEC_* macros below expand to 512-bit
 * double-precision (__m512d) intrinsics.
 * NOTE(review): the #if condition is cut off in this listing (its last
 * visible line ends in a line continuation), so the complete list of
 * feature macros being tested cannot be confirmed from here.
 */
29#if defined(__AVX512F__) || defined(__AVX512PF__) || \
30 defined(__AVX512BW__) || defined(__AVX512ER__) || \
31 defined(__AVX512CD__) || defined(__AVX512DQ__) || \
37#define VEC_TYPE __m512d
/* Loads: VEC_LD uses the aligned-load intrinsic, VEC_LDU the unaligned one. */
38#define VEC_LD(addr) _mm512_load_pd(addr)
39#define VEC_LDU(addr) _mm512_loadu_pd(addr)
/*
 * Stores: VEC_ST expands to the streaming-store intrinsic
 * (_mm512_stream_pd), not a plain aligned store; VEC_STU is the
 * unaligned store.
 */
40#define VEC_ST(addr, vec) _mm512_stream_pd(addr, vec)
41#define VEC_STU(addr, vec) _mm512_storeu_pd(addr, vec)
/* Multiply-add, forwarded as-is to _mm512_fmadd_pd(aa, bb, cc). */
42#define VEC_FMA(aa, bb, cc) _mm512_fmadd_pd(aa, bb, cc)
/* VEC_SCAL broadcasts one double to every lane; VEC_ZERO is the all-zero vector. */
43#define VEC_SCAL(aa) _mm512_set1_pd(aa)
44#define VEC_ZERO _mm512_setzero_pd()
/*
 * Memory fence. NOTE(review): presumably issued after the streaming
 * stores done through VEC_ST -- confirm against the callers.
 */
46#define FENCE _mm_mfence()
/*
 * AVX2 back end: the VEC_* macros below expand to 256-bit
 * double-precision (__m256d) intrinsics.
 */
54#elif defined(__AVX2__)
/* NOTE(review): the body guarded by this FMA check is not visible in this listing. */
58#if defined(__FMA4__) || defined(__FMA__)
62#define VEC_TYPE __m256d
/* Loads: VEC_LD uses the aligned-load intrinsic, VEC_LDU the unaligned one. */
63#define VEC_LD(addr) _mm256_load_pd(addr)
64#define VEC_LDU(addr) _mm256_loadu_pd(addr)
/*
 * Stores: VEC_ST expands to the streaming-store intrinsic
 * (_mm256_stream_pd), not a plain aligned store; VEC_STU is the
 * unaligned store.
 */
65#define VEC_ST(addr, vec) _mm256_stream_pd(addr, vec)
66#define VEC_STU(addr, vec) _mm256_storeu_pd(addr, vec)
/*
 * Three alternative VEC_FMA definitions: _mm256_macc_pd, _mm256_fmadd_pd,
 * and a fallback built from a separate multiply and add.
 * NOTE(review): the preprocessor directives that choose between them are
 * not visible in this listing; presumably they key on __FMA4__ and
 * __FMA__ -- confirm in the full file.
 */
68#define VEC_FMA(aa, bb, cc) _mm256_macc_pd(aa, bb, cc)
70#define VEC_FMA(aa, bb, cc) _mm256_fmadd_pd(aa, bb, cc)
72#define VEC_FMA(aa, bb, cc) _mm256_add_pd(cc, _mm256_mul_pd(aa, bb))
/* VEC_SCAL broadcasts one double to every lane; VEC_ZERO is the all-zero vector. */
74#define VEC_SCAL(aa) _mm256_set1_pd(aa)
75#define VEC_ZERO _mm256_setzero_pd()
/*
 * Memory fence. NOTE(review): presumably issued after the streaming
 * stores done through VEC_ST -- confirm against the callers.
 */
77#define FENCE _mm_mfence()
/*
 * 128-bit back end: the VEC_* macros below expand to double-precision
 * (__m128d) intrinsics.
 * NOTE(review): the directive that selects this branch, and the body
 * guarded by the FMA check below, are not visible in this listing.
 */
85#if defined(__FMA4__) || defined(__FMA__)
89#define VEC_TYPE __m128d
/* Loads: VEC_LD uses the aligned-load intrinsic, VEC_LDU the unaligned one. */
90#define VEC_LD(addr) _mm_load_pd(addr)
91#define VEC_LDU(addr) _mm_loadu_pd(addr)
/*
 * Stores: VEC_ST expands to the streaming-store intrinsic
 * (_mm_stream_pd), not a plain aligned store; VEC_STU is the
 * unaligned store.
 */
92#define VEC_ST(addr, vec) _mm_stream_pd(addr, vec)
93#define VEC_STU(addr, vec) _mm_storeu_pd(addr, vec)
/*
 * Three alternative VEC_FMA definitions: _mm_macc_pd, _mm_fmadd_pd, and
 * a fallback built from a separate multiply and add.
 * NOTE(review): the preprocessor directives that choose between them are
 * not visible in this listing; presumably they key on __FMA4__ and
 * __FMA__ -- confirm in the full file.
 */
95#define VEC_FMA(aa, bb, cc) _mm_macc_pd(aa, bb, cc)
97#define VEC_FMA(aa, bb, cc) _mm_fmadd_pd(aa, bb, cc)
99#define VEC_FMA(aa, bb, cc) _mm_add_pd(cc, _mm_mul_pd(aa, bb))
/* VEC_SCAL broadcasts one double to both lanes; VEC_ZERO is the all-zero vector. */
101#define VEC_SCAL(aa) _mm_set1_pd(aa)
102#define VEC_ZERO _mm_setzero_pd()
/*
 * Memory fence. NOTE(review): presumably issued after the streaming
 * stores done through VEC_ST -- confirm against the callers.
 */
103#define FENCE _mm_mfence()
/*
 * Back end built on the four-lane `vector4double` type and the vec_*
 * built-ins (vec_ld, vec_st, vec_lds, vec_madd, vec_extract).
 *
 *   VEC_LD / VEC_ST    load/store through vec_ld / vec_st at offset 0.
 *   VEC_LDU / VEC_STU  element-by-element load/store of four doubles.
 *   VEC_FMA            forwarded to vec_madd(aa, bb, cc).
 *   VEC_SCAL           vector with all four lanes set to `aa`.
 *   VEC_SCAL_LD        forwarded to vec_lds at offset 0.
 *   VEC_ZERO           vector with all four lanes set to 0.0.
 *
 * VEC_STU is wrapped in do { ... } while (0) so that it behaves as a
 * single statement: previously `if (c) VEC_STU(a, v);` guarded only the
 * first of the four stores. It is still used as `VEC_STU(addr, vec);`.
 */
#define VEC_TYPE vector4double
#define VEC_LD(addr) vec_ld(0, (double *)(addr))
#define VEC_LDU(addr) \
  ((vector4double){(addr)[0], (addr)[1], (addr)[2], (addr)[3]})
#define VEC_ST(addr, vec) vec_st(vec, 0, (double *)(addr))
#define VEC_STU(addr, vec) \
  do { \
    (addr)[0] = vec_extract(vec, 0); \
    (addr)[1] = vec_extract(vec, 1); \
    (addr)[2] = vec_extract(vec, 2); \
    (addr)[3] = vec_extract(vec, 3); \
  } while (0)
#define VEC_FMA(aa, bb, cc) vec_madd(aa, bb, cc)
#define VEC_SCAL(aa) ((vector4double){(aa), (aa), (aa), (aa)})
#define VEC_SCAL_LD(addr) vec_lds(0, (double *)(addr))
#define VEC_ZERO ((vector4double){0.0, 0.0, 0.0, 0.0})
/*
 * Back end that packs two doubles into a `double _Complex` and uses the
 * __lfpd / __stfpd / __fpmadd / __cmplx / __creal / __cimag built-ins.
 *
 *   VEC_LD / VEC_ST    load/store through __lfpd / __stfpd.
 *   VEC_LDU / VEC_STU  element-by-element load/store of two doubles.
 *   VEC_FMA            forwarded to __fpmadd(cc, aa, bb); note that the
 *                      addend is passed first.
 *   VEC_SCAL           pair with both halves set to `aa`.
 *   VEC_ZERO           pair with both halves set to 0.0.
 *
 * VEC_STU is wrapped in do { ... } while (0) so that it behaves as a
 * single statement: previously `if (c) VEC_STU(a, v);` guarded only the
 * first of the two stores. It is still used as `VEC_STU(addr, vec);`.
 */
#define VEC_TYPE double _Complex
#define VEC_LD(addr) __lfpd(addr)
#define VEC_LDU(addr) __cmplx((addr)[0], (addr)[1])
#define VEC_ST(addr, vec) __stfpd(addr, vec)
#define VEC_STU(addr, vec) \
  do { \
    (addr)[0] = __creal(vec); \
    (addr)[1] = __cimag(vec); \
  } while (0)
#define VEC_FMA(aa, bb, cc) __fpmadd(cc, aa, bb)
#define VEC_SCAL(aa) __cmplx(aa, aa)
#define VEC_ZERO __cmplx(0.0, 0.0)
/*
 * Scalar fallback: a "vector" is a single double, so every VEC_* macro
 * reduces to plain C.
 *
 *   VEC_LD / VEC_LDU   read (addr)[0].
 *   VEC_ST / VEC_STU   assign vec to (addr)[0].
 *   VEC_FMA            aa * bb + cc.
 *   VEC_SCAL           the scalar itself.
 *
 * Every argument and every expansion is parenthesized so that the macros
 * compose correctly with surrounding operators and with compound
 * arguments such as VEC_FMA(a + b, c, d) or 2.0 * VEC_SCAL(a + b).
 */
#define VEC_TYPE double
#define VEC_LD(addr) ((addr)[0])
#define VEC_LDU(addr) VEC_LD(addr)
#define VEC_ST(addr, vec) ((addr)[0] = (vec))
#define VEC_STU(addr, vec) VEC_ST(addr, vec)
#define VEC_FMA(aa, bb, cc) ((aa) * (bb) + (cc))
#define VEC_SCAL(aa) (aa)
/*
 * max1(x): yields x when x > 0, otherwise 1.
 * The argument is evaluated twice when it is positive, so do not pass an
 * expression with side effects.
 */
166#define max1(x) (((x) > 0) ? (x) : 1)