Octopus
vectors.h
Go to the documentation of this file.
/*
 Copyright (C) 2010 X. Andrade

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2, or (at your option)
 any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 02110-1301, USA.

*/
20
21#include <config.h>
22
#ifndef VECTORS_H
#define VECTORS_H

/*
 Portable wrappers around double-precision SIMD primitives.

 Exactly one backend below is selected from the compiler's predefined
 macros. Every backend provides the same set of names:

   VEC_SIZE             number of doubles held by one VEC_TYPE
   VEC_TYPE             the vector type
   VEC_LD(addr)         load VEC_SIZE doubles starting at addr
   VEC_LDU(addr)        same as VEC_LD, for addresses that may be unaligned
   VEC_ST(addr, vec)    store vec to the VEC_SIZE doubles starting at addr
   VEC_STU(addr, vec)   same as VEC_ST, for addresses that may be unaligned
   VEC_FMA(aa, bb, cc)  aa * bb + cc, element by element
   VEC_SCAL(aa)         vector with every element equal to the scalar aa
   VEC_ZERO             vector with every element equal to 0.0
   DEPTH                integer constant (16 for SIMD backends, 8 otherwise)

 All of these are macros: arguments may be evaluated more than once, so
 do not pass expressions with side effects.

 NOTE(review): FENCE is defined only by the x86 backends and VEC_SCAL_LD
 only by the Blue Gene/Q backend, so code using either one has to be
 guarded accordingly -- confirm against the callers.
 NOTE(review): DEPTH is not used inside this header; presumably it is an
 unrolling/blocking depth for the kernels that include it -- confirm.
*/

#ifdef __AVX__
// x86 built with AVX enabled: pick the backend from the widest extension
// the compiler advertises.

#if defined(__AVX512F__) || defined(__AVX512PF__) || \
    defined(__AVX512BW__) || defined(__AVX512ER__) || \
    defined(__AVX512CD__) || defined(__AVX512DQ__) || \
    defined(__AVX512VL__)
// AVX-512: 512-bit vectors, 8 doubles each.

#include <immintrin.h>
#define VEC_SIZE 8
#define VEC_TYPE __m512d
#define VEC_LD(addr) _mm512_load_pd(addr)
#define VEC_LDU(addr) _mm512_loadu_pd(addr)
// VEC_ST uses the streaming (non-temporal) store intrinsic.
#define VEC_ST(addr, vec) _mm512_stream_pd(addr, vec)
#define VEC_STU(addr, vec) _mm512_storeu_pd(addr, vec)
#define VEC_FMA(aa, bb, cc) _mm512_fmadd_pd(aa, bb, cc)
#define VEC_SCAL(aa) _mm512_set1_pd(aa)
#define VEC_ZERO _mm512_setzero_pd()
#include <emmintrin.h>
// Full memory fence; presumably paired with the streaming VEC_ST.
#define FENCE _mm_mfence()

#define DEPTH 16

#elif defined(__AVX2__)
// AVX2: 256-bit vectors, 4 doubles each.

#include <immintrin.h>
#if defined(__FMA4__) || defined(__FMA__)
#include <x86intrin.h>
#endif
#define VEC_SIZE 4
#define VEC_TYPE __m256d
#define VEC_LD(addr) _mm256_load_pd(addr)
#define VEC_LDU(addr) _mm256_loadu_pd(addr)
// VEC_ST uses the streaming (non-temporal) store intrinsic.
#define VEC_ST(addr, vec) _mm256_stream_pd(addr, vec)
#define VEC_STU(addr, vec) _mm256_storeu_pd(addr, vec)
// Fused multiply-add when FMA4 or FMA3 is available (FMA4 is preferred
// when both are), otherwise a separate multiply followed by an add.
#ifdef __FMA4__
#define VEC_FMA(aa, bb, cc) _mm256_macc_pd(aa, bb, cc)
#elif defined(__FMA__)
#define VEC_FMA(aa, bb, cc) _mm256_fmadd_pd(aa, bb, cc)
#else
#define VEC_FMA(aa, bb, cc) _mm256_add_pd(cc, _mm256_mul_pd(aa, bb))
#endif
#define VEC_SCAL(aa) _mm256_set1_pd(aa)
#define VEC_ZERO _mm256_setzero_pd()
#include <emmintrin.h>
// Full memory fence; presumably paired with the streaming VEC_ST.
#define FENCE _mm_mfence()

#define DEPTH 16

#else
// AVX without AVX2 or AVX-512: this branch uses 128-bit vectors
// (2 doubles each) through the SSE2 intrinsics, not 256-bit AVX ones.

#include <emmintrin.h>
#if defined(__FMA4__) || defined(__FMA__)
#include <x86intrin.h>
#endif
#define VEC_SIZE 2
#define VEC_TYPE __m128d
#define VEC_LD(addr) _mm_load_pd(addr)
#define VEC_LDU(addr) _mm_loadu_pd(addr)
// VEC_ST uses the streaming (non-temporal) store intrinsic.
#define VEC_ST(addr, vec) _mm_stream_pd(addr, vec)
#define VEC_STU(addr, vec) _mm_storeu_pd(addr, vec)
// Fused multiply-add when FMA4 or FMA3 is available (FMA4 is preferred
// when both are), otherwise a separate multiply followed by an add.
#ifdef __FMA4__
#define VEC_FMA(aa, bb, cc) _mm_macc_pd(aa, bb, cc)
#elif defined(__FMA__)
#define VEC_FMA(aa, bb, cc) _mm_fmadd_pd(aa, bb, cc)
#else
#define VEC_FMA(aa, bb, cc) _mm_add_pd(cc, _mm_mul_pd(aa, bb))
#endif
#define VEC_SCAL(aa) _mm_set1_pd(aa)
#define VEC_ZERO _mm_setzero_pd()
// Full memory fence; presumably paired with the streaming VEC_ST.
#define FENCE _mm_mfence()

#define DEPTH 16
#endif

#elif defined(__bg__)
// IBM Blue Gene

#ifdef __bgq__
// Blue Gene/Q: vector4double, 4 doubles each.

#define VEC_SIZE 4
#define VEC_TYPE vector4double
#define VEC_LD(addr) vec_ld(0, (double *)(addr))
// The unaligned load is assembled element by element.
#define VEC_LDU(addr) \
  ((vector4double){(addr)[0], (addr)[1], (addr)[2], (addr)[3]})
#define VEC_ST(addr, vec) vec_st(vec, 0, (double *)(addr))
// The unaligned store writes element by element. The do/while (0)
// wrapper makes the expansion a single statement, so it stays correct
// under an unbraced if/else or loop; callers supply the trailing ';'.
#define VEC_STU(addr, vec)             \
  do {                                 \
    (addr)[0] = vec_extract(vec, 0);   \
    (addr)[1] = vec_extract(vec, 1);   \
    (addr)[2] = vec_extract(vec, 2);   \
    (addr)[3] = vec_extract(vec, 3);   \
  } while (0)
#define VEC_FMA(aa, bb, cc) vec_madd(aa, bb, cc)
#define VEC_SCAL(aa) ((vector4double){(aa), (aa), (aa), (aa)})
// Broadcast built from the double stored at addr (Blue Gene/Q only).
#define VEC_SCAL_LD(addr) vec_lds(0, (double *)(addr))
#define VEC_ZERO ((vector4double){0.0, 0.0, 0.0, 0.0})

#define DEPTH 16

#else
// Other Blue Gene systems: a double _Complex carries the 2 doubles.

#define VEC_SIZE 2
#define VEC_TYPE double _Complex
#define VEC_LD(addr) __lfpd(addr)
#define VEC_LDU(addr) __cmplx((addr)[0], (addr)[1])
#define VEC_ST(addr, vec) __stfpd(addr, vec)
// Element-wise store, wrapped in do/while (0) so that the expansion is
// a single statement; callers supply the trailing ';'.
#define VEC_STU(addr, vec)        \
  do {                            \
    (addr)[0] = __creal(vec);     \
    (addr)[1] = __cimag(vec);     \
  } while (0)
// Note the argument order: the addend goes first for __fpmadd.
#define VEC_FMA(aa, bb, cc) __fpmadd(cc, aa, bb)
#define VEC_SCAL(aa) __cmplx(aa, aa)
#define VEC_ZERO __cmplx(0.0, 0.0)

#define DEPTH 16
#endif

#else
// No explicit vectorisation: a "vector" is one plain double.
// Arguments and whole expansions are parenthesised so that compound
// arguments (a + b) and surrounding operators (2.0 * VEC_FMA(...)) keep
// their intended precedence.

#define VEC_SIZE 1
#define VEC_TYPE double
#define VEC_LD(addr) ((addr)[0])
#define VEC_LDU(addr) VEC_LD(addr)
#define VEC_ST(addr, vec) ((addr)[0] = (vec))
#define VEC_STU(addr, vec) VEC_ST(addr, vec)
#define VEC_FMA(aa, bb, cc) ((aa) * (bb) + (cc))
#define VEC_SCAL(aa) (aa)
#define VEC_ZERO 0.0

#define DEPTH 8
#endif

// max1(x): yields x when x > 0 and 1 otherwise. The argument is
// evaluated twice when it is positive, so it must be free of side effects.
#define max1(x) (((x) > 0) ? (x) : 1)

#endif