Octopus
16.0
real-space, real-time, TDDFT code
vectors.h
Go to the documentation of this file.
1
/*
2
Copyright (C) 2010 X. Andrade
3
4
This program is free software; you can redistribute it and/or modify
5
it under the terms of the GNU General Public License as published by
6
the Free Software Foundation; either version 2, or (at your option)
7
any later version.
8
9
This program is distributed in the hope that it will be useful,
10
but WITHOUT ANY WARRANTY; without even the implied warranty of
11
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
GNU General Public License for more details.
13
14
You should have received a copy of the GNU General Public License
15
along with this program; if not, write to the Free Software
16
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17
02110-1301, USA.
18
19
*/
20
21
#include <
config.h
>
22
23
#ifndef VECTORS_H
24
#define VECTORS_H
25
26
#ifdef __AVX__
27
// Check for intel avx
28
29
#if defined(__AVX512F__) || defined(__AVX512PF__) || \
30
defined(__AVX512BW__) || defined(__AVX512ER__) || \
31
defined(__AVX512CD__) || defined(__AVX512DQ__) || \
32
defined(__AVX512VL__)
33
// Use AVX512
34
35
#include <immintrin.h>
36
#define VEC_SIZE 8
37
#define VEC_TYPE __m512d
38
#define VEC_LD(addr) _mm512_load_pd(addr)
39
#define VEC_LDU(addr) _mm512_loadu_pd(addr)
40
// Store a vector through the streaming store intrinsic (VEC_STU is the
// unaligned-store counterpart).
// NOTE(review): Intel documents _mm512_stream_pd as a non-temporal store that
// needs a 64-byte aligned address -- confirm callers only pass aligned pointers.
#define VEC_ST(addr, vec) _mm512_stream_pd(addr, vec)
41
#define VEC_STU(addr, vec) _mm512_storeu_pd(addr, vec)
42
#define VEC_FMA(aa, bb, cc) _mm512_fmadd_pd(aa, bb, cc)
43
#define VEC_SCAL(aa) _mm512_set1_pd(aa)
44
#define VEC_ZERO _mm512_setzero_pd()
45
#include <emmintrin.h>
46
#define FENCE _mm_mfence()
47
48
/* Use a depth of 16 to avoid register spilling. On skylake CPUs, there are 32
49
* AVX512 registers, so we should only use a maximum of 16 for the points because
50
* we need an additional one for the weights.
51
*/
52
#define DEPTH 16
53
54
#elif defined(__AVX2__)
55
// Use AVX2
56
57
#include <immintrin.h>
58
#if defined(__FMA4__) || defined(__FMA__)
59
#include <x86intrin.h>
60
#endif
61
#define VEC_SIZE 4
62
#define VEC_TYPE __m256d
63
#define VEC_LD(addr) _mm256_load_pd(addr)
64
#define VEC_LDU(addr) _mm256_loadu_pd(addr)
65
// Store a vector through the streaming store intrinsic (VEC_STU is the
// unaligned-store counterpart).
// NOTE(review): Intel documents _mm256_stream_pd as a non-temporal store that
// needs a 32-byte aligned address -- confirm callers only pass aligned pointers.
#define VEC_ST(addr, vec) _mm256_stream_pd(addr, vec)
66
#define VEC_STU(addr, vec) _mm256_storeu_pd(addr, vec)
67
#ifdef __FMA4__
68
#define VEC_FMA(aa, bb, cc) _mm256_macc_pd(aa, bb, cc)
69
#elif defined(__FMA__)
70
#define VEC_FMA(aa, bb, cc) _mm256_fmadd_pd(aa, bb, cc)
71
#else
72
#define VEC_FMA(aa, bb, cc) _mm256_add_pd(cc, _mm256_mul_pd(aa, bb))
73
#endif
74
#define VEC_SCAL(aa) _mm256_set1_pd(aa)
75
#define VEC_ZERO _mm256_setzero_pd()
76
#include <emmintrin.h>
77
#define FENCE _mm_mfence()
78
79
#define DEPTH 16
80
81
#else
82
// AVX without AVX2: fall back to 128-bit (__m128d) vectors
83
84
#include <emmintrin.h>
85
#if defined(__FMA4__) || defined(__FMA__)
86
#include <x86intrin.h>
87
#endif
88
#define VEC_SIZE 2
89
#define VEC_TYPE __m128d
90
#define VEC_LD(addr) _mm_load_pd(addr)
91
#define VEC_LDU(addr) _mm_loadu_pd(addr)
92
// Store a vector through the streaming store intrinsic (VEC_STU is the
// unaligned-store counterpart).
// NOTE(review): Intel documents _mm_stream_pd as a non-temporal store that
// needs a 16-byte aligned address -- confirm callers only pass aligned pointers.
#define VEC_ST(addr, vec) _mm_stream_pd(addr, vec)
93
#define VEC_STU(addr, vec) _mm_storeu_pd(addr, vec)
94
#ifdef __FMA4__
95
#define VEC_FMA(aa, bb, cc) _mm_macc_pd(aa, bb, cc)
96
#elif defined(__FMA__)
97
#define VEC_FMA(aa, bb, cc) _mm_fmadd_pd(aa, bb, cc)
98
#else
99
#define VEC_FMA(aa, bb, cc) _mm_add_pd(cc, _mm_mul_pd(aa, bb))
100
#endif
101
#define VEC_SCAL(aa) _mm_set1_pd(aa)
102
#define VEC_ZERO _mm_setzero_pd()
103
#define FENCE _mm_mfence()
104
105
#define DEPTH 16
106
#endif
107
108
#elif defined(__bg__)
109
// Check for ibm blue_gene
110
111
#ifdef __bgq__
112
// Check for blue_gene_q
113
114
#define VEC_SIZE 4
115
#define VEC_TYPE vector4double
116
#define VEC_LD(addr) vec_ld(0, (double *)(addr))
117
#define VEC_LDU(addr) \
118
((vector4double){(addr)[0], (addr)[1], (addr)[2], (addr)[3]})
119
#define VEC_ST(addr, vec) vec_st(vec, 0, (double *)(addr))
120
/* Element-wise store of the four lanes of vec into addr[0..3].
 * Wrapped in do { } while (0) so the macro expands to exactly one statement
 * and stays correct under an unbraced if/else or loop.
 * Note: vec is expanded once per lane, so pass an expression without side
 * effects. */
#define VEC_STU(addr, vec)           \
  do {                               \
    (addr)[0] = vec_extract(vec, 0); \
    (addr)[1] = vec_extract(vec, 1); \
    (addr)[2] = vec_extract(vec, 2); \
    (addr)[3] = vec_extract(vec, 3); \
  } while (0)
125
#define VEC_FMA(aa, bb, cc) vec_madd(aa, bb, cc)
126
#define VEC_SCAL(aa) ((vector4double){aa, aa, aa, aa})
127
#define VEC_SCAL_LD(addr) vec_lds(0, (double *)(addr))
128
#define VEC_ZERO ((vector4double){0.0, 0.0, 0.0, 0.0})
129
130
#define DEPTH 16
131
132
#else
133
// Otherwise use default blue_gene
134
135
#define VEC_SIZE 2
136
#define VEC_TYPE double _Complex
137
#define VEC_LD(addr) __lfpd(addr)
138
#define VEC_LDU(addr) __cmplx((addr)[0], (addr)[1])
139
#define VEC_ST(addr, vec) __stfpd(addr, vec)
140
/* Element-wise store of the two halves of vec into addr[0..1].
 * Wrapped in do { } while (0) so the macro expands to exactly one statement
 * and stays correct under an unbraced if/else or loop.
 * Note: vec is expanded twice, so pass an expression without side effects. */
#define VEC_STU(addr, vec)    \
  do {                        \
    (addr)[0] = __creal(vec); \
    (addr)[1] = __cimag(vec); \
  } while (0)
143
#define VEC_FMA(aa, bb, cc) __fpmadd(cc, aa, bb)
144
#define VEC_SCAL(aa) __cmplx(aa, aa)
145
#define VEC_ZERO __cmplx(0.0, 0.0)
146
147
#define DEPTH 16
148
#endif
149
150
#else
151
// Not explicitly optimized
152
153
#define VEC_SIZE 1
154
#define VEC_TYPE double
155
// Scalar fallback: a "vector" is a single double, so a load reads *addr.
#define VEC_LD(addr) (*(addr))
156
#define VEC_LDU(addr) VEC_LD(addr)
157
// Scalar fallback store: writes vec to addr[0]. The argument and the whole
// expansion are parenthesized so the assignment cannot absorb operators that
// surround the macro call.
#define VEC_ST(addr, vec) ((addr)[0] = (vec))
158
#define VEC_STU(addr, vec) VEC_ST(addr, vec)
159
// Scalar fallback multiply-add: aa * bb + cc. Every argument and the whole
// expansion are parenthesized so compound arguments (e.g. x + y) and
// surrounding operators keep the intended precedence.
#define VEC_FMA(aa, bb, cc) ((aa) * (bb) + (cc))
160
// Scalar fallback broadcast: the "vector" is the scalar itself. Parenthesized
// so a compound argument is not re-associated with operators around the call.
#define VEC_SCAL(aa) (aa)
161
#define VEC_ZERO 0.0
162
163
#define DEPTH 8
164
#endif
165
166
// Yields x when x is strictly positive, otherwise 1.
// Note: x is expanded twice, so avoid arguments with side effects.
#define max1(x) ((x) > 0 ? (x) : 1)
167
168
#endif
config.h
src
include
vectors.h
Generated by
1.9.4