Octopus
vectors.h
Go to the documentation of this file.
1/*
2 Copyright (C) 2010 X. Andrade
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 02110-1301, USA.
18
19*/
20
21#include <config.h>
22
23#ifndef VECTORS_H
24#define VECTORS_H
25
26#ifdef __AVX__
27// Check for intel avx
28
29#if defined(__AVX512F__) || defined(__AVX512PF__) || \
30 defined(__AVX512BW__) || defined(__AVX512ER__) || \
31 defined(__AVX512CD__) || defined(__AVX512DQ__) || \
32 defined(__AVX512VL__)
// AVX-512 code path: 512-bit registers holding eight doubles each.

#include <immintrin.h>
#include <emmintrin.h>

/* Vector geometry. */
#define VEC_TYPE __m512d
#define VEC_SIZE 8

/* Memory access. VEC_LD and VEC_ST use the aligned intrinsics (VEC_ST is the
 * streaming store variant); VEC_LDU and VEC_STU accept unaligned addresses.
 */
#define VEC_LD(ptr) _mm512_load_pd(ptr)
#define VEC_LDU(ptr) _mm512_loadu_pd(ptr)
#define VEC_ST(ptr, val) _mm512_stream_pd(ptr, val)
#define VEC_STU(ptr, val) _mm512_storeu_pd(ptr, val)

/* Constants: broadcast a scalar to every lane, and the all-zero vector. */
#define VEC_SCAL(s) _mm512_set1_pd(s)
#define VEC_ZERO _mm512_setzero_pd()

/* Arithmetic. VEC_FMA(a, b, c) evaluates a * b + c lane by lane. */
#define VEC_ADD(a, b) _mm512_add_pd(a, b)
#define VEC_SUB(a, b) _mm512_sub_pd(a, b)
#define VEC_FMA(a, b, c) _mm512_fmadd_pd(a, b, c)

/* Memory fence, to be issued after the streaming stores done by VEC_ST. */
#define FENCE _mm_mfence()

/* Keep the depth at 16 to avoid register spilling: Skylake CPUs provide 32
 * AVX512 registers, and one more register is needed for the weights, so at
 * most 16 of them should be used for the points.
 */
#define DEPTH 16

/* Human-readable name of the selected code path. */
#define VECTORIZATION_LEVEL "AVX512"
56
57#elif defined(__AVX2__)
// AVX2 code path: 256-bit registers holding four doubles each.

#include <immintrin.h>
#include <emmintrin.h>
#if defined(__FMA__) || defined(__FMA4__)
#include <x86intrin.h>
#endif

/* Vector geometry. */
#define VEC_TYPE __m256d
#define VEC_SIZE 4

/* Memory access. VEC_LD and VEC_ST use the aligned intrinsics (VEC_ST is the
 * streaming store variant); VEC_LDU and VEC_STU accept unaligned addresses.
 */
#define VEC_LD(ptr) _mm256_load_pd(ptr)
#define VEC_LDU(ptr) _mm256_loadu_pd(ptr)
#define VEC_ST(ptr, val) _mm256_stream_pd(ptr, val)
#define VEC_STU(ptr, val) _mm256_storeu_pd(ptr, val)

/* Constants: broadcast a scalar to every lane, and the all-zero vector. */
#define VEC_SCAL(s) _mm256_set1_pd(s)
#define VEC_ZERO _mm256_setzero_pd()

/* Arithmetic. */
#define VEC_ADD(a, b) _mm256_add_pd(a, b)
#define VEC_SUB(a, b) _mm256_sub_pd(a, b)

/* VEC_FMA(a, b, c) evaluates a * b + c. The FMA4 intrinsic is preferred,
 * then the FMA one; without either, a separate multiply and add is used.
 */
#if defined(__FMA4__)
#define VEC_FMA(a, b, c) _mm256_macc_pd(a, b, c)
#elif defined(__FMA__)
#define VEC_FMA(a, b, c) _mm256_fmadd_pd(a, b, c)
#else
#define VEC_FMA(a, b, c) _mm256_add_pd(c, _mm256_mul_pd(a, b))
#endif

/* Memory fence, to be issued after the streaming stores done by VEC_ST. */
#define FENCE _mm_mfence()

/* Unrolling depth for this code path. */
#define DEPTH 16

/* Human-readable name of the selected code path. */
#define VECTORIZATION_LEVEL "AVX2"
86
87#else
// Default code path, reported as "AVX": two doubles per vector.
// NOTE(review): every intrinsic below is from the 128-bit _mm_* family
// operating on __m128d, not the 256-bit one -- confirm this is intentional.

#include <emmintrin.h>
#if defined(__FMA__) || defined(__FMA4__)
#include <x86intrin.h>
#endif

/* Vector geometry. */
#define VEC_TYPE __m128d
#define VEC_SIZE 2

/* Memory access. VEC_LD and VEC_ST use the aligned intrinsics (VEC_ST is the
 * streaming store variant); VEC_LDU and VEC_STU accept unaligned addresses.
 */
#define VEC_LD(ptr) _mm_load_pd(ptr)
#define VEC_LDU(ptr) _mm_loadu_pd(ptr)
#define VEC_ST(ptr, val) _mm_stream_pd(ptr, val)
#define VEC_STU(ptr, val) _mm_storeu_pd(ptr, val)

/* Constants: broadcast a scalar to every lane, and the all-zero vector. */
#define VEC_SCAL(s) _mm_set1_pd(s)
#define VEC_ZERO _mm_setzero_pd()

/* Arithmetic. */
#define VEC_ADD(a, b) _mm_add_pd(a, b)
#define VEC_SUB(a, b) _mm_sub_pd(a, b)

/* VEC_FMA(a, b, c) evaluates a * b + c. The FMA4 intrinsic is preferred,
 * then the FMA one; without either, a separate multiply and add is used.
 */
#if defined(__FMA4__)
#define VEC_FMA(a, b, c) _mm_macc_pd(a, b, c)
#elif defined(__FMA__)
#define VEC_FMA(a, b, c) _mm_fmadd_pd(a, b, c)
#else
#define VEC_FMA(a, b, c) _mm_add_pd(c, _mm_mul_pd(a, b))
#endif

/* Memory fence, to be issued after the streaming stores done by VEC_ST. */
#define FENCE _mm_mfence()

/* Unrolling depth for this code path. */
#define DEPTH 16

/* Human-readable name of the selected code path. */
#define VECTORIZATION_LEVEL "AVX"
115
116#endif
117
118#else
// Not explicitly optimized: plain scalar fallback, one double per "vector".
// NOTE(review): unlike the intrinsic-based branches, no FENCE macro is
// defined here -- confirm that every use of FENCE is guarded accordingly.

/* Vector geometry: a vector is a single double. */
#define VEC_SIZE 1
#define VEC_TYPE double

/* Memory access: there is no alignment distinction for scalars, so the
 * unaligned variants simply forward to the aligned ones. Arguments and whole
 * expansions are parenthesized so the macros can be embedded in any
 * expression without operator-precedence surprises.
 */
#define VEC_LD(addr) ((addr)[0])
#define VEC_LDU(addr) VEC_LD(addr)
#define VEC_ST(addr, vec) ((addr)[0] = (vec))
#define VEC_STU(addr, vec) VEC_ST(addr, vec)

/* Arithmetic. VEC_FMA(aa, bb, cc) evaluates aa * bb + cc. */
#define VEC_FMA(aa, bb, cc) ((aa) * (bb) + (cc))
#define VEC_ADD(aa, bb) ((aa) + (bb))
#define VEC_SUB(aa, bb) ((aa) - (bb))

/* Constants. VEC_SCAL is the identity; it is parenthesized so that
 * VEC_SCAL(x + y) * z multiplies the whole sum.
 */
#define VEC_SCAL(aa) (aa)
#define VEC_ZERO 0.0

/* Unrolling depth for this code path. */
#define DEPTH 8

/* Human-readable name of the selected code path. */
#define VECTORIZATION_LEVEL "default"
135#endif
136
/* max1(x): yields x when x is strictly positive and 1 otherwise.
 * The argument is evaluated twice, so it must be free of side effects.
 */
#define max1(x) ((0 < (x)) ? (x) : 1)
138
139#endif