sharp_vecsupport.h
/*
 *  This file is part of libsharp.
 *
 *  libsharp is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  libsharp is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with libsharp; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 *  libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
 *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
 *  (DLR).
 */

/*! \file sharp_vecsupport.h
 *  Convenience functions for vector arithmetics
 *
 *  Copyright (C) 2012-2016 Max-Planck-Society
 *  Author: Martin Reinecke
 */

#ifndef SHARP_VECSUPPORT_H
#define SHARP_VECSUPPORT_H

#include <math.h>
#include "sharp_vecutil.h"

typedef double Ts;

#if (VLEN==1)

typedef double Tv;
typedef float Tv_s;
typedef int Tm;

#define vadd(a,b) ((a)+(b))
#define vadd_s(a,b) ((a)+(b))
#define vaddeq(a,b) ((a)+=(b))
#define vaddeq_mask(mask,a,b) if (mask) (a)+=(b);
#define vsub(a,b) ((a)-(b))
#define vsub_s(a,b) ((a)-(b))
#define vsubeq(a,b) ((a)-=(b))
#define vsubeq_mask(mask,a,b) if (mask) (a)-=(b);
#define vmul(a,b) ((a)*(b))
#define vmul_s(a,b) ((a)*(b))
#define vmuleq(a,b) ((a)*=(b))
#define vmuleq_mask(mask,a,b) if (mask) (a)*=(b);
#define vfmaeq(a,b,c) ((a)+=(b)*(c))
#define vfmaeq_s(a,b,c) ((a)+=(b)*(c))
#define vfmseq(a,b,c) ((a)-=(b)*(c))
#define vfmaaeq(a,b,c,d,e) ((a)+=(b)*(c)+(d)*(e))
#define vfmaseq(a,b,c,d,e) ((a)+=(b)*(c)-(d)*(e))
#define vneg(a) (-(a))
#define vload(a) (a)
#define vload_s(a) (a)
#define vloadu(p) (*(p))
#define vloadu_s(p) (*(p))
#define vabs(a) fabs(a)
#define vsqrt(a) sqrt(a)
#define vlt(a,b) ((a)<(b))
#define vgt(a,b) ((a)>(b))
#define vge(a,b) ((a)>=(b))
#define vne(a,b) ((a)!=(b))
#define vand_mask(a,b) ((a)&&(b))
#define vor_mask(a,b) ((a)||(b))
#define vstoreu(p, a) (*(p)=a)
#define vstoreu_s(p, a) (*(p)=a)

static inline Tv vmin (Tv a, Tv b) { return (a<b) ? a : b; }
static inline Tv vmax (Tv a, Tv b) { return (a>b) ? a : b; }

#define vanyTrue(a) (a)
#define vallTrue(a) (a)
#define vzero 0.
#define vone 1.

#endif
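
/* Illustration (not part of the original header, disabled via "#if 0",
   function name hypothetical): the scalar fallback above defines the same
   macro API that the SSE2/AVX/AVX-512 sections below implement with
   intrinsics, so client code such as this conditional accumulation compiles
   unchanged for every VLEN. */
#if 0
static void example_masked_accumulate (Tv *acc, Tv val, Tv thresh)
  {
  Tm m = vgt(val,thresh);      /* per-lane comparison yields a mask */
  if (vanyTrue(m))             /* cheap early-out if no lane qualifies */
    vaddeq_mask(m,*acc,val);   /* add only in lanes where the mask is set */
  }
#endif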

#if (VLEN==2)

#include <emmintrin.h>

#if defined (__SSE3__)
#include <pmmintrin.h>
#endif
#if defined (__SSE4_1__)
#include <smmintrin.h>
#endif

typedef __m128d Tv;
typedef __m128 Tv_s;
typedef __m128d Tm;

#if defined(__SSE4_1__)
#define vblend__(m,a,b) _mm_blendv_pd(b,a,m)
#else
static inline Tv vblend__(Tv m, Tv a, Tv b)
  { return _mm_or_pd(_mm_and_pd(a,m),_mm_andnot_pd(m,b)); }
#endif
#define vzero _mm_setzero_pd()
#define vone _mm_set1_pd(1.)

#define vadd(a,b) _mm_add_pd(a,b)
#define vadd_s(a,b) _mm_add_ps(a,b)
#define vaddeq(a,b) a=_mm_add_pd(a,b)
#define vaddeq_mask(mask,a,b) a=_mm_add_pd(a,vblend__(mask,b,vzero))
#define vsub(a,b) _mm_sub_pd(a,b)
#define vsub_s(a,b) _mm_sub_ps(a,b)
#define vsubeq(a,b) a=_mm_sub_pd(a,b)
#define vsubeq_mask(mask,a,b) a=_mm_sub_pd(a,vblend__(mask,b,vzero))
#define vmul(a,b) _mm_mul_pd(a,b)
#define vmul_s(a,b) _mm_mul_ps(a,b)
#define vmuleq(a,b) a=_mm_mul_pd(a,b)
#define vmuleq_mask(mask,a,b) a=_mm_mul_pd(a,vblend__(mask,b,vone))
#define vfmaeq(a,b,c) a=_mm_add_pd(a,_mm_mul_pd(b,c))
#define vfmaeq_s(a,b,c) a=_mm_add_ps(a,_mm_mul_ps(b,c))
#define vfmseq(a,b,c) a=_mm_sub_pd(a,_mm_mul_pd(b,c))
#define vfmaaeq(a,b,c,d,e) \
  a=_mm_add_pd(a,_mm_add_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
#define vfmaseq(a,b,c,d,e) \
  a=_mm_add_pd(a,_mm_sub_pd(_mm_mul_pd(b,c),_mm_mul_pd(d,e)))
#define vneg(a) _mm_xor_pd(_mm_set1_pd(-0.),a)
#define vload(a) _mm_set1_pd(a)
#define vload_s(a) _mm_set1_ps(a)
#define vabs(a) _mm_andnot_pd(_mm_set1_pd(-0.),a)
#define vsqrt(a) _mm_sqrt_pd(a)
#define vlt(a,b) _mm_cmplt_pd(a,b)
#define vgt(a,b) _mm_cmpgt_pd(a,b)
#define vge(a,b) _mm_cmpge_pd(a,b)
#define vne(a,b) _mm_cmpneq_pd(a,b)
#define vand_mask(a,b) _mm_and_pd(a,b)
#define vor_mask(a,b) _mm_or_pd(a,b)
#define vmin(a,b) _mm_min_pd(a,b)
#define vmax(a,b) _mm_max_pd(a,b)
#define vanyTrue(a) (_mm_movemask_pd(a)!=0)
#define vallTrue(a) (_mm_movemask_pd(a)==3)
#define vloadu(p) _mm_loadu_pd(p)
#define vloadu_s(p) _mm_loadu_ps(p)
#define vstoreu(p, v) _mm_storeu_pd(p, v)
#define vstoreu_s(p, v) _mm_storeu_ps(p, v)

#endif

#if (VLEN==4)

#include <immintrin.h>
#if (USE_FMA4)
#include <x86intrin.h>
#endif

typedef __m256d Tv;
typedef __m256 Tv_s;
typedef __m256d Tm;

#define vblend__(m,a,b) _mm256_blendv_pd(b,a,m)
#define vzero _mm256_setzero_pd()
#define vone _mm256_set1_pd(1.)

#define vadd(a,b) _mm256_add_pd(a,b)
#define vadd_s(a,b) _mm256_add_ps(a,b)
#define vaddeq(a,b) a=_mm256_add_pd(a,b)
#define vaddeq_mask(mask,a,b) a=_mm256_add_pd(a,vblend__(mask,b,vzero))
#define vsub(a,b) _mm256_sub_pd(a,b)
#define vsub_s(a,b) _mm256_sub_ps(a,b)
#define vsubeq(a,b) a=_mm256_sub_pd(a,b)
#define vsubeq_mask(mask,a,b) a=_mm256_sub_pd(a,vblend__(mask,b,vzero))
#define vmul(a,b) _mm256_mul_pd(a,b)
#define vmul_s(a,b) _mm256_mul_ps(a,b)
#define vmuleq(a,b) a=_mm256_mul_pd(a,b)
#define vmuleq_mask(mask,a,b) a=_mm256_mul_pd(a,vblend__(mask,b,vone))
#if (USE_FMA4)
#define vfmaeq(a,b,c) a=_mm256_macc_pd(b,c,a)
#define vfmaeq_s(a,b,c) a=_mm256_macc_ps(b,c,a)
#define vfmseq(a,b,c) a=_mm256_nmacc_pd(b,c,a)
#define vfmaaeq(a,b,c,d,e) a=_mm256_macc_pd(d,e,_mm256_macc_pd(b,c,a))
#define vfmaseq(a,b,c,d,e) a=_mm256_nmacc_pd(d,e,_mm256_macc_pd(b,c,a))
#else
#if (USE_FMA)
#define vfmaeq(a,b,c) a=_mm256_fmadd_pd(b,c,a)
#define vfmaeq_s(a,b,c) a=_mm256_fmadd_ps(b,c,a)
#define vfmseq(a,b,c) a=_mm256_fnmadd_pd(b,c,a)
#define vfmaaeq(a,b,c,d,e) a=_mm256_fmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
#define vfmaseq(a,b,c,d,e) a=_mm256_fnmadd_pd(d,e,_mm256_fmadd_pd(b,c,a))
#else
#define vfmaeq(a,b,c) a=_mm256_add_pd(a,_mm256_mul_pd(b,c))
#define vfmaeq_s(a,b,c) a=_mm256_add_ps(a,_mm256_mul_ps(b,c))
#define vfmseq(a,b,c) a=_mm256_sub_pd(a,_mm256_mul_pd(b,c))
#define vfmaaeq(a,b,c,d,e) \
  a=_mm256_add_pd(a,_mm256_add_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
#define vfmaseq(a,b,c,d,e) \
  a=_mm256_add_pd(a,_mm256_sub_pd(_mm256_mul_pd(b,c),_mm256_mul_pd(d,e)))
#endif
#endif
#define vneg(a) _mm256_xor_pd(_mm256_set1_pd(-0.),a)
#define vload(a) _mm256_set1_pd(a)
#define vload_s(a) _mm256_set1_ps(a)
#define vabs(a) _mm256_andnot_pd(_mm256_set1_pd(-0.),a)
#define vsqrt(a) _mm256_sqrt_pd(a)
#define vlt(a,b) _mm256_cmp_pd(a,b,_CMP_LT_OQ)
#define vgt(a,b) _mm256_cmp_pd(a,b,_CMP_GT_OQ)
#define vge(a,b) _mm256_cmp_pd(a,b,_CMP_GE_OQ)
#define vne(a,b) _mm256_cmp_pd(a,b,_CMP_NEQ_OQ)
#define vand_mask(a,b) _mm256_and_pd(a,b)
#define vor_mask(a,b) _mm256_or_pd(a,b)
#define vmin(a,b) _mm256_min_pd(a,b)
#define vmax(a,b) _mm256_max_pd(a,b)
#define vanyTrue(a) (_mm256_movemask_pd(a)!=0)
#define vallTrue(a) (_mm256_movemask_pd(a)==15)

#define vloadu(p) _mm256_loadu_pd(p)
#define vloadu_s(p) _mm256_loadu_ps(p)
#define vstoreu(p, v) _mm256_storeu_pd(p, v)
#define vstoreu_s(p, v) _mm256_storeu_ps(p, v)

#endif

#if (VLEN==8)

#include <immintrin.h>

typedef __m512d Tv;
typedef __mmask8 Tm;

#define vadd(a,b) _mm512_add_pd(a,b)
#define vaddeq(a,b) a=_mm512_add_pd(a,b)
#define vaddeq_mask(mask,a,b) a=_mm512_mask_add_pd(a,mask,a,b);
#define vsub(a,b) _mm512_sub_pd(a,b)
#define vsubeq(a,b) a=_mm512_sub_pd(a,b)
#define vsubeq_mask(mask,a,b) a=_mm512_mask_sub_pd(a,mask,a,b);
#define vmul(a,b) _mm512_mul_pd(a,b)
#define vmuleq(a,b) a=_mm512_mul_pd(a,b)
#define vmuleq_mask(mask,a,b) a=_mm512_mask_mul_pd(a,mask,a,b);
#define vfmaeq(a,b,c) a=_mm512_fmadd_pd(b,c,a)
#define vfmseq(a,b,c) a=_mm512_fnmadd_pd(b,c,a)
#define vfmaaeq(a,b,c,d,e) a=_mm512_fmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
#define vfmaseq(a,b,c,d,e) a=_mm512_fnmadd_pd(d,e,_mm512_fmadd_pd(b,c,a))
#define vneg(a) _mm512_mul_pd(a,_mm512_set1_pd(-1.))
#define vload(a) _mm512_set1_pd(a)
#define vabs(a) (__m512d)_mm512_andnot_epi64((__m512i)_mm512_set1_pd(-0.),(__m512i)a)
#define vsqrt(a) _mm512_sqrt_pd(a)
#define vlt(a,b) _mm512_cmplt_pd_mask(a,b)
#define vgt(a,b) _mm512_cmpnle_pd_mask(a,b)
#define vge(a,b) _mm512_cmpnlt_pd_mask(a,b)
#define vne(a,b) _mm512_cmpneq_pd_mask(a,b)
#define vand_mask(a,b) ((a)&(b))
#define vor_mask(a,b) ((a)|(b))
#define vmin(a,b) _mm512_min_pd(a,b)
#define vmax(a,b) _mm512_max_pd(a,b)
#define vanyTrue(a) (a!=0)
#define vallTrue(a) (a==255)

#define vzero _mm512_setzero_pd()
#define vone _mm512_set1_pd(1.)

#endif
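
/* Illustration (not part of the original header, disabled via "#if 0",
   function name hypothetical): a typical kernel written against this layer,
   y[i] += s*x[i].  It assumes n is a multiple of VLEN and that the active
   VLEN variant provides vloadu/vstoreu (the scalar, SSE2 and AVX sections
   above do). */
#if 0
static void example_axpy (double *y, const double *x, double s, int n)
  {
  Tv vs = vload(s);                  /* broadcast the scalar factor */
  for (int i=0; i<n; i+=VLEN)
    {
    Tv vy = vloadu(&y[i]);           /* unaligned load of VLEN doubles */
    vfmaeq(vy,vs,vloadu(&x[i]));     /* vy += vs * x[i..i+VLEN-1] */
    vstoreu(&y[i],vy);               /* write the result back */
    }
  }
#endif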

#endif