LevelS SHT library 3.50
sharp_complex_hacks.h
/*
 * This file is part of libsharp.
 *
 * libsharp is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * libsharp is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with libsharp; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
 * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
 * (DLR).
 */

/*! \file sharp_complex_hacks.h
 * support for converting vector types and complex numbers
 *
 * Copyright (C) 2012-2016 Max-Planck-Society
 * Author: Martin Reinecke
 */

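/* All VLEN variants below implement the same three operations:

     vhsum_cmplx(a,b)
       returns (sum of the lanes of a) + _Complex_I*(sum of the lanes of b);
     vhsum_cmplx2(a,b,c,d,c1,c2)
       accumulates the analogous sums of (a,b) into *c1 and of (c,d) into *c2;
     vhsum_cmplx_special(a,b,c,d,cc)
       like vhsum_cmplx2, but writes to the adjacent elements cc[0] and cc[1].

   They fold separate SIMD accumulators for the real and imaginary parts
   into complex results at the end of vectorized inner loops. */
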
#ifndef SHARP_COMPLEX_HACKS_H
#define SHARP_COMPLEX_HACKS_H

#ifdef __cplusplus
/* C99's "complex double" and _Complex_I have no direct equivalent in C++. */
#error This header file cannot be included from C++, only from C
#endif

#include <math.h>
#include <complex.h>
#include "sharp_vecsupport.h"

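/* When UNSAFE_CODE is defined, the wider-VLEN variants below write their
   results through pointer casts that reinterpret "complex double" objects
   as SIMD registers. This avoids a round trip through a union, but relies
   on the two types sharing layout (and, for the __m128d accesses,
   sufficient alignment); comment the define out to get the strictly
   conforming union-based code paths instead. */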
#define UNSAFE_CODE

#if (VLEN==1)

static inline complex double vhsum_cmplx(Tv a, Tv b)
  { return a+_Complex_I*b; }

static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
  complex double * restrict c1, complex double * restrict c2)
  { *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; }

static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
  complex double * restrict cc)
  { cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }

#endif

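/* For VLEN==1, Tv is a plain double (cf. sharp_vecsupport.h), so the
   "horizontal sum" above is trivial. A sketch of the accumulation pattern
   these helpers are meant for (illustrative only; n, wr, wi and x are
   hypothetical, not libsharp identifiers):

     Tv re=0., im=0.;
     for (int i=0; i<n; ++i)
       { re += wr[i]*x[i]; im += wi[i]*x[i]; }
     complex double z = vhsum_cmplx(re,im);   // z == re + i*im

   The wider VLEN variants below keep this interface, but must first sum
   the lanes of each SIMD register. */
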
#if (VLEN==2)

static inline complex double vhsum_cmplx (Tv a, Tv b)
  {
#if defined(__SSE3__)
  /* _mm_hadd_pd(a,b) = {a0+a1, b0+b1}, which is exactly (re,im). */
  Tv tmp = _mm_hadd_pd(a,b);
#else
  /* Without SSE3, build the same {a0+a1, b0+b1} from two shuffles:
     {a1,b0} + {a0,b1}. */
  Tv tmp = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
                _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
#endif
  union {Tv v; complex double c; } u;
  u.v=tmp; return u.c;
  }

static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c,
  Tv d, complex double * restrict c1, complex double * restrict c2)
  {
#ifdef UNSAFE_CODE
#if defined(__SSE3__)
  /* Accumulate {sum(a),sum(b)} and {sum(c),sum(d)} directly into the
     complex targets, reinterpreted as __m128d. */
  vaddeq(*((__m128d *)c1),_mm_hadd_pd(a,b));
  vaddeq(*((__m128d *)c2),_mm_hadd_pd(c,d));
#else
  vaddeq(*((__m128d *)c1),vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
                               _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0))));
  vaddeq(*((__m128d *)c2),vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
                               _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0))));
#endif
#else
  /* Conforming variant: go through a union instead of a pointer cast. */
  union {Tv v; complex double c; } u1, u2;
#if defined(__SSE3__)
  u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
#else
  u1.v = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
              _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
  u2.v = vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
              _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)));
#endif
  *c1+=u1.c; *c2+=u2.c;
#endif
  }

static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
  complex double * restrict cc)
  { vhsum_cmplx2(a,b,c,d,cc,cc+1); }

#endif

#if (VLEN==4)

static inline complex double vhsum_cmplx (Tv a, Tv b)
  {
  /* _mm256_hadd_pd(a,b) = {a0+a1, b0+b1, a2+a3, b2+b3}; swapping the
     128-bit halves and adding leaves {sum(a), sum(b)} in the low half. */
  Tv tmp=_mm256_hadd_pd(a,b);
  Tv tmp2=_mm256_permute2f128_pd(tmp,tmp,1);
  tmp=_mm256_add_pd(tmp,tmp2);
#ifdef UNSAFE_CODE
  complex double ret;
  *((__m128d *)&ret)=_mm256_extractf128_pd(tmp, 0);
  return ret;
#else
  union {Tv v; complex double c[2]; } u;
  u.v=tmp; return u.c[0];
#endif
  }

static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
  complex double * restrict c1, complex double * restrict c2)
  {
  /* tmp1 = {a0+a1, b0+b1, a2+a3, b2+b3} and tmp2 likewise for (c,d);
     the permutes pair up matching 128-bit halves so that after the add
     tmp1 = {sum(a), sum(b), sum(c), sum(d)}. */
  Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
  Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
     tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
  tmp1=vadd(tmp3,tmp4);
#ifdef UNSAFE_CODE
  *((__m128d *)c1)=_mm_add_pd(*((__m128d *)c1),_mm256_extractf128_pd(tmp1, 0));
  *((__m128d *)c2)=_mm_add_pd(*((__m128d *)c2),_mm256_extractf128_pd(tmp1, 1));
#else
  union {Tv v; complex double c[2]; } u;
  u.v=tmp1;
  *c1+=u.c[0]; *c2+=u.c[1];
#endif
  }

static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
  complex double * restrict cc)
  {
  /* Same reduction as in vhsum_cmplx2 above. */
  Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
  Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
     tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
  tmp1=vadd(tmp3,tmp4);
#ifdef UNSAFE_CODE
  /* cc[0] and cc[1] are adjacent, so a single unaligned 256-bit
     load/store pair updates both at once. */
  _mm256_storeu_pd((double *)cc,
    _mm256_add_pd(_mm256_loadu_pd((double *)cc),tmp1));
#else
  union {Tv v; complex double c[2]; } u;
  u.v=tmp1;
  cc[0]+=u.c[0]; cc[1]+=u.c[1];
#endif
  }

#endif

#if (VLEN==8)

/* With AVX-512, _mm512_reduce_add_pd performs the horizontal sums directly. */
static inline complex double vhsum_cmplx(Tv a, Tv b)
  { return _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b); }

static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
  complex double * restrict c1, complex double * restrict c2)
  {
  *c1 += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
  *c2 += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
  }

static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
  complex double * restrict cc)
  { vhsum_cmplx2(a,b,c,d,cc,cc+1); }

#endif

#endif
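
/* A minimal smoke test (a sketch, not part of libsharp; assumes an SSE2
   build in which sharp_vecsupport.h sets VLEN==2 and Tv to __m128d):

     #include <stdio.h>
     #include "sharp_complex_hacks.h"

     int main(void)
       {
       Tv a = _mm_set_pd(2.,1.);   // lanes (1,2): real part sums to 3
       Tv b = _mm_set_pd(4.,3.);   // lanes (3,4): imaginary part sums to 7
       complex double z = vhsum_cmplx(a,b);
       printf("%g%+gi\n", creal(z), cimag(z));   // prints 3+7i
       return 0;
       }
*/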
