32 #ifndef SHARP_COMPLEX_HACKS_H 33 #define SHARP_COMPLEX_HACKS_H 36 #error This header file cannot be included from C++, only from C 41 #include "sharp_vecsupport.h" 47 static inline complex
double vhsum_cmplx(Tv a, Tv b)
48 {
return a+_Complex_I*b; }
50 static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
51 complex
double * restrict c1, complex
double * restrict c2)
52 { *c1 += a+_Complex_I*b; *c2 += c+_Complex_I*d; }
54 static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
55 complex
double * restrict cc)
56 { cc[0] += a+_Complex_I*b; cc[1] += c+_Complex_I*d; }
#if (VLEN==2)

/* SSE2 (VLEN==2): Tv holds two doubles (__m128d).  Each output lane is the
   horizontal sum of one input vector; (sum(a), sum(b)) become the real and
   imaginary parts of the result. */

/* Return sum(a) + i*sum(b). */
static inline complex double vhsum_cmplx (Tv a, Tv b)
  {
#if defined(__SSE3__)
  Tv tmp = _mm_hadd_pd(a,b);
#else
  /* SSE2 has no horizontal add: add the two cross-lane shuffles instead */
  Tv tmp = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
                _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
#endif
  union {Tv v; complex double c; } u;
  u.v=tmp; return u.c;
  }

/* Accumulate sum(a)+i*sum(b) into *c1 and sum(c)+i*sum(d) into *c2. */
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c,
  Tv d, complex double * restrict c1, complex double * restrict c2)
  {
#ifdef UNSAFE_CODE
  /* add directly into the accumulators, reinterpreted as __m128d */
#if defined(__SSE3__)
  vaddeq(*((__m128d *)c1),_mm_hadd_pd(a,b));
  vaddeq(*((__m128d *)c2),_mm_hadd_pd(c,d));
#else
  vaddeq(*((__m128d *)c1),vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
                               _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0))));
  vaddeq(*((__m128d *)c2),vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
                               _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0))));
#endif
#else
  /* safe variant: go through unions instead of pointer casts */
  union {Tv v; complex double c; } u1, u2;
#if defined(__SSE3__)
  u1.v = _mm_hadd_pd(a,b); u2.v=_mm_hadd_pd(c,d);
#else
  u1.v = vadd(_mm_shuffle_pd(a,b,_MM_SHUFFLE2(0,1)),
              _mm_shuffle_pd(a,b,_MM_SHUFFLE2(1,0)));
  u2.v = vadd(_mm_shuffle_pd(c,d,_MM_SHUFFLE2(0,1)),
              _mm_shuffle_pd(c,d,_MM_SHUFFLE2(1,0)));
#endif
  *c1 += u1.c; *c2 += u2.c;
#endif
  }

/* As vhsum_cmplx2, but with adjacent destinations cc[0] and cc[1]. */
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
  complex double * restrict cc)
  { vhsum_cmplx2(a,b,c,d,cc,cc+1); }

#endif
#if (VLEN==4)

/* AVX (VLEN==4): Tv holds four doubles (__m256d).  Each output lane is the
   horizontal sum of one input vector. */

/* Return sum(a) + i*sum(b). */
static inline complex double vhsum_cmplx (Tv a, Tv b)
  {
  Tv tmp=_mm256_hadd_pd(a,b);
  /* swap the two 128-bit halves and add, completing the reduction */
  Tv tmp2=_mm256_permute2f128_pd(tmp,tmp,1);
  tmp=_mm256_add_pd(tmp,tmp2);
#ifdef UNSAFE_CODE
  complex double ret;
  *((__m128d *)&ret)=_mm256_extractf128_pd(tmp, 0);
  return ret;
#else
  union {Tv v; complex double c[2]; } u;
  u.v=tmp;
  return u.c[0];
#endif
  }

/* Accumulate sum(a)+i*sum(b) into *c1 and sum(c)+i*sum(d) into *c2. */
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
  complex double * restrict c1, complex double * restrict c2)
  {
  Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
  /* 49 (0x31): high 128-bit halves of tmp1/tmp2; 32 (0x20): low halves */
  Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
     tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
  tmp1=vadd(tmp3,tmp4);
#ifdef UNSAFE_CODE
  *((__m128d *)c1)=_mm_add_pd(*((__m128d *)c1),_mm256_extractf128_pd(tmp1, 0));
  *((__m128d *)c2)=_mm_add_pd(*((__m128d *)c2),_mm256_extractf128_pd(tmp1, 1));
#else
  union {Tv v; complex double c[2]; } u;
  u.v=tmp1;
  *c1+=u.c[0]; *c2+=u.c[1];
#endif
  }

/* As vhsum_cmplx2, but with adjacent destinations cc[0] and cc[1]; the
   contiguous layout allows a single 256-bit load/add/store. */
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
  complex double * restrict cc)
  {
  Tv tmp1=_mm256_hadd_pd(a,b), tmp2=_mm256_hadd_pd(c,d);
  Tv tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49),
     tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32);
  tmp1=vadd(tmp3,tmp4);
#ifdef UNSAFE_CODE
  /* unaligned access: complex double only guarantees 8-byte alignment */
  _mm256_storeu_pd((double *)cc,
    _mm256_add_pd(_mm256_loadu_pd((double *)cc),tmp1));
#else
  union {Tv v; complex double c[2]; } u;
  u.v=tmp1;
  cc[0]+=u.c[0]; cc[1]+=u.c[1];
#endif
  }

#endif
#if (VLEN==8)

/* AVX-512 (VLEN==8): Tv holds eight doubles (__m512d); use the built-in
   full-vector reduction intrinsic. */

/* Return sum(a) + i*sum(b). */
static inline complex double vhsum_cmplx(Tv a, Tv b)
  { return _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b); }

/* Accumulate sum(a)+i*sum(b) into *c1 and sum(c)+i*sum(d) into *c2. */
static inline void vhsum_cmplx2 (Tv a, Tv b, Tv c, Tv d,
  complex double * restrict c1, complex double * restrict c2)
  {
  *c1 += _mm512_reduce_add_pd(a)+_Complex_I*_mm512_reduce_add_pd(b);
  *c2 += _mm512_reduce_add_pd(c)+_Complex_I*_mm512_reduce_add_pd(d);
  }

/* As vhsum_cmplx2, but with adjacent destinations cc[0] and cc[1]. */
static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d,
  complex double * restrict cc)
  { vhsum_cmplx2(a,b,c,d,cc,cc+1); }

#endif