typedef union
  { Tb b; double s[VLEN*nvec]; } Y(Tbu);
typedef struct
  { Tb r, i; } Y(Tbri);
typedef struct
  { Tb qr, qi, ur, ui; } Y(Tbqu);
typedef struct
  { double r[VLEN*nvec], i[VLEN*nvec]; } Y(Tsri);
typedef struct
  { double qr[VLEN*nvec],qi[VLEN*nvec],ur[VLEN*nvec],ui[VLEN*nvec]; } Y(Tsqu);
typedef union
  { Y(Tbri) b; Y(Tsri) s; } Y(Tburi);
typedef union
  { Y(Tbqu) b; Y(Tsqu) s; } Y(Tbuqu);
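
/* Small element-wise helpers on the Tb vector bundle (nvec native vectors of
   VLEN doubles each); each one simply loops over the nvec lanes. */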
static inline Tb Y(Tbconst)(double val)
  {
  Tv v=vload(val);
  Tb res;
  for (int i=0; i<nvec; ++i) res.v[i]=v;
  return res;
  }
static inline void Y(Tbmuleq1)(Tb * restrict a, double b)
  { Tv v=vload(b); for (int i=0; i<nvec; ++i) vmuleq(a->v[i],v); }
static inline Tb Y(Tbprod)(Tb a, Tb b)
  { Tb r; for (int i=0; i<nvec; ++i) r.v[i]=vmul(a.v[i],b.v[i]); return r; }
static inline void Y(Tbmuleq)(Tb * restrict a, Tb b)
  { for (int i=0; i<nvec; ++i) vmuleq(a->v[i],b.v[i]); }
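
/* Keeps the entries of *val inside [sharp_fsmall*maxval, maxval] in magnitude
   by repeatedly multiplying with sharp_fsmall or sharp_fbig and booking the
   number of rescalings in *scale (presumably sharp_fbig == 1/sharp_fsmall, so
   the represented quantity stays val * sharp_fbig^scale). */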
static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale,
  double maxval)
  {
  const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
  const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
  for (int i=0; i<nvec; ++i)
    {
    Tm mask = vgt(vabs(val->v[i]),vfmax);
    while (vanyTrue(mask))
      {
      vmuleq_mask(mask,val->v[i],vfsmall);
      vaddeq_mask(mask,scale->v[i],vone);
      mask = vgt(vabs(val->v[i]),vfmax);
      }
    mask = vand_mask(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
    while (vanyTrue(mask))
      {
      vmuleq_mask(mask,val->v[i],vfbig);
      vsubeq_mask(mask,scale->v[i],vone);
      mask = vand_mask(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
      }
    }
  }
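
/* Computes val^npow. When no element can drop below powlimit[npow], a plain
   binary-exponentiation pass is used; otherwise the same scheme runs on
   (value, scale) pairs that are renormalized with Y(Tbnormalize) after every
   step. The result goes to *resd and its scale exponent to *ress. */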
NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimit,
  Tb * restrict resd, Tb * restrict ress)
  {
  Tv vminv=vload(powlimit[npow]);
  Tm mask = vlt(vabs(val.v[0]),vminv);
  for (int i=1; i<nvec; ++i)
    mask=vor_mask(mask,vlt(vabs(val.v[i]),vminv));
  if (!vanyTrue(mask)) /* no underflow possible, use quick algorithm */
    {
    Tb res=Y(Tbconst)(1.);
    do
      {
      if (npow&1)
        for (int i=0; i<nvec; ++i)
          {
          vmuleq(res.v[i],val.v[i]);
          vmuleq(val.v[i],val.v[i]);
          }
      else
        for (int i=0; i<nvec; ++i)
          vmuleq(val.v[i],val.v[i]);
      }
    while (npow>>=1);
    *resd=res;
    *ress=Y(Tbconst)(0.);
    }
  else /* underflow possible, use scaled arithmetic */
    {
    Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.);
    Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
    do
      {
      if (npow&1)
        {
        for (int i=0; i<nvec; ++i)
          {
          vmuleq(res.v[i],val.v[i]);
          vaddeq(scale.v[i],scaleint.v[i]);
          }
        Y(Tbnormalize)(&res,&scale,sharp_fbighalf);
        }
      for (int i=0; i<nvec; ++i)
        {
        vmuleq(val.v[i],val.v[i]);
        vaddeq(scaleint.v[i],scaleint.v[i]);
        }
      Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
      }
    while (npow>>=1);
    *resd=res;
    *ress=scale;
    }
  }
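
/* If any entry of *lam2 has grown beyond sharp_ftol, both recurrence terms in
   that lane are scaled down by sharp_fsmall and the lane's scale counter is
   incremented; returns nonzero when any rescaling happened. */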
static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
  Tb * restrict scale)
  {
  int did_scale=0;
  for (int i=0; i<nvec; ++i)
    {
    Tm mask = vgt(vabs(lam2->v[i]),vload(sharp_ftol));
    if (vanyTrue(mask))
      {
      did_scale=1;
      vmuleq_mask(mask,lam1->v[i],vload(sharp_fsmall));
      vmuleq_mask(mask,lam2->v[i],vload(sharp_fsmall));
      vaddeq_mask(mask,scale->v[i],vone);
      }
    }
  return did_scale;
  }
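
/* Horizontal comparisons: each returns nonzero only if the predicate holds
   for every element of the Tb argument. */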
static inline int Y(TballLt)(Tb a, double b)
  {
  Tv vb=vload(b);
  Tm res=vlt(a.v[0],vb);
  for (int i=1; i<nvec; ++i)
    res=vand_mask(res,vlt(a.v[i],vb));
  return vallTrue(res);
  }
static inline int Y(TballGt)(Tb a, double b)
  {
  Tv vb=vload(b);
  Tm res=vgt(a.v[0],vb);
  for (int i=1; i<nvec; ++i)
    res=vand_mask(res,vgt(a.v[i],vb));
  return vallTrue(res);
  }
static inline int Y(TballGe)(Tb a, double b)
  {
  Tv vb=vload(b);
  Tm res=vge(a.v[0],vb);
  for (int i=1; i<nvec; ++i)
    res=vand_mask(res,vge(a.v[i],vb));
  return vallTrue(res);
  }
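
/* Looks up, per element, the correction factor that undoes the accumulated
   scale exponent; entries whose scale is still below sharp_minscale are
   flushed to zero. */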
static void Y(getCorfac)(Tb scale, Tb * restrict corfac,
  const double * restrict cf)
  {
  Y(Tbu) sc, corf;
  sc.b=scale;
  for (int i=0; i<VLEN*nvec; ++i)
    corf.s[i] = (sc.s[i]<sharp_minscale) ?
      0. : cf[(int)(sc.s[i])-sharp_minscale];
  *corfac=corf.b;
  }
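
/* Seeds the scalar (spin-0) recurrence at l=m and advances it in scaled
   arithmetic while every lane's scale exponent is still below
   sharp_limscale, i.e. while the values are too small for ordinary IEEE
   arithmetic; returns the current l plus the last two recurrence terms and
   their scale. */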
NOINLINE static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
  Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_,
  const sharp_Ylmgen_C * restrict gen)
  {
  int l=gen->m;
  Tb lam_1=Y(Tbconst)(0.), lam_2, scale;
  Y(mypow) (sth,l,gen->powlimit,&lam_2,&scale);
  Y(Tbmuleq1) (&lam_2,(gen->m&1) ? -gen->mfac[gen->m] : gen->mfac[gen->m]);
  Y(Tbnormalize)(&lam_2,&scale,sharp_ftol);

  int below_limit = Y(TballLt)(scale,sharp_limscale);
  while (below_limit)
    {
    if (l+2>gen->lmax) { *l_=gen->lmax+1; return; }
    for (int i=0; i<nvec; ++i)
      lam_1.v[i] = vload(gen->rf[l].f[0])*(cth.v[i]*lam_2.v[i])
                 - vload(gen->rf[l].f[1])*lam_1.v[i];
    for (int i=0; i<nvec; ++i)
      lam_2.v[i] = vload(gen->rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
                 - vload(gen->rf[l+1].f[1])*lam_2.v[i];
    if (Y(rescale)(&lam_1,&lam_2,&scale))
      below_limit = Y(TballLt)(scale,sharp_limscale);
    l+=2;
    }
  *l_=l; *lam_1_=lam_1; *lam_2_=lam_2; *scale_=scale;
  }
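
/* One step of the three-term recurrence for the spin-weighted functions:
   advances the '+'/'-' component pair (rxp,rxm) from (ryp,rym) using the
   precomputed coefficients fx.f[0..2]. */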
static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
  Tb * restrict ryp, Tb * restrict rym, const Tb cth,
  const sharp_ylmgen_dbl3 fx)
  {
  Tv fx0=vload(fx.f[0]), fx1=vload(fx.f[1]), fx2=vload(fx.f[2]);
  for (int i=0; i<nvec; ++i)
    {
    rxp->v[i] = (cth.v[i]-fx1)*fx0*ryp->v[i] - fx2*rxp->v[i];
    rxm->v[i] = (cth.v[i]+fx1)*fx0*rym->v[i] - fx2*rxm->v[i];
    }
  }
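
/* Spin-weighted analogue of Y(iter_to_ieee): builds the starting values at
   l=gen->mhi from powers of cos(theta/2) and sin(theta/2) and advances the
   recurrence in scaled arithmetic until the values become representable in
   ordinary IEEE arithmetic. */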
static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
  Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_,
  Tb * scalep_, Tb * scalem_, const sharp_Ylmgen_C * restrict gen)
  {
  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
  Tb cth2, sth2;
  for (int i=0; i<nvec; ++i)
    {
    /* cos(theta/2) and sin(theta/2) via half-angle formulas, clamped away
       from zero; signs are adjusted for lanes with negative sth */
    cth2.v[i]=vsqrt(vmul(vadd(vone,cth.v[i]),vload(0.5)));
    cth2.v[i]=vmax(cth2.v[i],vload(1e-15));
    sth2.v[i]=vsqrt(vmul(vsub(vone,cth.v[i]),vload(0.5)));
    sth2.v[i]=vmax(sth2.v[i],vload(1e-15));
    Tm mask=vlt(sth.v[i],vzero);
    Tm cmask=vand_mask(mask,vlt(cth.v[i],vzero));
    vmuleq_mask(cmask,cth2.v[i],vload(-1.));
    Tm smask=vand_mask(mask,vgt(cth.v[i],vzero));
    vmuleq_mask(smask,sth2.v[i],vload(-1.));
    }

  Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps;
  Y(mypow)(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
  Y(mypow)(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps);
  Y(mypow)(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
  Y(mypow)(sth2,gen->cosPow,gen->powlimit,&scp,&scps);

  Tb rec2p, rec2m, scalep, scalem;
  Tb rec1p=Y(Tbconst)(0.), rec1m=Y(Tbconst)(0.);
  Tv prefac=vload(gen->prefac[gen->m]),
     prescale=vload(gen->fscale[gen->m]);
  for (int i=0; i<nvec; ++i)
    {
    rec2p.v[i]=vmul(prefac,ccp.v[i]);
    scalep.v[i]=vadd(prescale,ccps.v[i]);
    rec2m.v[i]=vmul(prefac,csp.v[i]);
    scalem.v[i]=vadd(prescale,csps.v[i]);
    }
  Y(Tbnormalize)(&rec2m,&scalem,sharp_fbighalf);
  Y(Tbnormalize)(&rec2p,&scalep,sharp_fbighalf);
  for (int i=0; i<nvec; ++i)
    {
    rec2p.v[i]=vmul(rec2p.v[i],ssp.v[i]);
    scalep.v[i]=vadd(scalep.v[i],ssps.v[i]);
    rec2m.v[i]=vmul(rec2m.v[i],scp.v[i]);
    scalem.v[i]=vadd(scalem.v[i],scps.v[i]);
    if (gen->preMinus_p)
      rec2p.v[i]=vneg(rec2p.v[i]);
    if (gen->preMinus_m)
      rec2m.v[i]=vneg(rec2m.v[i]);
    if (gen->s&1)
      rec2p.v[i]=vneg(rec2p.v[i]);
    }
  Y(Tbnormalize)(&rec2m,&scalem,sharp_ftol);
  Y(Tbnormalize)(&rec2p,&scalep,sharp_ftol);

  int l=gen->mhi;
  int below_limit = Y(TballLt)(scalep,sharp_limscale)
                 && Y(TballLt)(scalem,sharp_limscale);
  while (below_limit)
    {
    if (l+2>gen->lmax) { *l_=gen->lmax+1; return; }
    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l+1]);
    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l+2]);
    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
      below_limit = Y(TballLt)(scalep,sharp_limscale)
                 && Y(TballLt)(scalem,sharp_limscale);
    l+=2;
    }

  *l_=l;
  *rec1p_=rec1p; *rec2p_=rec2p; *scalep_=scalep;
  *rec1m_=rec1m; *rec2m_=rec2m; *scalem_=scalem;
  }