NOINLINE static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
  Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
  const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
  int l, int lmax NJ1)
  {
  /* four-fold unrolled main loop: advance the Legendre recurrence by four
     degrees per pass and fold a_lm ... a_(l+3)m into the even (p1) and odd
     (p2) ring accumulators */
  while (l<=lmax-3)
    {
    Tb lam_3, lam_4;
    Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
    for (int i=0; i<nvec; ++i)
      lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
    r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
    for (int i=0; i<nvec; ++i)
      lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
    r0=vload(rf[l+2].f[0]);r1=vload(rf[l+2].f[1]);
    for (int i=0; i<nvec; ++i)
      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
    for (int j=0; j<njobs; ++j)
      {
      Tv ar2=vload(creal(alm[njobs*l+j])),
         ai2=vload(cimag(alm[njobs*l+j])),
         ar4=vload(creal(alm[njobs*(l+2)+j])),
         ai4=vload(cimag(alm[njobs*(l+2)+j]));
      for (int i=0; i<nvec; ++i)
        {
        vfmaaeq(p1[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4);
        vfmaaeq(p1[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4);
        }
      Tv ar3=vload(creal(alm[njobs*(l+1)+j])),
         ai3=vload(cimag(alm[njobs*(l+1)+j])),
         ar1=vload(creal(alm[njobs*(l+3)+j])),
         ai1=vload(cimag(alm[njobs*(l+3)+j]));
      for (int i=0; i<nvec; ++i)
        {
        vfmaaeq(p2[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1);
        vfmaaeq(p2[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1);
        }
      }
    r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]);
    for (int i=0; i<nvec; ++i)
      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
    l+=4;
    }
  /* two-fold unrolled remainder */
  while (l<lmax)
    {
    for (int i=0; i<nvec; ++i)
      lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
                 - vload(rf[l].f[1])*lam_1.v[i];
    for (int j=0; j<njobs; ++j)
      {
      Tv ar=vload(creal(alm[njobs*l+j])),
         ai=vload(cimag(alm[njobs*l+j]));
      for (int i=0; i<nvec; ++i)
        {
        p1[j].r.v[i] += lam_2.v[i]*ar;
        p1[j].i.v[i] += lam_2.v[i]*ai;
        }
      }
    for (int i=0; i<nvec; ++i)
      lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
                 - vload(rf[l+1].f[1])*lam_2.v[i];
    for (int j=0; j<njobs; ++j)
      {
      Tv ar=vload(creal(alm[njobs*(l+1)+j])),
         ai=vload(cimag(alm[njobs*(l+1)+j]));
      for (int i=0; i<nvec; ++i)
        {
        p2[j].r.v[i] += lam_1.v[i]*ar;
        p2[j].i.v[i] += lam_1.v[i]*ai;
        }
      }
    l+=2;
    }
  /* last degree, if (lmax-l) came out even */
  if (l==lmax)
    {
    for (int j=0; j<njobs; ++j)
      {
      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
      for (int i=0; i<nvec; ++i)
        {
        p1[j].r.v[i] += lam_2.v[i]*ar;
        p1[j].i.v[i] += lam_2.v[i]*ai;
        }
      }
    }
  }
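/* map2alm_kernel is the adjoint of alm2map_kernel: it runs the same two-term
   Legendre recurrence, but gathers the (already Fourier-transformed) ring
   data p1/p2, weighted by lam_l, into the accumulator array atmp, which
   holds two Tv slots (real and imaginary part) per (l,job) pair; the
   horizontal reduction over SIMD lanes is done later by the caller. */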
NOINLINE static void Z(map2alm_kernel) (const Tb cth,
  const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1,
  Tb lam_2, const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax,
  Tv *restrict atmp NJ1)
  {
  while (l<lmax)
    {
    for (int i=0; i<nvec; ++i)
      lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
                 - vload(rf[l].f[1])*lam_1.v[i];
    for (int j=0; j<njobs; ++j)
      for (int i=0; i<nvec; ++i)
        {
        atmp[2*(l*njobs+j)]+=lam_2.v[i]*p1[j].r.v[i];
        atmp[2*(l*njobs+j)+1]+=lam_2.v[i]*p1[j].i.v[i];
        }
    for (int i=0; i<nvec; ++i)
      lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
                 - vload(rf[l+1].f[1])*lam_2.v[i];
    for (int j=0; j<njobs; ++j)
      for (int i=0; i<nvec; ++i)
        {
        atmp[2*((l+1)*njobs+j)]+=lam_1.v[i]*p2[j].r.v[i];
        atmp[2*((l+1)*njobs+j)+1]+=lam_1.v[i]*p2[j].i.v[i];
        }
    l+=2;
    }
  if (l==lmax)
    {
    for (int j=0; j<njobs; ++j)
      for (int i=0; i<nvec; ++i)
        {
        atmp[2*(l*njobs+j)] += lam_2.v[i]*p1[j].r.v[i];
        atmp[2*(l*njobs+j)+1] += lam_2.v[i]*p1[j].i.v[i];
        }
    }
  }
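/* The calc_* drivers below share one pattern: iter_to_ieee() advances the
   recurrence until the dynamically rescaled Legendre values become
   representable, the subsequent loop keeps multiplying by the correction
   factor corfac (rescaling via Y(rescale) as needed) until every SIMD lane
   is in the normal IEEE range, and only then does the unscaled fast kernel
   take over.  As an illustrative scalar sketch (not part of the library;
   f0/f1 stand for the precomputed factors rf[l].f[0] and rf[l].f[1]), one
   step of the underlying three-term recurrence in l for fixed m is simply

       lam_next = f0*cth*lam_cur - f1*lam_prev;

   applied to the normalized associated Legendre function lambda_lm(cth). */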
NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth,
  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
  Y(Tbri) * restrict p2 NJ1)
  {
  int l,lmax=gen->lmax;
  Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
  if (l>lmax) return;
  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;

  Tb corfac;
  Y(getCorfac)(scale,&corfac,gen->cf);
  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
  const dcmplx * restrict alm=job->almtmp;
  int full_ieee = Y(TballGe)(scale,sharp_minscale);
  while (!full_ieee)
    {
    for (int j=0; j<njobs; ++j)
      {
      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
      for (int i=0; i<nvec; ++i)
        {
        Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
        vfmaeq(p1[j].r.v[i],tmp,ar);
        vfmaeq(p1[j].i.v[i],tmp,ai);
        }
      }
    if (++l>lmax) break;
    Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
    for (int i=0; i<nvec; ++i)
      lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
    for (int j=0; j<njobs; ++j)
      {
      Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
      for (int i=0; i<nvec; ++i)
        {
        Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
        vfmaeq(p2[j].r.v[i],tmp,ar);
        vfmaeq(p2[j].i.v[i],tmp,ai);
        }
      }
    if (++l>lmax) break;
    r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
    for (int i=0; i<nvec; ++i)
      lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
    if (Y(rescale)(&lam_1,&lam_2,&scale))
      {
      Y(getCorfac)(scale,&corfac,gen->cf);
      full_ieee = Y(TballGe)(scale,sharp_minscale);
      }
    }
  if (l>lmax) return;

  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
  Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2);
  }
NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth,
  const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
  const Y(Tbri) * restrict p2, Tv *restrict atmp NJ1)
  {
  int l,lmax=gen->lmax;
  Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
  if (l>lmax) return;
  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;

  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
  Tb corfac;
  Y(getCorfac)(scale,&corfac,gen->cf);
  int full_ieee = Y(TballGe)(scale,sharp_minscale);
  while (!full_ieee)
    {
    for (int j=0; j<njobs; ++j)
      for (int i=0; i<nvec; ++i)
        {
        Tv tmp=lam_2.v[i]*corfac.v[i];
        atmp[2*(l*njobs+j)]+=tmp*p1[j].r.v[i];
        atmp[2*(l*njobs+j)+1]+=tmp*p1[j].i.v[i];
        }
    if (++l>lmax) return;
    for (int i=0; i<nvec; ++i)
      lam_1.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_2.v[i])
                 - vload(rf[l-1].f[1])*lam_1.v[i];
    for (int j=0; j<njobs; ++j)
      for (int i=0; i<nvec; ++i)
        {
        Tv tmp=lam_1.v[i]*corfac.v[i];
        atmp[2*(l*njobs+j)]+=tmp*p2[j].r.v[i];
        atmp[2*(l*njobs+j)+1]+=tmp*p2[j].i.v[i];
        }
    if (++l>lmax) return;
    for (int i=0; i<nvec; ++i)
      lam_2.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_1.v[i])
                 - vload(rf[l-1].f[1])*lam_2.v[i];
    if (Y(rescale)(&lam_1,&lam_2,&scale))
      {
      Y(getCorfac)(scale,&corfac,gen->cf);
      full_ieee = Y(TballGe)(scale,sharp_minscale);
      }
    }

  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
  Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp NJ2);
  }
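/* saddstep adds one degree l of a spin-s transform to the ring accumulators:
   agr/agi and acr/aci are the real and imaginary parts of the two
   coefficient components (gradient- and curl-like) of job j, lw = rxp+rxm
   is the symmetric combination of the two recurrence branches and couples
   to px, while lx = rxm-rxp is the antisymmetric one and enters py with the
   component roles swapped and partly opposite signs. */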
static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
  const Tb rxp, const Tb rxm, const dcmplx * restrict alm NJ1)
  {
  for (int j=0; j<njobs; ++j)
    {
    Tv agr=vload(creal(alm[2*j])), agi=vload(cimag(alm[2*j])),
       acr=vload(creal(alm[2*j+1])), aci=vload(cimag(alm[2*j+1]));
    for (int i=0; i<nvec; ++i)
      {
      Tv lw=vadd(rxp.v[i],rxm.v[i]);
      vfmaeq(px[j].qr.v[i],agr,lw);
      vfmaeq(px[j].qi.v[i],agi,lw);
      vfmaeq(px[j].ur.v[i],acr,lw);
      vfmaeq(px[j].ui.v[i],aci,lw);
      }
    for (int i=0; i<nvec; ++i)
      {
      Tv lx=vsub(rxm.v[i],rxp.v[i]);
      vfmseq(py[j].qr.v[i],aci,lx);
      vfmaeq(py[j].qi.v[i],acr,lx);
      vfmaeq(py[j].ur.v[i],agi,lx);
      vfmseq(py[j].ui.v[i],agr,lx);
      }
    }
  }
static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
  const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
  const dcmplx * restrict alm1, const dcmplx * restrict alm2 NJ1)
  {
  for (int j=0; j<njobs; ++j)
    {
    Tv agr1=vload(creal(alm1[2*j])), agi1=vload(cimag(alm1[2*j])),
       acr1=vload(creal(alm1[2*j+1])), aci1=vload(cimag(alm1[2*j+1]));
    Tv agr2=vload(creal(alm2[2*j])), agi2=vload(cimag(alm2[2*j])),
       acr2=vload(creal(alm2[2*j+1])), aci2=vload(cimag(alm2[2*j+1]));
    for (int i=0; i<nvec; ++i)
      {
      Tv lw1=r2p.v[i]+r2m.v[i];
      Tv lx2=r1m.v[i]-r1p.v[i];
      vfmaseq(p1[j].qr.v[i],agr1,lw1,aci2,lx2);
      vfmaaeq(p1[j].qi.v[i],agi1,lw1,acr2,lx2);
      vfmaaeq(p1[j].ur.v[i],acr1,lw1,agi2,lx2);
      vfmaseq(p1[j].ui.v[i],aci1,lw1,agr2,lx2);
      }
    for (int i=0; i<nvec; ++i)
      {
      Tv lx1=r2m.v[i]-r2p.v[i];
      Tv lw2=r1p.v[i]+r1m.v[i];
      vfmaseq(p2[j].qr.v[i],agr2,lw2,aci1,lx1);
      vfmaaeq(p2[j].qi.v[i],agi2,lw2,acr1,lx1);
      vfmaaeq(p2[j].ur.v[i],acr2,lw2,agi1,lx1);
      vfmaseq(p2[j].ui.v[i],aci2,lw2,agr1,lx1);
      }
    }
  }
static inline void Z(saddstep2) (const Y(Tbqu) * restrict px,
  const Y(Tbqu) * restrict py, const Tb * restrict rxp,
  const Tb * restrict rxm, dcmplx * restrict alm NJ1)
  {
  for (int j=0; j<njobs; ++j)
    {
    Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero;
    for (int i=0; i<nvec; ++i)
      {
      Tv lw=vadd(rxp->v[i],rxm->v[i]);
      vfmaeq(agr,px[j].qr.v[i],lw);
      vfmaeq(agi,px[j].qi.v[i],lw);
      vfmaeq(acr,px[j].ur.v[i],lw);
      vfmaeq(aci,px[j].ui.v[i],lw);
      }
    for (int i=0; i<nvec; ++i)
      {
      Tv lx=vsub(rxm->v[i],rxp->v[i]);
      vfmseq(agr,py[j].ui.v[i],lx);
      vfmaeq(agi,py[j].ur.v[i],lx);
      vfmaeq(acr,py[j].qi.v[i],lx);
      vfmseq(aci,py[j].qr.v[i],lx);
      }
    vhsum_cmplx_special(agr,agi,acr,aci,&alm[2*j]);
    }
  }
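/* The spin kernels advance a recurrence with three precomputed factors per
   degree, fx[l].f[0..2]; the "p" and "m" branches differ only in the sign
   in front of fx1 (a consequence of the theta -> pi-theta symmetry):

       rec1p_new = (cth - fx1)*fx0*rec2p - fx2*rec1p;
       rec1m_new = (cth + fx1)*fx0*rec2m - fx2*rec1m;

   alm2map_spin_kernel processes two degrees per loop iteration, handing the
   accumulation to saddstepb (two degrees at once) and, for a possible final
   degree, to saddstep. */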
NOINLINE static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm,
  int l, int lmax NJ1)
  {
  while (l<lmax)
    {
    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
       fx2=vload(fx[l+1].f[2]);
    for (int i=0; i<nvec; ++i)
      {
      rec1p.v[i] = (cth.v[i]-fx1)*fx0*rec2p.v[i] - fx2*rec1p.v[i];
      rec1m.v[i] = (cth.v[i]+fx1)*fx0*rec2m.v[i] - fx2*rec1m.v[i];
      }
    Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
      &alm[2*njobs*(l+1)] NJ2);
    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
    fx2=vload(fx[l+2].f[2]);
    for (int i=0; i<nvec; ++i)
      {
      rec2p.v[i] = (cth.v[i]-fx1)*fx0*rec1p.v[i] - fx2*rec2p.v[i];
      rec2m.v[i] = (cth.v[i]+fx1)*fx0*rec1m.v[i] - fx2*rec2m.v[i];
      }
    l+=2;
    }
  if (l==lmax)
    Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2);
  }
NOINLINE static void Z(map2alm_spin_kernel) (Tb cth,
  const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2,
  Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm,
  int l, int lmax NJ1)
  {
  while (l<lmax)
    {
    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
       fx2=vload(fx[l+1].f[2]);
    for (int i=0; i<nvec; ++i)
      {
      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
                        vmul(fx2,rec1p.v[i]));
      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
                        vmul(fx2,rec1m.v[i]));
      }
    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
    Z(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*njobs*(l+1)] NJ2);
    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
    fx2=vload(fx[l+2].f[2]);
    for (int i=0; i<nvec; ++i)
      {
      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
                        vmul(fx2,rec2p.v[i]));
      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
                        vmul(fx2,rec2m.v[i]));
      }
    l+=2;
    }
  if (l==lmax)
    Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
  }
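/* calc_alm2map_spin mirrors calc_alm2map, but tracks two independent scale
   factors (scalep/scalem) for the two recurrence branches; both have to
   reach the normal IEEE range before the unscaled spin kernel takes over. */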
NOINLINE static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
  Y(Tbqu) * restrict p2 NJ1)
  {
  int l, lmax=gen->lmax;
  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
  Y(iter_to_ieee_spin)
    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
  if (l>lmax) return;
  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;

  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
  Tb corfacp,corfacm;
  Y(getCorfac)(scalep,&corfacp,gen->cf);
  Y(getCorfac)(scalem,&corfacm,gen->cf);
  const dcmplx * restrict alm=job->almtmp;
  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
               && Y(TballGe)(scalem,sharp_minscale);
  while (!full_ieee)
    {
    Z(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
      &alm[2*njobs*l] NJ2);
    if (++l>lmax) break;
    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
    Z(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
      &alm[2*njobs*l] NJ2);
    if (++l>lmax) break;
    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
      {
      Y(getCorfac)(scalep,&corfacp,gen->cf);
      Y(getCorfac)(scalem,&corfacm,gen->cf);
      full_ieee = Y(TballGe)(scalep,sharp_minscale)
               && Y(TballGe)(scalem,sharp_minscale);
      }
    }
  if (l>lmax) return;

  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
  Z(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
    lmax NJ2);
  }
NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
  const sharp_Ylmgen_C * restrict gen, sharp_job *job,
  const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 NJ1)
  {
  int l, lmax=gen->lmax;
  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
  Y(iter_to_ieee_spin)
    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
  if (l>lmax) return;
  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;

  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
  Tb corfacp,corfacm;
  Y(getCorfac)(scalep,&corfacp,gen->cf);
  Y(getCorfac)(scalem,&corfacm,gen->cf);
  dcmplx * restrict alm=job->almtmp;
  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
               && Y(TballGe)(scalem,sharp_minscale);
  while (!full_ieee)
    {
    Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
    Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l] NJ2);
    if (++l>lmax) return;
    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
    t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm);
    Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l] NJ2);
    if (++l>lmax) return;
    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
      {
      Y(getCorfac)(scalep,&corfacp,gen->cf);
      Y(getCorfac)(scalem,&corfacm,gen->cf);
      full_ieee = Y(TballGe)(scalep,sharp_minscale)
               && Y(TballGe)(scalem,sharp_minscale);
      }
    }

  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
  Z(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax NJ2);
  }
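/* saddstep_d is the first-derivative analogue of saddstep: each job carries
   a single a_lm (alm[j] instead of a gradient/curl pair), and the two
   derivative outputs reuse the Q- and U-slots of the Tbqu accumulators. */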
static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
  const Tb rxp, const Tb rxm, const dcmplx * restrict alm NJ1)
  {
  for (int j=0; j<njobs; ++j)
    {
    Tv ar=vload(creal(alm[j])), ai=vload(cimag(alm[j]));
    for (int i=0; i<nvec; ++i)
      {
      Tv lw=vadd(rxp.v[i],rxm.v[i]);
      vfmaeq(px[j].qr.v[i],ar,lw);
      vfmaeq(px[j].qi.v[i],ai,lw);
      }
    for (int i=0; i<nvec; ++i)
      {
      Tv lx=vsub(rxm.v[i],rxp.v[i]);
      vfmaeq(py[j].ur.v[i],ai,lx);
      vfmseq(py[j].ui.v[i],ar,lx);
      }
    }
  }
NOINLINE static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm,
  int l, int lmax NJ1)
  {
  while (l<lmax)
    {
    Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
       fx2=vload(fx[l+1].f[2]);
    for (int i=0; i<nvec; ++i)
      {
      rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
                        vmul(fx2,rec1p.v[i]));
      rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
                        vmul(fx2,rec1m.v[i]));
      }
    Z(saddstep_d)(p1,p2,rec2p,rec2m,&alm[njobs*l] NJ2);
    Z(saddstep_d)(p2,p1,rec1p,rec1m,&alm[njobs*(l+1)] NJ2);
    fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
    fx2=vload(fx[l+2].f[2]);
    for (int i=0; i<nvec; ++i)
      {
      rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
                        vmul(fx2,rec2p.v[i]));
      rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
                        vmul(fx2,rec2m.v[i]));
      }
    l+=2;
    }
  if (l==lmax)
    Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l] NJ2);
  }
NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
  Y(Tbqu) * restrict p2 NJ1)
  {
  int l, lmax=gen->lmax;
  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
  Y(iter_to_ieee_spin)
    (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
  if (l>lmax) return;
  job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;

  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
  Tb corfacp,corfacm;
  Y(getCorfac)(scalep,&corfacp,gen->cf);
  Y(getCorfac)(scalem,&corfacm,gen->cf);
  const dcmplx * restrict alm=job->almtmp;
  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
               && Y(TballGe)(scalem,sharp_minscale);
  while (!full_ieee)
    {
    Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
      &alm[njobs*l] NJ2);
    if (++l>lmax) break;
    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
    Z(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
      &alm[njobs*l] NJ2);
    if (++l>lmax) break;
    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
      {
      Y(getCorfac)(scalep,&corfacp,gen->cf);
      Y(getCorfac)(scalem,&corfacm,gen->cf);
      full_ieee = Y(TballGe)(scalep,sharp_minscale)
               && Y(TballGe)(scalem,sharp_minscale);
      }
    }
  if (l>lmax) return;

  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
  Z(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm,
    l, lmax NJ2);
  }
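/* The inner_loop_* functions walk over the rings in blocks of
   nval = nvec*VLEN colatitudes.  For each block they gather cth/sth into
   SIMD unions, skip blocks in which no ring needs the current m (mlim test),
   and exchange data with job->phase, indexed as
   itot*job->s_th + mi*job->s_m plus a small per-job/per-component offset.
   For a ring with a mirror partner (ispair[itot]) the sum of the symmetric
   and antisymmetric parts goes to the ring itself and their difference to
   the partner. */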
#define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)

NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair,
  const double *cth_, const double *sth_, int llim, int ulim,
  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
  {
  const int nval=nvec*VLEN;
  const int m = job->ainfo->mval[mi];

  if (gen->s==0)
    {
    for (int ith=0; ith<ulim-llim; ith+=nval)
      {
      Y(Tburi) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
      Y(Tbu) cth, sth;
      int skip=1;
      for (int i=0; i<nval; ++i)
        {
        int itot=i+ith;
        if (itot>=ulim-llim) itot=ulim-llim-1;
        if (mlim[itot]>=m) skip=0;
        cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
        }
      if (!skip)
        Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
      for (int i=0; i<nval; ++i)
        {
        int itot=i+ith;
        if ((itot<ulim-llim)&&(mlim[itot]>=m))
          {
          for (int j=0; j<njobs; ++j)
            {
            int phas_idx = itot*job->s_th + mi*job->s_m + 2*j;
            complex double r1 = p1[j].s.r[i] + p1[j].s.i[i]*_Complex_I,
                           r2 = p2[j].s.r[i] + p2[j].s.i[i]*_Complex_I;
            job->phase[phas_idx] = r1+r2;
            if (ispair[itot])
              job->phase[phas_idx+1] = r1-r2;
            }
          }
        }
      }
    }
  else
    {
    for (int ith=0; ith<ulim-llim; ith+=nval)
      {
      Y(Tbuqu) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
      Y(Tbu) cth, sth;
      int skip=1;
      for (int i=0; i<nval; ++i)
        {
        int itot=i+ith;
        if (itot>=ulim-llim) itot=ulim-llim-1;
        if (mlim[itot]>=m) skip=0;
        cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
        }
      if (!skip)
        (job->type==SHARP_ALM2MAP) ?
          Z(calc_alm2map_spin)
            (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2) :
          Z(calc_alm2map_deriv1)
            (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
      for (int i=0; i<nval; ++i)
        {
        int itot=i+ith;
        if ((itot<ulim-llim)&&(mlim[itot]>=m))
          {
          for (int j=0; j<njobs; ++j)
            {
            int phas_idx = itot*job->s_th + mi*job->s_m + 4*j;
            complex double q1 = p1[j].s.qr[i] + p1[j].s.qi[i]*_Complex_I,
                           q2 = p2[j].s.qr[i] + p2[j].s.qi[i]*_Complex_I,
                           u1 = p1[j].s.ur[i] + p1[j].s.ui[i]*_Complex_I,
                           u2 = p2[j].s.ur[i] + p2[j].s.ui[i]*_Complex_I;
            job->phase[phas_idx] = q1+q2;
            job->phase[phas_idx+2] = u1+u2;
            if (ispair[itot])
              {
              dcmplx *phQ = &(job->phase[phas_idx+1]),
                     *phU = &(job->phase[phas_idx+3]);
              *phQ = q1-q2;
              *phU = u1-u2;
              if ((gen->mhi-gen->m+gen->s)&1)
                { *phQ=-(*phQ); *phU=-(*phU); }
              }
            }
          }
        }
      }
    }
  }
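/* inner_loop_m2a is the reverse direction: it reconstructs the symmetric and
   antisymmetric ring parts from job->phase (applying the same parity sign
   flip for spin jobs), feeds them to calc_map2alm(_spin), and in the scalar
   case finally reduces the per-l SIMD accumulators atmp into job->almtmp
   via horizontal sums. */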
NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
  const double *cth_, const double *sth_, int llim, int ulim,
  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
  {
  const int nval=nvec*VLEN;
  const int m = job->ainfo->mval[mi];

  if (gen->s==0)
    {
    Tv atmp[2*njobs*(gen->lmax+1)];
    memset (&atmp[2*njobs*m],0,2*njobs*(gen->lmax+1-m)*sizeof(Tv));
    for (int ith=0; ith<ulim-llim; ith+=nval)
      {
      Y(Tburi) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
      Y(Tbu) cth, sth;
      int skip=1;
      for (int i=0; i<nval; ++i)
        {
        int itot=i+ith;
        if (itot>=ulim-llim) itot=ulim-llim-1;
        if (mlim[itot]>=m) skip=0;
        cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
        if ((i+ith<ulim-llim)&&(mlim[itot]>=m))
          {
          for (int j=0; j<njobs; ++j)
            {
            int phas_idx = itot*job->s_th + mi*job->s_m + 2*j;
            dcmplx ph1=job->phase[phas_idx];
            dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
            p1[j].s.r[i]=creal(ph1+ph2); p1[j].s.i[i]=cimag(ph1+ph2);
            p2[j].s.r[i]=creal(ph1-ph2); p2[j].s.i[i]=cimag(ph1-ph2);
            }
          }
        }
      if (!skip)
        Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b, atmp NJ2);
      }
    int istart=m*njobs, istop=(gen->lmax+1)*njobs;
    for(; istart<istop-2; istart+=2)
      vhsum_cmplx_special(atmp[2*istart],atmp[2*istart+1],atmp[2*istart+2],
        atmp[2*istart+3],&(job->almtmp[istart]));
    for(; istart<istop; istart++)
      job->almtmp[istart]+=vhsum_cmplx(atmp[2*istart],atmp[2*istart+1]);
    }
  else
    {
    for (int ith=0; ith<ulim-llim; ith+=nval)
      {
      Y(Tbuqu) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
      Y(Tbu) cth, sth;
      int skip=1;
      for (int i=0; i<nval; ++i)
        {
        int itot=i+ith;
        if (itot>=ulim-llim) itot=ulim-llim-1;
        if (mlim[itot]>=m) skip=0;
        cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
        if ((i+ith<ulim-llim)&&(mlim[itot]>=m))
          {
          for (int j=0; j<njobs; ++j)
            {
            int phas_idx = itot*job->s_th + mi*job->s_m + 4*j;
            dcmplx p1Q=job->phase[phas_idx],
                   p1U=job->phase[phas_idx+2],
                   p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
                   p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
            if ((gen->mhi-gen->m+gen->s)&1)
              { p2Q=-p2Q; p2U=-p2U; }
            p1[j].s.qr[i]=creal(p1Q+p2Q); p1[j].s.qi[i]=cimag(p1Q+p2Q);
            p1[j].s.ur[i]=creal(p1U+p2U); p1[j].s.ui[i]=cimag(p1U+p2U);
            p2[j].s.qr[i]=creal(p1Q-p2Q); p2[j].s.qi[i]=cimag(p1Q-p2Q);
            p2[j].s.ur[i]=creal(p1U-p2U); p2[j].s.ui[i]=cimag(p1U-p2U);
            }
          }
        }
      if (!skip)
        Z(calc_map2alm_spin) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
      }
    }
  }
static void Z(inner_loop) (sharp_job *job, const int *ispair,
  const double *cth_, const double *sth_, int llim, int ulim,
  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
  {
  (job->type==SHARP_MAP2ALM) ?
    Z(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2) :
    Z(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2);
  }
void sharp_Ylmgen_prepare(sharp_Ylmgen_C *gen, int m)