LevelS SHT library  3.50
sharp_core_inc2.c
Go to the documentation of this file.
1 /*
2  * This file is part of libsharp.
3  *
4  * libsharp is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * libsharp is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with libsharp; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 
19 /*
20  * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
21  * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
22  * (DLR).
23  */
24 
25 /*! \file sharp_core_inc2.c
26  * Type-dependent code for the computational core
27  *
28  * Copyright (C) 2012-2017 Max-Planck-Society
29  * \author Martin Reinecke
30  */
31 
32 NOINLINE static void Z(alm2map_kernel) (const Tb cth, Y(Tbri) * restrict p1,
33  Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
34  const sharp_ylmgen_dbl2 * restrict rf, const dcmplx * restrict alm,
35  int l, int lmax NJ1)
36  {
37 if (njobs>1)
38  {
39  while (l<lmax-2)
40  {
41  Tb lam_3, lam_4;
42  Tv r0=vload(rf[l].f[0]),r1=vload(rf[l].f[1]);
43  for (int i=0; i<nvec; ++i)
44  lam_3.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
45  r0=vload(rf[l+1].f[0]);r1=vload(rf[l+1].f[1]);
46  for (int i=0; i<nvec; ++i)
47  lam_4.v[i] = vsub(vmul(vmul(cth.v[i],lam_3.v[i]),r0),vmul(lam_2.v[i],r1));
48  r0=vload(rf[l+2].f[0]);r1=vload(rf[l+2].f[1]);
49  for (int i=0; i<nvec; ++i)
50  lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_4.v[i]),r0),vmul(lam_3.v[i],r1));
51  for (int j=0; j<njobs; ++j)
52  {
53  Tv ar2=vload(creal(alm[njobs*l+j])),
54  ai2=vload(cimag(alm[njobs*l+j])),
55  ar4=vload(creal(alm[njobs*(l+2)+j])),
56  ai4=vload(cimag(alm[njobs*(l+2)+j]));
57  for (int i=0; i<nvec; ++i)
58  {
59  vfmaaeq(p1[j].r.v[i],lam_2.v[i],ar2,lam_4.v[i],ar4);
60  vfmaaeq(p1[j].i.v[i],lam_2.v[i],ai2,lam_4.v[i],ai4);
61  }
62  Tv ar3=vload(creal(alm[njobs*(l+1)+j])),
63  ai3=vload(cimag(alm[njobs*(l+1)+j])),
64  ar1=vload(creal(alm[njobs*(l+3)+j])),
65  ai1=vload(cimag(alm[njobs*(l+3)+j]));
66  for (int i=0; i<nvec; ++i)
67  {
68  vfmaaeq(p2[j].r.v[i],lam_3.v[i],ar3,lam_1.v[i],ar1);
69  vfmaaeq(p2[j].i.v[i],lam_3.v[i],ai3,lam_1.v[i],ai1);
70  }
71  }
72  r0=vload(rf[l+3].f[0]);r1=vload(rf[l+3].f[1]);
73  for (int i=0; i<nvec; ++i)
74  lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_4.v[i],r1));
75  l+=4;
76  }
77  }
78  while (l<lmax)
79  {
80  for (int i=0; i<nvec; ++i)
81  lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
82  - vload(rf[l].f[1])*lam_1.v[i];
83  for (int j=0; j<njobs; ++j)
84  {
85  Tv ar=vload(creal(alm[njobs*l+j])),
86  ai=vload(cimag(alm[njobs*l+j]));
87  for (int i=0; i<nvec; ++i)
88  {
89  p1[j].r.v[i] += lam_2.v[i]*ar;
90  p1[j].i.v[i] += lam_2.v[i]*ai;
91  }
92  }
93  for (int i=0; i<nvec; ++i)
94  lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
95  - vload(rf[l+1].f[1])*lam_2.v[i];
96  for (int j=0; j<njobs; ++j)
97  {
98  Tv ar=vload(creal(alm[njobs*(l+1)+j])),
99  ai=vload(cimag(alm[njobs*(l+1)+j]));
100  for (int i=0; i<nvec; ++i)
101  {
102  p2[j].r.v[i] += lam_1.v[i]*ar;
103  p2[j].i.v[i] += lam_1.v[i]*ai;
104  }
105  }
106  l+=2;
107  }
108  if (l==lmax)
109  {
110  for (int j=0; j<njobs; ++j)
111  {
112  Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
113  for (int i=0; i<nvec; ++i)
114  {
115  p1[j].r.v[i] += lam_2.v[i]*ar;
116  p1[j].i.v[i] += lam_2.v[i]*ai;
117  }
118  }
119  }
120  }
121 
122 NOINLINE static void Z(map2alm_kernel) (const Tb cth,
123  const Y(Tbri) * restrict p1, const Y(Tbri) * restrict p2, Tb lam_1, Tb lam_2,
124  const sharp_ylmgen_dbl2 * restrict rf, int l, int lmax, Tv *restrict atmp
125  NJ1)
126  {
127  while (l<lmax)
128  {
129  for (int i=0; i<nvec; ++i)
130  lam_1.v[i] = vload(rf[l].f[0])*(cth.v[i]*lam_2.v[i])
131  - vload(rf[l].f[1])*lam_1.v[i];
132  for (int j=0; j<njobs; ++j)
133  for (int i=0; i<nvec; ++i)
134  {
135  atmp[2*(l*njobs+j)]+=lam_2.v[i]*p1[j].r.v[i];
136  atmp[2*(l*njobs+j)+1]+=lam_2.v[i]*p1[j].i.v[i];
137  }
138  for (int i=0; i<nvec; ++i)
139  lam_2.v[i] = vload(rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
140  - vload(rf[l+1].f[1])*lam_2.v[i];
141  for (int j=0; j<njobs; ++j)
142  for (int i=0; i<nvec; ++i)
143  {
144  atmp[2*((l+1)*njobs+j)]+=lam_1.v[i]*p2[j].r.v[i];
145  atmp[2*((l+1)*njobs+j)+1]+=lam_1.v[i]*p2[j].i.v[i];
146  }
147  l+=2;
148  }
149  if (l==lmax)
150  {
151  for (int j=0; j<njobs; ++j)
152  for (int i=0; i<nvec; ++i)
153  {
154  atmp[2*(l*njobs+j)] += lam_2.v[i]*p1[j].r.v[i];
155  atmp[2*(l*njobs+j)+1] += lam_2.v[i]*p1[j].i.v[i];
156  }
157  }
158  }
159 
160 NOINLINE static void Z(calc_alm2map) (const Tb cth, const Tb sth,
161  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbri) * restrict p1,
162  Y(Tbri) * restrict p2 NJ1)
163  {
164  int l,lmax=gen->lmax;
165  Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
166  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
167  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
168  if (l>lmax) return;
169  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
170 
171  Tb corfac;
172  Y(getCorfac)(scale,&corfac,gen->cf);
173  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
174  const dcmplx * restrict alm=job->almtmp;
175  int full_ieee = Y(TballGe)(scale,sharp_minscale);
176  while (!full_ieee)
177  {
178  for (int j=0; j<njobs; ++j)
179  {
180  Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
181  for (int i=0; i<nvec; ++i)
182  {
183  Tv tmp=vmul(lam_2.v[i],corfac.v[i]);
184  vfmaeq(p1[j].r.v[i],tmp,ar);
185  vfmaeq(p1[j].i.v[i],tmp,ai);
186  }
187  }
188  if (++l>lmax) break;
189  Tv r0=vload(rf[l-1].f[0]),r1=vload(rf[l-1].f[1]);
190  for (int i=0; i<nvec; ++i)
191  lam_1.v[i] = vsub(vmul(vmul(cth.v[i],lam_2.v[i]),r0),vmul(lam_1.v[i],r1));
192  for (int j=0; j<njobs; ++j)
193  {
194  Tv ar=vload(creal(alm[njobs*l+j])),ai=vload(cimag(alm[njobs*l+j]));
195  for (int i=0; i<nvec; ++i)
196  {
197  Tv tmp=vmul(lam_1.v[i],corfac.v[i]);
198  vfmaeq(p2[j].r.v[i],tmp,ar);
199  vfmaeq(p2[j].i.v[i],tmp,ai);
200  }
201  }
202  if (++l>lmax) break;
203  r0=vload(rf[l-1].f[0]); r1=vload(rf[l-1].f[1]);
204  for (int i=0; i<nvec; ++i)
205  lam_2.v[i] = vsub(vmul(vmul(cth.v[i],lam_1.v[i]),r0),vmul(lam_2.v[i],r1));
206  if (Y(rescale)(&lam_1,&lam_2,&scale))
207  {
208  Y(getCorfac)(scale,&corfac,gen->cf);
209  full_ieee = Y(TballGe)(scale,sharp_minscale);
210  }
211  }
212  if (l>lmax) return;
213 
214  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
215  Z(alm2map_kernel) (cth, p1, p2, lam_1, lam_2, rf, alm, l, lmax NJ2);
216  }
217 
218 NOINLINE static void Z(calc_map2alm) (const Tb cth, const Tb sth,
219  const sharp_Ylmgen_C *gen, sharp_job *job, const Y(Tbri) * restrict p1,
220  const Y(Tbri) * restrict p2, Tv *restrict atmp NJ1)
221  {
222  int lmax=gen->lmax;
223  Tb lam_1=Y(Tbconst)(0.),lam_2=Y(Tbconst)(0.),scale;
224  int l=gen->m;
225  Y(iter_to_ieee) (sth,cth,&l,&lam_1,&lam_2,&scale,gen);
226  job->opcnt += (l-gen->m) * 4*VLEN*nvec;
227  if (l>lmax) return;
228  job->opcnt += (lmax+1-l) * (4+4*njobs)*VLEN*nvec;
229 
230  const sharp_ylmgen_dbl2 * restrict rf = gen->rf;
231  Tb corfac;
232  Y(getCorfac)(scale,&corfac,gen->cf);
233  int full_ieee = Y(TballGe)(scale,sharp_minscale);
234  while (!full_ieee)
235  {
236  for (int j=0; j<njobs; ++j)
237  for (int i=0; i<nvec; ++i)
238  {
239  Tv tmp=lam_2.v[i]*corfac.v[i];
240  atmp[2*(l*njobs+j)]+=tmp*p1[j].r.v[i];
241  atmp[2*(l*njobs+j)+1]+=tmp*p1[j].i.v[i];
242  }
243  if (++l>lmax) return;
244  for (int i=0; i<nvec; ++i)
245  lam_1.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_2.v[i])
246  - vload(rf[l-1].f[1])*lam_1.v[i];
247  for (int j=0; j<njobs; ++j)
248  for (int i=0; i<nvec; ++i)
249  {
250  Tv tmp=lam_1.v[i]*corfac.v[i];
251  atmp[2*(l*njobs+j)]+=tmp*p2[j].r.v[i];
252  atmp[2*(l*njobs+j)+1]+=tmp*p2[j].i.v[i];
253  }
254  if (++l>lmax) return;
255  for (int i=0; i<nvec; ++i)
256  lam_2.v[i] = vload(rf[l-1].f[0])*(cth.v[i]*lam_1.v[i])
257  - vload(rf[l-1].f[1])*lam_2.v[i];
258  if (Y(rescale)(&lam_1,&lam_2,&scale))
259  {
260  Y(getCorfac)(scale,&corfac,gen->cf);
261  full_ieee = Y(TballGe)(scale,sharp_minscale);
262  }
263  }
264 
265  Y(Tbmuleq)(&lam_1,corfac); Y(Tbmuleq)(&lam_2,corfac);
266  Z(map2alm_kernel) (cth, p1, p2, lam_1, lam_2, rf, l, lmax, atmp NJ2);
267  }
268 
269 static inline void Z(saddstep) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
270  const Tb rxp, const Tb rxm, const dcmplx * restrict alm NJ1)
271  {
272  for (int j=0; j<njobs; ++j)
273  {
274  Tv agr=vload(creal(alm[2*j])), agi=vload(cimag(alm[2*j])),
275  acr=vload(creal(alm[2*j+1])), aci=vload(cimag(alm[2*j+1]));
276  for (int i=0; i<nvec; ++i)
277  {
278  Tv lw=vadd(rxp.v[i],rxm.v[i]);
279  vfmaeq(px[j].qr.v[i],agr,lw);
280  vfmaeq(px[j].qi.v[i],agi,lw);
281  vfmaeq(px[j].ur.v[i],acr,lw);
282  vfmaeq(px[j].ui.v[i],aci,lw);
283  }
284  for (int i=0; i<nvec; ++i)
285  {
286  Tv lx=vsub(rxm.v[i],rxp.v[i]);
287  vfmseq(py[j].qr.v[i],aci,lx);
288  vfmaeq(py[j].qi.v[i],acr,lx);
289  vfmaeq(py[j].ur.v[i],agi,lx);
290  vfmseq(py[j].ui.v[i],agr,lx);
291  }
292  }
293  }
294 
295 static inline void Z(saddstepb) (Y(Tbqu) * restrict p1, Y(Tbqu) * restrict p2,
296  const Tb r1p, const Tb r1m, const Tb r2p, const Tb r2m,
297  const dcmplx * restrict alm1, const dcmplx * restrict alm2 NJ1)
298  {
299  for (int j=0; j<njobs; ++j)
300  {
301  Tv agr1=vload(creal(alm1[2*j])), agi1=vload(cimag(alm1[2*j])),
302  acr1=vload(creal(alm1[2*j+1])), aci1=vload(cimag(alm1[2*j+1]));
303  Tv agr2=vload(creal(alm2[2*j])), agi2=vload(cimag(alm2[2*j])),
304  acr2=vload(creal(alm2[2*j+1])), aci2=vload(cimag(alm2[2*j+1]));
305  for (int i=0; i<nvec; ++i)
306  {
307  Tv lw1=r2p.v[i]+r2m.v[i];
308  Tv lx2=r1m.v[i]-r1p.v[i];
309  vfmaseq(p1[j].qr.v[i],agr1,lw1,aci2,lx2);
310  vfmaaeq(p1[j].qi.v[i],agi1,lw1,acr2,lx2);
311  vfmaaeq(p1[j].ur.v[i],acr1,lw1,agi2,lx2);
312  vfmaseq(p1[j].ui.v[i],aci1,lw1,agr2,lx2);
313  }
314  for (int i=0; i<nvec; ++i)
315  {
316  Tv lx1=r2m.v[i]-r2p.v[i];
317  Tv lw2=r1p.v[i]+r1m.v[i];
318  vfmaseq(p2[j].qr.v[i],agr2,lw2,aci1,lx1);
319  vfmaaeq(p2[j].qi.v[i],agi2,lw2,acr1,lx1);
320  vfmaaeq(p2[j].ur.v[i],acr2,lw2,agi1,lx1);
321  vfmaseq(p2[j].ui.v[i],aci2,lw2,agr1,lx1);
322  }
323  }
324  }
325 
326 static inline void Z(saddstep2) (const Y(Tbqu) * restrict px,
327  const Y(Tbqu) * restrict py, const Tb * restrict rxp,
328  const Tb * restrict rxm, dcmplx * restrict alm NJ1)
329  {
330  for (int j=0; j<njobs; ++j)
331  {
332  Tv agr=vzero, agi=vzero, acr=vzero, aci=vzero;
333  for (int i=0; i<nvec; ++i)
334  {
335  Tv lw=vadd(rxp->v[i],rxm->v[i]);
336  vfmaeq(agr,px[j].qr.v[i],lw);
337  vfmaeq(agi,px[j].qi.v[i],lw);
338  vfmaeq(acr,px[j].ur.v[i],lw);
339  vfmaeq(aci,px[j].ui.v[i],lw);
340  }
341  for (int i=0; i<nvec; ++i)
342  {
343  Tv lx=vsub(rxm->v[i],rxp->v[i]);
344  vfmseq(agr,py[j].ui.v[i],lx);
345  vfmaeq(agi,py[j].ur.v[i],lx);
346  vfmaeq(acr,py[j].qi.v[i],lx);
347  vfmseq(aci,py[j].qr.v[i],lx);
348  }
349  vhsum_cmplx_special(agr,agi,acr,aci,&alm[2*j]);
350  }
351  }
352 
353 NOINLINE static void Z(alm2map_spin_kernel) (Tb cth, Y(Tbqu) * restrict p1,
354  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
355  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
356  int lmax NJ1)
357  {
358  while (l<lmax)
359  {
360  Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
361  fx2=vload(fx[l+1].f[2]);
362  for (int i=0; i<nvec; ++i)
363  {
364  rec1p.v[i] = (cth.v[i]-fx1)*fx0*rec2p.v[i] - fx2*rec1p.v[i];
365  rec1m.v[i] = (cth.v[i]+fx1)*fx0*rec2m.v[i] - fx2*rec1m.v[i];
366  }
367  Z(saddstepb)(p1,p2,rec1p,rec1m,rec2p,rec2m,&alm[2*njobs*l],
368  &alm[2*njobs*(l+1)] NJ2);
369  fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
370  fx2=vload(fx[l+2].f[2]);
371  for (int i=0; i<nvec; ++i)
372  {
373  rec2p.v[i] = (cth.v[i]-fx1)*fx0*rec1p.v[i] - fx2*rec2p.v[i];
374  rec2m.v[i] = (cth.v[i]+fx1)*fx0*rec1m.v[i] - fx2*rec2m.v[i];
375  }
376  l+=2;
377  }
378  if (l==lmax)
379  Z(saddstep)(p1, p2, rec2p, rec2m, &alm[2*njobs*l] NJ2);
380  }
381 
382 NOINLINE static void Z(map2alm_spin_kernel) (Tb cth, const Y(Tbqu) * restrict p1,
383  const Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
384  const sharp_ylmgen_dbl3 * restrict fx, dcmplx * restrict alm, int l, int lmax
385  NJ1)
386  {
387  while (l<lmax)
388  {
389  Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
390  fx2=vload(fx[l+1].f[2]);
391  for (int i=0; i<nvec; ++i)
392  {
393  rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
394  vmul(fx2,rec1p.v[i]));
395  rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
396  vmul(fx2,rec1m.v[i]));
397  }
398  Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
399  Z(saddstep2)(p2, p1, &rec1p, &rec1m, &alm[2*njobs*(l+1)] NJ2);
400  fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
401  fx2=vload(fx[l+2].f[2]);
402  for (int i=0; i<nvec; ++i)
403  {
404  rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
405  vmul(fx2,rec2p.v[i]));
406  rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
407  vmul(fx2,rec2m.v[i]));
408  }
409  l+=2;
410  }
411  if (l==lmax)
412  Z(saddstep2)(p1, p2, &rec2p, &rec2m, &alm[2*njobs*l] NJ2);
413  }
414 
415 NOINLINE static void Z(calc_alm2map_spin) (const Tb cth, const Tb sth,
416  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
417  Y(Tbqu) * restrict p2 NJ1)
418  {
419  int l, lmax=gen->lmax;
420  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
421  Y(iter_to_ieee_spin)
422  (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
423  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
424  if (l>lmax) return;
425  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
426 
427  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
428  Tb corfacp,corfacm;
429  Y(getCorfac)(scalep,&corfacp,gen->cf);
430  Y(getCorfac)(scalem,&corfacm,gen->cf);
431  const dcmplx * restrict alm=job->almtmp;
432  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
433  && Y(TballGe)(scalem,sharp_minscale);
434  while (!full_ieee)
435  {
436  Z(saddstep)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
437  &alm[2*njobs*l] NJ2);
438  if (++l>lmax) break;
439  Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
440  Z(saddstep)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
441  &alm[2*njobs*l] NJ2);
442  if (++l>lmax) break;
443  Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
444  if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
445  {
446  Y(getCorfac)(scalep,&corfacp,gen->cf);
447  Y(getCorfac)(scalem,&corfacm,gen->cf);
448  full_ieee = Y(TballGe)(scalep,sharp_minscale)
449  && Y(TballGe)(scalem,sharp_minscale);
450  }
451  }
452 
453  if (l>lmax) return;
454 
455  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
456  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
457  Z(alm2map_spin_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
458  lmax NJ2);
459  }
460 
461 NOINLINE static void Z(calc_map2alm_spin) (Tb cth, Tb sth,
462  const sharp_Ylmgen_C * restrict gen, sharp_job *job,
463  const Y(Tbqu) * restrict p1, const Y(Tbqu) * restrict p2 NJ1)
464  {
465  int l, lmax=gen->lmax;
466  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
467  Y(iter_to_ieee_spin)
468  (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
469  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
470  if (l>lmax) return;
471  job->opcnt += (lmax+1-l) * (12+16*njobs)*VLEN*nvec;
472 
473  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
474  Tb corfacp,corfacm;
475  Y(getCorfac)(scalep,&corfacp,gen->cf);
476  Y(getCorfac)(scalem,&corfacm,gen->cf);
477  dcmplx * restrict alm=job->almtmp;
478  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
479  && Y(TballGe)(scalem,sharp_minscale);
480  while (!full_ieee)
481  {
482  Tb t1=Y(Tbprod)(rec2p,corfacp), t2=Y(Tbprod)(rec2m,corfacm);
483  Z(saddstep2)(p1, p2, &t1, &t2, &alm[2*njobs*l] NJ2);
484  if (++l>lmax) return;
485  Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
486  t1=Y(Tbprod)(rec1p,corfacp); t2=Y(Tbprod)(rec1m,corfacm);
487  Z(saddstep2)(p2, p1, &t1, &t2, &alm[2*njobs*l] NJ2);
488  if (++l>lmax) return;
489  Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
490  if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
491  {
492  Y(getCorfac)(scalep,&corfacp,gen->cf);
493  Y(getCorfac)(scalem,&corfacm,gen->cf);
494  full_ieee = Y(TballGe)(scalep,sharp_minscale)
495  && Y(TballGe)(scalem,sharp_minscale);
496  }
497  }
498 
499  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
500  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
501  Z(map2alm_spin_kernel)(cth,p1,p2,rec1p,rec1m,rec2p,rec2m,fx,alm,l,lmax NJ2);
502  }
503 
504 static inline void Z(saddstep_d) (Y(Tbqu) * restrict px, Y(Tbqu) * restrict py,
505  const Tb rxp, const Tb rxm, const dcmplx * restrict alm NJ1)
506  {
507  for (int j=0; j<njobs; ++j)
508  {
509  Tv ar=vload(creal(alm[j])), ai=vload(cimag(alm[j]));
510  for (int i=0; i<nvec; ++i)
511  {
512  Tv lw=vadd(rxp.v[i],rxm.v[i]);
513  vfmaeq(px[j].qr.v[i],ar,lw);
514  vfmaeq(px[j].qi.v[i],ai,lw);
515  }
516  for (int i=0; i<nvec; ++i)
517  {
518  Tv lx=vsub(rxm.v[i],rxp.v[i]);
519  vfmaeq(py[j].ur.v[i],ai,lx);
520  vfmseq(py[j].ui.v[i],ar,lx);
521  }
522  }
523  }
524 
525 NOINLINE static void Z(alm2map_deriv1_kernel) (Tb cth, Y(Tbqu) * restrict p1,
526  Y(Tbqu) * restrict p2, Tb rec1p, Tb rec1m, Tb rec2p, Tb rec2m,
527  const sharp_ylmgen_dbl3 * restrict fx, const dcmplx * restrict alm, int l,
528  int lmax NJ1)
529  {
530  while (l<lmax)
531  {
532  Tv fx0=vload(fx[l+1].f[0]),fx1=vload(fx[l+1].f[1]),
533  fx2=vload(fx[l+1].f[2]);
534  for (int i=0; i<nvec; ++i)
535  {
536  rec1p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec2p.v[i])),
537  vmul(fx2,rec1p.v[i]));
538  rec1m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec2m.v[i])),
539  vmul(fx2,rec1m.v[i]));
540  }
541  Z(saddstep_d)(p1,p2,rec2p,rec2m,&alm[njobs*l] NJ2);
542  Z(saddstep_d)(p2,p1,rec1p,rec1m,&alm[njobs*(l+1)] NJ2);
543  fx0=vload(fx[l+2].f[0]);fx1=vload(fx[l+2].f[1]);
544  fx2=vload(fx[l+2].f[2]);
545  for (int i=0; i<nvec; ++i)
546  {
547  rec2p.v[i] = vsub(vmul(vsub(cth.v[i],fx1),vmul(fx0,rec1p.v[i])),
548  vmul(fx2,rec2p.v[i]));
549  rec2m.v[i] = vsub(vmul(vadd(cth.v[i],fx1),vmul(fx0,rec1m.v[i])),
550  vmul(fx2,rec2m.v[i]));
551  }
552  l+=2;
553  }
554  if (l==lmax)
555  Z(saddstep_d)(p1, p2, rec2p, rec2m, &alm[njobs*l] NJ2);
556  }
557 
558 NOINLINE static void Z(calc_alm2map_deriv1) (const Tb cth, const Tb sth,
559  const sharp_Ylmgen_C *gen, sharp_job *job, Y(Tbqu) * restrict p1,
560  Y(Tbqu) * restrict p2 NJ1)
561  {
562  int l, lmax=gen->lmax;
563  Tb rec1p, rec1m, rec2p, rec2m, scalem, scalep;
564  Y(iter_to_ieee_spin)
565  (cth,sth,&l,&rec1p,&rec1m,&rec2p,&rec2m,&scalep,&scalem,gen);
566  job->opcnt += (l-gen->m) * 10*VLEN*nvec;
567  if (l>lmax) return;
568  job->opcnt += (lmax+1-l) * (12+8*njobs)*VLEN*nvec;
569 
570  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
571  Tb corfacp,corfacm;
572  Y(getCorfac)(scalep,&corfacp,gen->cf);
573  Y(getCorfac)(scalem,&corfacm,gen->cf);
574  const dcmplx * restrict alm=job->almtmp;
575  int full_ieee = Y(TballGe)(scalep,sharp_minscale)
576  && Y(TballGe)(scalem,sharp_minscale);
577  while (!full_ieee)
578  {
579  Z(saddstep_d)(p1, p2, Y(Tbprod)(rec2p,corfacp), Y(Tbprod)(rec2m,corfacm),
580  &alm[njobs*l] NJ2);
581  if (++l>lmax) break;
582  Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l]);
583  Z(saddstep_d)(p2, p1, Y(Tbprod)(rec1p,corfacp), Y(Tbprod)(rec1m,corfacm),
584  &alm[njobs*l] NJ2);
585  if (++l>lmax) break;
586  Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l]);
587  if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
588  {
589  Y(getCorfac)(scalep,&corfacp,gen->cf);
590  Y(getCorfac)(scalem,&corfacm,gen->cf);
591  full_ieee = Y(TballGe)(scalep,sharp_minscale)
592  && Y(TballGe)(scalem,sharp_minscale);
593  }
594  }
595 
596  if (l>lmax) return;
597 
598  Y(Tbmuleq)(&rec1p,corfacp); Y(Tbmuleq)(&rec2p,corfacp);
599  Y(Tbmuleq)(&rec1m,corfacm); Y(Tbmuleq)(&rec2m,corfacm);
600  Z(alm2map_deriv1_kernel) (cth, p1, p2, rec1p, rec1m, rec2p, rec2m, fx, alm, l,
601  lmax NJ2);
602  }
603 
604 
605 #define VZERO(var) do { memset(&(var),0,sizeof(var)); } while(0)
606 
607 NOINLINE static void Z(inner_loop_a2m) (sharp_job *job, const int *ispair,
608  const double *cth_, const double *sth_, int llim, int ulim,
609  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
610  {
611  const int nval=nvec*VLEN;
612  const int m = job->ainfo->mval[mi];
613  sharp_Ylmgen_prepare (gen, m);
614 
615  switch (job->type)
616  {
617  case SHARP_ALM2MAP:
619  {
620  if (job->spin==0)
621  {
622  for (int ith=0; ith<ulim-llim; ith+=nval)
623  {
624  Y(Tburi) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
625  Y(Tbu) cth, sth;
626 
627  int skip=1;
628  for (int i=0; i<nval; ++i)
629  {
630  int itot=i+ith;
631  if (itot>=ulim-llim) itot=ulim-llim-1;
632  if (mlim[itot]>=m) skip=0;
633  cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
634  }
635  if (!skip)
636  Z(calc_alm2map) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
637 
638  for (int i=0; i<nval; ++i)
639  {
640  int itot=i+ith;
641  if (itot<ulim-llim)
642  {
643  for (int j=0; j<njobs; ++j)
644  {
645  int phas_idx = itot*job->s_th + mi*job->s_m + 2*j;
646  complex double r1 = p1[j].s.r[i] + p1[j].s.i[i]*_Complex_I,
647  r2 = p2[j].s.r[i] + p2[j].s.i[i]*_Complex_I;
648  job->phase[phas_idx] = r1+r2;
649  if (ispair[itot])
650  job->phase[phas_idx+1] = r1-r2;
651  }
652  }
653  }
654  }
655  }
656  else
657  {
658  for (int ith=0; ith<ulim-llim; ith+=nval)
659  {
660  Y(Tbuqu) p1[njobs],p2[njobs]; VZERO(p1); VZERO(p2);
661  Y(Tbu) cth, sth;
662  int skip=1;
663 
664  for (int i=0; i<nval; ++i)
665  {
666  int itot=i+ith;
667  if (itot>=ulim-llim) itot=ulim-llim-1;
668  if (mlim[itot]>=m) skip=0;
669  cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
670  }
671  if (!skip)
672  (job->type==SHARP_ALM2MAP) ?
673  Z(calc_alm2map_spin )
674  (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2) :
675  Z(calc_alm2map_deriv1)
676  (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
677 
678  for (int i=0; i<nval; ++i)
679  {
680  int itot=i+ith;
681  if (itot<ulim-llim)
682  {
683  for (int j=0; j<njobs; ++j)
684  {
685  int phas_idx = itot*job->s_th + mi*job->s_m + 4*j;
686  complex double q1 = p1[j].s.qr[i] + p1[j].s.qi[i]*_Complex_I,
687  q2 = p2[j].s.qr[i] + p2[j].s.qi[i]*_Complex_I,
688  u1 = p1[j].s.ur[i] + p1[j].s.ui[i]*_Complex_I,
689  u2 = p2[j].s.ur[i] + p2[j].s.ui[i]*_Complex_I;
690  job->phase[phas_idx] = q1+q2;
691  job->phase[phas_idx+2] = u1+u2;
692  if (ispair[itot])
693  {
694  dcmplx *phQ = &(job->phase[phas_idx+1]),
695  *phU = &(job->phase[phas_idx+3]);
696  *phQ = q1-q2;
697  *phU = u1-u2;
698  if ((gen->mhi-gen->m+gen->s)&1)
699  { *phQ=-(*phQ); *phU=-(*phU); }
700  }
701  }
702  }
703  }
704  }
705  }
706  break;
707  }
708  default:
709  {
710  UTIL_FAIL("must not happen");
711  break;
712  }
713  }
714  }
715 
716 NOINLINE static void Z(inner_loop_m2a) (sharp_job *job, const int *ispair,
717  const double *cth_, const double *sth_, int llim, int ulim,
718  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
719  {
720  const int nval=nvec*VLEN;
721  const int m = job->ainfo->mval[mi];
722  sharp_Ylmgen_prepare (gen, m);
723 
724  switch (job->type)
725  {
726  case SHARP_MAP2ALM:
727  {
728  if (job->spin==0)
729  {
730  Tv atmp[2*njobs*(gen->lmax+1)];
731  memset (&atmp[2*njobs*m],0,2*njobs*(gen->lmax+1-m)*sizeof(Tv));
732  for (int ith=0; ith<ulim-llim; ith+=nval)
733  {
734  Y(Tburi) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
735  Y(Tbu) cth, sth;
736  int skip=1;
737 
738  for (int i=0; i<nval; ++i)
739  {
740  int itot=i+ith;
741  if (itot>=ulim-llim) itot=ulim-llim-1;
742  if (mlim[itot]>=m) skip=0;
743  cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
744  if ((i+ith<ulim-llim)&&(mlim[itot]>=m))
745  {
746  for (int j=0; j<njobs; ++j)
747  {
748  int phas_idx = itot*job->s_th + mi*job->s_m + 2*j;
749  dcmplx ph1=job->phase[phas_idx];
750  dcmplx ph2=ispair[itot] ? job->phase[phas_idx+1] : 0.;
751  p1[j].s.r[i]=creal(ph1+ph2); p1[j].s.i[i]=cimag(ph1+ph2);
752  p2[j].s.r[i]=creal(ph1-ph2); p2[j].s.i[i]=cimag(ph1-ph2);
753  }
754  }
755  }
756  if (!skip)
757  Z(calc_map2alm)(cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b, atmp NJ2);
758  }
759  {
760  int istart=m*njobs, istop=(gen->lmax+1)*njobs;
761  for(; istart<istop-2; istart+=2)
762  vhsum_cmplx_special(atmp[2*istart],atmp[2*istart+1],atmp[2*istart+2],atmp[2*istart+3],&(job->almtmp[istart]));
763  for(; istart<istop; istart++)
764  job->almtmp[istart]+=vhsum_cmplx(atmp[2*istart],atmp[2*istart+1]);
765  }
766  }
767  else
768  {
769  for (int ith=0; ith<ulim-llim; ith+=nval)
770  {
771  Y(Tbuqu) p1[njobs], p2[njobs]; VZERO(p1); VZERO(p2);
772  Y(Tbu) cth, sth;
773  int skip=1;
774 
775  for (int i=0; i<nval; ++i)
776  {
777  int itot=i+ith;
778  if (itot>=ulim-llim) itot=ulim-llim-1;
779  if (mlim[itot]>=m) skip=0;
780  cth.s[i]=cth_[itot]; sth.s[i]=sth_[itot];
781  if (i+ith<ulim-llim)
782  {
783  for (int j=0; j<njobs; ++j)
784  {
785  int phas_idx = itot*job->s_th + mi*job->s_m + 4*j;
786  dcmplx p1Q=job->phase[phas_idx],
787  p1U=job->phase[phas_idx+2],
788  p2Q=ispair[itot] ? job->phase[phas_idx+1]:0.,
789  p2U=ispair[itot] ? job->phase[phas_idx+3]:0.;
790  if ((gen->mhi-gen->m+gen->s)&1)
791  { p2Q=-p2Q; p2U=-p2U; }
792  p1[j].s.qr[i]=creal(p1Q+p2Q); p1[j].s.qi[i]=cimag(p1Q+p2Q);
793  p1[j].s.ur[i]=creal(p1U+p2U); p1[j].s.ui[i]=cimag(p1U+p2U);
794  p2[j].s.qr[i]=creal(p1Q-p2Q); p2[j].s.qi[i]=cimag(p1Q-p2Q);
795  p2[j].s.ur[i]=creal(p1U-p2U); p2[j].s.ui[i]=cimag(p1U-p2U);
796  }
797  }
798  }
799  if (!skip)
800  Z(calc_map2alm_spin) (cth.b,sth.b,gen,job,&p1[0].b,&p2[0].b NJ2);
801  }
802  }
803  break;
804  }
805  default:
806  {
807  UTIL_FAIL("must not happen");
808  break;
809  }
810  }
811  }
812 
813 static void Z(inner_loop) (sharp_job *job, const int *ispair,
814  const double *cth_, const double *sth_, int llim, int ulim,
815  sharp_Ylmgen_C *gen, int mi, const int *mlim NJ1)
816  {
817  (job->type==SHARP_MAP2ALM) ?
818  Z(inner_loop_m2a)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2) :
819  Z(inner_loop_a2m)(job,ispair,cth_,sth_,llim,ulim,gen,mi,mlim NJ2);
820  }
821 
822 #undef VZERO
#define UTIL_FAIL(msg)
void sharp_Ylmgen_prepare(sharp_Ylmgen_C *gen, int m)

Generated on Mon Dec 10 2018 10:24:20 for LevelS SHT library