LevelS SHT library 3.50
sharp_core_inc.c
/*
 * This file is part of libsharp.
 *
 * libsharp is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * libsharp is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with libsharp; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
 * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
 * (DLR).
 */

/*! \file sharp_core_inc.c
 * Type-dependent code for the computational core
 *
 * Copyright (C) 2012-2017 Max-Planck-Society
 * \author Martin Reinecke
 */

typedef struct
  { Tv v[nvec]; } Tb;

typedef union
  { Tb b; double s[VLEN*nvec]; } Y(Tbu);

typedef struct
  { Tb r, i; } Y(Tbri);

typedef struct
  { Tb qr, qi, ur, ui; } Y(Tbqu);

typedef struct
  { double r[VLEN*nvec], i[VLEN*nvec]; } Y(Tsri);

typedef struct
  { double qr[VLEN*nvec], qi[VLEN*nvec], ur[VLEN*nvec], ui[VLEN*nvec]; } Y(Tsqu);

typedef union
  { Y(Tbri) b; Y(Tsri) s; } Y(Tburi);

typedef union
  { Y(Tbqu) b; Y(Tsqu) s; } Y(Tbuqu);
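
/* Tb bundles nvec SIMD vectors of type Tv, so all routines in this file
   operate on VLEN*nvec doubles at a time.  Y(...) is a token-pasting macro
   supplied by the including translation unit (apparently appending a suffix
   derived from nvec), which lets this file be compiled repeatedly for
   different vector widths without name clashes.  The unions (Y(Tbu),
   Y(Tburi), Y(Tbuqu)) let the same storage be viewed either as SIMD vectors
   or as plain double arrays for per-element access. */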

static inline Tb Y(Tbconst)(double val)
  {
  Tv v=vload(val);
  Tb res;
  for (int i=0; i<nvec; ++i) res.v[i]=v;
  return res;
  }

static inline void Y(Tbmuleq1)(Tb * restrict a, double b)
  { Tv v=vload(b); for (int i=0; i<nvec; ++i) vmuleq(a->v[i],v); }

static inline Tb Y(Tbprod)(Tb a, Tb b)
  { Tb r; for (int i=0; i<nvec; ++i) r.v[i]=vmul(a.v[i],b.v[i]); return r; }

static inline void Y(Tbmuleq)(Tb * restrict a, Tb b)
  { for (int i=0; i<nvec; ++i) vmuleq(a->v[i],b.v[i]); }
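
/* The helpers above are element-wise wrappers around the SIMD primitives
   (vload, vmul, vmuleq, ...), which are defined elsewhere in libsharp
   (sharp_vecsupport.h in the released sources): broadcast a constant,
   multiply a block in place by a scalar, and form the lane-wise product of
   two blocks. */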

static void Y(Tbnormalize) (Tb * restrict val, Tb * restrict scale,
  double maxval)
  {
  const Tv vfmin=vload(sharp_fsmall*maxval), vfmax=vload(maxval);
  const Tv vfsmall=vload(sharp_fsmall), vfbig=vload(sharp_fbig);
  for (int i=0; i<nvec; ++i)
    {
    Tm mask = vgt(vabs(val->v[i]),vfmax);
    while (vanyTrue(mask))
      {
      vmuleq_mask(mask,val->v[i],vfsmall);
      vaddeq_mask(mask,scale->v[i],vone);
      mask = vgt(vabs(val->v[i]),vfmax);
      }
    mask = vand_mask(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
    while (vanyTrue(mask))
      {
      vmuleq_mask(mask,val->v[i],vfbig);
      vsubeq_mask(mask,scale->v[i],vone);
      mask = vand_mask(vlt(vabs(val->v[i]),vfmin),vne(val->v[i],vzero));
      }
    }
  }
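
/* Tbnormalize() keeps every lane of |val| inside [sharp_fsmall*maxval,
   maxval] and records each shift in the companion exponent block *scale,
   which counts applied powers of sharp_fbig (sharp_fsmall acting as its
   reciprocal).  For illustration, a scalar analogue follows;
   "normalize_scalar" is a hypothetical name, not part of libsharp, and
   fabs() would require <math.h>. */
#if 0
static void normalize_scalar(double *val, double *scale, double maxval)
  {
  while (fabs(*val)>maxval)                              /* too big: shrink */
    { *val*=sharp_fsmall; *scale+=1.; }
  while ((fabs(*val)<sharp_fsmall*maxval) && (*val!=0.)) /* too small: grow */
    { *val*=sharp_fbig; *scale-=1.; }
  }
#endif
/* The SIMD version cannot branch per lane, so it applies the same updates
   under a mask and iterates until no lane remains out of range. */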

NOINLINE static void Y(mypow) (Tb val, int npow, const double * restrict powlimit,
  Tb * restrict resd, Tb * restrict ress)
  {
  Tv vminv=vload(powlimit[npow]);
  Tm mask = vlt(vabs(val.v[0]),vminv);
  for (int i=1; i<nvec; ++i)
    mask=vor_mask(mask,vlt(vabs(val.v[i]),vminv));
  if (!vanyTrue(mask)) // no underflows possible, use quick algorithm
    {
    Tb res=Y(Tbconst)(1.);
    do
      {
      if (npow&1)
        for (int i=0; i<nvec; ++i)
          {
          vmuleq(res.v[i],val.v[i]);
          vmuleq(val.v[i],val.v[i]);
          }
      else
        for (int i=0; i<nvec; ++i)
          vmuleq(val.v[i],val.v[i]);
      }
    while(npow>>=1);
    *resd=res;
    *ress=Y(Tbconst)(0.);
    }
  else
    {
    Tb scale=Y(Tbconst)(0.), scaleint=Y(Tbconst)(0.), res=Y(Tbconst)(1.);
    Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
    do
      {
      if (npow&1)
        {
        for (int i=0; i<nvec; ++i)
          {
          vmuleq(res.v[i],val.v[i]);
          vaddeq(scale.v[i],scaleint.v[i]);
          }
        Y(Tbnormalize)(&res,&scale,sharp_fbighalf);
        }
      for (int i=0; i<nvec; ++i)
        {
        vmuleq(val.v[i],val.v[i]);
        vaddeq(scaleint.v[i],scaleint.v[i]);
        }
      Y(Tbnormalize)(&val,&scaleint,sharp_fbighalf);
      }
    while(npow>>=1);
    *resd=res;
    *ress=scale;
    }
  }
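
/* mypow() computes val**npow by binary exponentiation.  The quick path is
   taken only when every lane satisfies |val| >= powlimit[npow], i.e. when no
   underflow can occur; otherwise each multiplication is followed by a
   renormalization so that e.g. sth**m stays representable even for very
   large m, with the result split into mantissa (*resd) and exponent (*ress)
   blocks.  A scalar sketch of the quick path, for nonnegative npow
   ("pow_scalar" is a hypothetical name, not part of libsharp): */
#if 0
static double pow_scalar(double val, int npow)
  {
  double res=1.;
  do
    {
    if (npow&1) res*=val; /* fold in the current exponent bit */
    val*=val;             /* square the base for the next bit */
    }
  while (npow>>=1);
  return res;
  }
#endif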

static inline int Y(rescale) (Tb * restrict lam1, Tb * restrict lam2,
  Tb * restrict scale)
  {
  int did_scale=0;
  for (int i=0; i<nvec; ++i)
    {
    Tm mask = vgt(vabs(lam2->v[i]),vload(sharp_ftol));
    if (vanyTrue(mask))
      {
      did_scale=1;
      vmuleq_mask(mask,lam1->v[i],vload(sharp_fsmall));
      vmuleq_mask(mask,lam2->v[i],vload(sharp_fsmall));
      vaddeq_mask(mask,scale->v[i],vone);
      }
    }
  return did_scale;
  }
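
/* rescale() is the per-iteration guard used by the recurrences below: every
   lane whose magnitude exceeds sharp_ftol is multiplied by sharp_fsmall in
   both recurrence buffers, and its exponent in *scale is incremented.  The
   return value tells the caller whether the (more expensive) limit check has
   to be redone. */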

static inline int Y(TballLt)(Tb a, double b)
  {
  Tv vb=vload(b);
  Tm res=vlt(a.v[0],vb);
  for (int i=1; i<nvec; ++i)
    res=vand_mask(res,vlt(a.v[i],vb));
  return vallTrue(res);
  }
static inline int Y(TballGt)(Tb a, double b)
  {
  Tv vb=vload(b);
  Tm res=vgt(a.v[0],vb);
  for (int i=1; i<nvec; ++i)
    res=vand_mask(res,vgt(a.v[i],vb));
  return vallTrue(res);
  }
static inline int Y(TballGe)(Tb a, double b)
  {
  Tv vb=vload(b);
  Tm res=vge(a.v[0],vb);
  for (int i=1; i<nvec; ++i)
    res=vand_mask(res,vge(a.v[i],vb));
  return vallTrue(res);
  }
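
/* TballLt/TballGt/TballGe are horizontal reductions: each returns nonzero
   only if the comparison with b holds in every lane of every vector of the
   block.  They are used below to test whether all scale exponents are still
   below sharp_limscale. */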

static void Y(getCorfac)(Tb scale, Tb * restrict corfac,
  const double * restrict cf)
  {
  Y(Tbu) sc, corf;
  sc.b=scale;
  for (int i=0; i<VLEN*nvec; ++i)
    corf.s[i] = (sc.s[i]<sharp_minscale) ?
      0. : cf[(int)(sc.s[i])-sharp_minscale];
  *corfac=corf.b;
  }
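
/* getCorfac() converts the integer-valued scale exponents back into ordinary
   correction factors via the precomputed table cf[]: lanes whose scale lies
   below sharp_minscale have underflowed and get 0., all others pick
   cf[scale-sharp_minscale].  The Y(Tbu) union makes the per-lane (scalar)
   table lookup possible. */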

NOINLINE static void Y(iter_to_ieee) (const Tb sth, Tb cth, int *l_,
  Tb * restrict lam_1_, Tb * restrict lam_2_, Tb * restrict scale_,
  const sharp_Ylmgen_C * restrict gen)
  {
  int l=gen->m;
  Tb lam_1=Y(Tbconst)(0.), lam_2, scale;
  Y(mypow) (sth,l,gen->powlimit,&lam_2,&scale);
  Y(Tbmuleq1) (&lam_2,(gen->m&1) ? -gen->mfac[gen->m] : gen->mfac[gen->m]);
  Y(Tbnormalize)(&lam_2,&scale,sharp_ftol);

  int below_limit = Y(TballLt)(scale,sharp_limscale);
  while (below_limit)
    {
    if (l+2>gen->lmax) {*l_=gen->lmax+1; return;}
    for (int i=0; i<nvec; ++i)
      lam_1.v[i] = vload(gen->rf[l].f[0])*(cth.v[i]*lam_2.v[i])
                 - vload(gen->rf[l].f[1])*lam_1.v[i];
    for (int i=0; i<nvec; ++i)
      lam_2.v[i] = vload(gen->rf[l+1].f[0])*(cth.v[i]*lam_1.v[i])
                 - vload(gen->rf[l+1].f[1])*lam_2.v[i];
    if (Y(rescale)(&lam_1,&lam_2,&scale))
      below_limit = Y(TballLt)(scale,sharp_limscale);
    l+=2;
    }
  *l_=l; *lam_1_=lam_1; *lam_2_=lam_2; *scale_=scale;
  }
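
/* iter_to_ieee() starts the spin-0 recurrence at l=m with
   lam(m,m) = mfac[m] * sth**m (sign-flipped for odd m) and climbs l in steps
   of two via

     lam(l+1) = rf[l].f[0] * cth * lam(l) - rf[l].f[1] * lam(l-1) ,

   staying in the scaled representation while every lane's exponent is below
   sharp_limscale.  The loop ends once some lane has grown enough to reach
   sharp_limscale, i.e. the values are back in IEEE range; if that never
   happens up to lmax, *l_ is set to lmax+1 so the caller can skip this
   chunk. */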

static inline void Y(rec_step) (Tb * restrict rxp, Tb * restrict rxm,
  Tb * restrict ryp, Tb * restrict rym, const Tb cth,
  const sharp_ylmgen_dbl3 fx)
  {
  Tv fx0=vload(fx.f[0]), fx1=vload(fx.f[1]), fx2=vload(fx.f[2]);
  for (int i=0; i<nvec; ++i)
    {
    rxp->v[i] = (cth.v[i]-fx1)*fx0*ryp->v[i] - fx2*rxp->v[i];
    rxm->v[i] = (cth.v[i]+fx1)*fx0*rym->v[i] - fx2*rxm->v[i];
    }
  }
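
/* rec_step() advances the spin-weighted recurrence by one l for the "plus"
   and "minus" components simultaneously:

     rxp <- fx0*(cth - fx1)*ryp - fx2*rxp
     rxm <- fx0*(cth + fx1)*rym - fx2*rxm

   with the coefficient triple fx taken from gen->fx for the current l. */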

static void Y(iter_to_ieee_spin) (const Tb cth, const Tb sth, int *l_,
  Tb * rec1p_, Tb * rec1m_, Tb * rec2p_, Tb * rec2m_,
  Tb * scalep_, Tb * scalem_, const sharp_Ylmgen_C * restrict gen)
  {
  const sharp_ylmgen_dbl3 * restrict fx = gen->fx;
  Tb cth2, sth2;
  for (int i=0; i<nvec; ++i)
    {
    cth2.v[i]=vsqrt(vmul(vadd(vone,cth.v[i]),vload(0.5)));
    cth2.v[i]=vmax(cth2.v[i],vload(1e-15));
    sth2.v[i]=vsqrt(vmul(vsub(vone,cth.v[i]),vload(0.5)));
    sth2.v[i]=vmax(sth2.v[i],vload(1e-15));
    Tm mask=vlt(sth.v[i],vzero);
    Tm cmask=vand_mask(mask,vlt(cth.v[i],vzero));
    vmuleq_mask(cmask,cth2.v[i],vload(-1.));
    Tm smask=vand_mask(mask,vgt(cth.v[i],vzero));
    vmuleq_mask(smask,sth2.v[i],vload(-1.));
    }

  Tb ccp, ccps, ssp, ssps, csp, csps, scp, scps;
  Y(mypow)(cth2,gen->cosPow,gen->powlimit,&ccp,&ccps);
  Y(mypow)(sth2,gen->sinPow,gen->powlimit,&ssp,&ssps);
  Y(mypow)(cth2,gen->sinPow,gen->powlimit,&csp,&csps);
  Y(mypow)(sth2,gen->cosPow,gen->powlimit,&scp,&scps);

  Tb rec2p, rec2m, scalep, scalem;
  Tb rec1p=Y(Tbconst)(0.), rec1m=Y(Tbconst)(0.);
  Tv prefac=vload(gen->prefac[gen->m]),
     prescale=vload(gen->fscale[gen->m]);
  for (int i=0; i<nvec; ++i)
    {
    rec2p.v[i]=vmul(prefac,ccp.v[i]);
    scalep.v[i]=vadd(prescale,ccps.v[i]);
    rec2m.v[i]=vmul(prefac,csp.v[i]);
    scalem.v[i]=vadd(prescale,csps.v[i]);
    }
  Y(Tbnormalize)(&rec2m,&scalem,sharp_fbighalf);
  Y(Tbnormalize)(&rec2p,&scalep,sharp_fbighalf);
  for (int i=0; i<nvec; ++i)
    {
    rec2p.v[i]=vmul(rec2p.v[i],ssp.v[i]);
    scalep.v[i]=vadd(scalep.v[i],ssps.v[i]);
    rec2m.v[i]=vmul(rec2m.v[i],scp.v[i]);
    scalem.v[i]=vadd(scalem.v[i],scps.v[i]);
    if (gen->preMinus_p)
      rec2p.v[i]=vneg(rec2p.v[i]);
    if (gen->preMinus_m)
      rec2m.v[i]=vneg(rec2m.v[i]);
    if (gen->s&1)
      rec2p.v[i]=vneg(rec2p.v[i]);
    }
  Y(Tbnormalize)(&rec2m,&scalem,sharp_ftol);
  Y(Tbnormalize)(&rec2p,&scalep,sharp_ftol);

  int l=gen->mhi;

  int below_limit = Y(TballLt)(scalep,sharp_limscale)
                 && Y(TballLt)(scalem,sharp_limscale);
  while (below_limit)
    {
    if (l+2>gen->lmax) {*l_=gen->lmax+1; return;}
    Y(rec_step)(&rec1p,&rec1m,&rec2p,&rec2m,cth,fx[l+1]);
    Y(rec_step)(&rec2p,&rec2m,&rec1p,&rec1m,cth,fx[l+2]);
    if (Y(rescale)(&rec1p,&rec2p,&scalep) | Y(rescale)(&rec1m,&rec2m,&scalem))
      below_limit = Y(TballLt)(scalep,sharp_limscale)
                 && Y(TballLt)(scalem,sharp_limscale);
    l+=2;
    }

  *l_=l;
  *rec1p_=rec1p; *rec2p_=rec2p; *scalep_=scalep;
  *rec1m_=rec1m; *rec2m_=rec2m; *scalem_=scalem;
  }
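
/* iter_to_ieee_spin() is the spin-weighted analogue of iter_to_ieee(): it
   assembles the l=mhi seed values from half-angle powers
   (cth2=|cos(theta/2)|, sth2=|sin(theta/2)|, raised to cosPow/sinPow with
   mypow() in the scaled representation, signs fixed up via preMinus_p/m and
   the parity of the spin s), then applies rec_step() in pairs until a lane
   of either scale block reaches sharp_limscale, finally handing the last two
   recurrence values and their exponents to the main loop. */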
