LevelS SHT library  3.50
sharp.c
Go to the documentation of this file.
1 /*
2  * This file is part of libsharp.
3  *
4  * libsharp is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * libsharp is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with libsharp; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 
19 /*
20  * libsharp is being developed at the Max-Planck-Institut fuer Astrophysik
21  * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
22  * (DLR).
23  */
24 
25 /*! \file sharp.c
26  * Spherical transform library
27  *
28  * Copyright (C) 2006-2016 Max-Planck-Society
29  * \author Martin Reinecke \author Dag Sverre Seljebotn
30  */
31 
32 #include <math.h>
33 #include <string.h>
34 #include "pocketfft/pocketfft.h"
35 #include "sharp_ylmgen_c.h"
36 #include "sharp_internal.h"
37 #include "c_utils.h"
38 #include "sharp_core.h"
39 #include "sharp_vecutil.h"
40 #include "walltime_c.h"
41 #include "sharp_almhelpers.h"
42 #include "sharp_geomhelpers.h"
43 
44 typedef complex double dcmplx;
45 typedef complex float fcmplx;
46 
47 static const double sqrt_one_half = 0.707106781186547572737310929369;
48 static const double sqrt_two = 1.414213562373095145474621858739;
49 
50 static int chunksize_min=500, nchunks_max=10;
51 
52 static void get_chunk_info (int ndata, int nmult, int *nchunks, int *chunksize)
53  {
54  *chunksize = (ndata+nchunks_max-1)/nchunks_max;
55  if (*chunksize>=chunksize_min) // use max number of chunks
56  *chunksize = ((*chunksize+nmult-1)/nmult)*nmult;
57  else // need to adjust chunksize and nchunks
58  {
59  *nchunks = (ndata+chunksize_min-1)/chunksize_min;
60  *chunksize = (ndata+(*nchunks)-1)/(*nchunks);
61  if (*nchunks>1)
62  *chunksize = ((*chunksize+nmult-1)/nmult)*nmult;
63  }
64  *nchunks = (ndata+(*chunksize)-1)/(*chunksize);
65  }
66 
67 NOINLINE int sharp_get_mlim (int lmax, int spin, double sth, double cth)
68  {
69  double ofs=lmax*0.01;
70  if (ofs<100.) ofs=100.;
71  double b = -2*spin*fabs(cth);
72  double t1 = lmax*sth+ofs;
73  double c = (double)spin*spin-t1*t1;
74  double discr = b*b-4*c;
75  if (discr<=0) return lmax;
76  double res=(-b+sqrt(discr))/2.;
77  if (res>lmax) res=lmax;
78  return (int)(res+0.5);
79  }
80 
81 typedef struct
82  {
83  double phi0_;
84  dcmplx *shiftarr;
85  int s_shift;
86  rfft_plan plan;
87  int length;
88  int norot;
89  } ringhelper;
90 
91 static void ringhelper_init (ringhelper *self)
92  {
93  static ringhelper rh_null = { 0, NULL, 0, NULL, 0, 0 };
94  *self = rh_null;
95  }
96 
97 static void ringhelper_destroy (ringhelper *self)
98  {
99  if (self->plan) destroy_rfft_plan(self->plan);
100  DEALLOC(self->shiftarr);
101  ringhelper_init(self);
102  }
103 
104 NOINLINE static void ringhelper_update (ringhelper *self, int nph, int mmax, double phi0)
105  {
106  self->norot = (fabs(phi0)<1e-14);
107  if (!(self->norot))
108  if ((mmax!=self->s_shift-1) || (!FAPPROX(phi0,self->phi0_,1e-12)))
109  {
110  RESIZE (self->shiftarr,dcmplx,mmax+1);
111  self->s_shift = mmax+1;
112  self->phi0_ = phi0;
113  for (int m=0; m<=mmax; ++m)
114  self->shiftarr[m] = cos(m*phi0) + _Complex_I*sin(m*phi0);
115 // double *tmp=(double *) self->shiftarr;
116 // sincos_multi (mmax+1, phi0, &tmp[1], &tmp[0], 2);
117  }
118  if (!self->plan) self->plan=make_rfft_plan(nph);
119  if (nph!=(int)self->length)
120  {
121  destroy_rfft_plan(self->plan);
122  self->plan=make_rfft_plan(nph);
123  self->length=nph;
124  }
125  }
126 
127 static int ringinfo_compare (const void *xa, const void *xb)
128  {
129  const sharp_ringinfo *a=xa, *b=xb;
130  return (a->sth < b->sth) ? -1 : (a->sth > b->sth) ? 1 : 0;
131  }
132 static int ringpair_compare (const void *xa, const void *xb)
133  {
134  const sharp_ringpair *a=xa, *b=xb;
135 // return (a->r1.sth < b->r1.sth) ? -1 : (a->r1.sth > b->r1.sth) ? 1 : 0;
136  if (a->r1.nph==b->r1.nph)
137  return (a->r1.phi0 < b->r1.phi0) ? -1 :
138  ((a->r1.phi0 > b->r1.phi0) ? 1 :
139  (a->r1.cth>b->r1.cth ? -1 : 1));
140  return (a->r1.nph<b->r1.nph) ? -1 : 1;
141  }
142 
143 void sharp_make_general_alm_info (int lmax, int nm, int stride, const int *mval,
144  const ptrdiff_t *mstart, int flags, sharp_alm_info **alm_info)
145  {
146  sharp_alm_info *info = RALLOC(sharp_alm_info,1);
147  info->lmax = lmax;
148  info->nm = nm;
149  info->mval = RALLOC(int,nm);
150  info->mvstart = RALLOC(ptrdiff_t,nm);
151  info->stride = stride;
152  info->flags = flags;
153  for (int mi=0; mi<nm; ++mi)
154  {
155  info->mval[mi] = mval[mi];
156  info->mvstart[mi] = mstart[mi];
157  }
158  *alm_info = info;
159  }
160 
161 void sharp_make_alm_info (int lmax, int mmax, int stride,
162  const ptrdiff_t *mstart, sharp_alm_info **alm_info)
163  {
164  int *mval=RALLOC(int,mmax+1);
165  for (int i=0; i<=mmax; ++i)
166  mval[i]=i;
167  sharp_make_general_alm_info (lmax, mmax+1, stride, mval, mstart, 0, alm_info);
168  DEALLOC(mval);
169  }
170 
171 ptrdiff_t sharp_alm_index (const sharp_alm_info *self, int l, int mi)
172  {
173  UTIL_ASSERT(!(self->flags & SHARP_PACKED),
174  "sharp_alm_index not applicable with SHARP_PACKED alms");
175  return self->mvstart[mi]+self->stride*l;
176  }
177 
178 ptrdiff_t sharp_alm_count(const sharp_alm_info *self)
179  {
180  ptrdiff_t result=0;
181  for (int im=0; im!=self->nm; ++im)
182  {
183  int m=self->mval[im];
184  ptrdiff_t x=(self->lmax + 1 - m);
185  if ((m!=0)&&((self->flags&SHARP_PACKED)!=0)) result+=2*x;
186  else result+=x;
187  }
188  return result;
189  }
190 
191 void sharp_destroy_alm_info (sharp_alm_info *info)
192  {
193  DEALLOC (info->mval);
194  DEALLOC (info->mvstart);
195  DEALLOC (info);
196  }
197 
198 void sharp_make_geom_info (int nrings, const int *nph, const ptrdiff_t *ofs,
199  const int *stride, const double *phi0, const double *theta,
200  const double *wgt, sharp_geom_info **geom_info)
201  {
202  sharp_geom_info *info = RALLOC(sharp_geom_info,1);
203  sharp_ringinfo *infos = RALLOC(sharp_ringinfo,nrings);
204 
205  int pos=0;
206  info->pair=RALLOC(sharp_ringpair,nrings);
207  info->npairs=0;
208  info->nphmax=0;
209  *geom_info = info;
210 
211  for (int m=0; m<nrings; ++m)
212  {
213  infos[m].theta = theta[m];
214  infos[m].cth = cos(theta[m]);
215  infos[m].sth = sin(theta[m]);
216  infos[m].weight = (wgt != NULL) ? wgt[m] : 1.;
217  infos[m].phi0 = phi0[m];
218  infos[m].ofs = ofs[m];
219  infos[m].stride = stride[m];
220  infos[m].nph = nph[m];
221  if (info->nphmax<nph[m]) info->nphmax=nph[m];
222  }
223  qsort(infos,nrings,sizeof(sharp_ringinfo),ringinfo_compare);
224  while (pos<nrings)
225  {
226  info->pair[info->npairs].r1=infos[pos];
227  if ((pos<nrings-1) && FAPPROX(infos[pos].cth,-infos[pos+1].cth,1e-12))
228  {
229  if (infos[pos].cth>0) // make sure northern ring is in r1
230  info->pair[info->npairs].r2=infos[pos+1];
231  else
232  {
233  info->pair[info->npairs].r1=infos[pos+1];
234  info->pair[info->npairs].r2=infos[pos];
235  }
236  ++pos;
237  }
238  else
239  info->pair[info->npairs].r2.nph=-1;
240  ++pos;
241  ++info->npairs;
242  }
243  DEALLOC(infos);
244 
245  qsort(info->pair,info->npairs,sizeof(sharp_ringpair),ringpair_compare);
246  }
247 
248 ptrdiff_t sharp_map_size(const sharp_geom_info *info)
249  {
250  ptrdiff_t result = 0;
251  for (int m=0; m<info->npairs; ++m)
252  {
253  result+=info->pair[m].r1.nph;
254  result+=(info->pair[m].r2.nph>=0) ? (info->pair[m].r2.nph) : 0;
255  }
256  return result;
257  }
258 
259 void sharp_destroy_geom_info (sharp_geom_info *geom_info)
260  {
261  DEALLOC (geom_info->pair);
262  DEALLOC (geom_info);
263  }
264 
265 /* This currently requires all m values from 0 to nm-1 to be present.
266  It might be worthwhile to relax this criterion such that holes in the m
267  distribution are permissible. */
268 static int sharp_get_mmax (int *mval, int nm)
269  {
270  //FIXME: if gaps are allowed, we have to search the maximum m in the array
271  int *mcheck=RALLOC(int,nm);
272  SET_ARRAY(mcheck,0,nm,0);
273  for (int i=0; i<nm; ++i)
274  {
275  int m_cur=mval[i];
276  UTIL_ASSERT((m_cur>=0) && (m_cur<nm), "not all m values are present");
277  UTIL_ASSERT(mcheck[m_cur]==0, "duplicate m value");
278  mcheck[m_cur]=1;
279  }
280  DEALLOC(mcheck);
281  return nm-1;
282  }
283 
284 NOINLINE static void ringhelper_phase2ring (ringhelper *self,
285  const sharp_ringinfo *info, double *data, int mmax, const dcmplx *phase,
286  int pstride, int flags)
287  {
288  int nph = info->nph;
289 
290  ringhelper_update (self, nph, mmax, info->phi0);
291 
292  double wgt = (flags&SHARP_USE_WEIGHTS) ? info->weight : 1.;
293  if (flags&SHARP_REAL_HARMONICS)
294  wgt *= sqrt_one_half;
295 
296  if (nph>=2*mmax+1)
297  {
298  if (self->norot)
299  for (int m=0; m<=mmax; ++m)
300  {
301  data[2*m]=creal(phase[m*pstride])*wgt;
302  data[2*m+1]=cimag(phase[m*pstride])*wgt;
303  }
304  else
305  for (int m=0; m<=mmax; ++m)
306  {
307  dcmplx tmp = phase[m*pstride]*self->shiftarr[m];
308  data[2*m]=creal(tmp)*wgt;
309  data[2*m+1]=cimag(tmp)*wgt;
310  }
311  for (int m=2*(mmax+1); m<nph+2; ++m)
312  data[m]=0.;
313  }
314  else
315  {
316  data[0]=creal(phase[0])*wgt;
317  SET_ARRAY(data,1,nph+2,0.);
318 
319  int idx1=1, idx2=nph-1;
320  for (int m=1; m<=mmax; ++m)
321  {
322  dcmplx tmp = phase[m*pstride]*wgt;
323  if(!self->norot) tmp*=self->shiftarr[m];
324  if (idx1<(nph+2)/2)
325  {
326  data[2*idx1]+=creal(tmp);
327  data[2*idx1+1]+=cimag(tmp);
328  }
329  if (idx2<(nph+2)/2)
330  {
331  data[2*idx2]+=creal(tmp);
332  data[2*idx2+1]-=cimag(tmp);
333  }
334  if (++idx1>=nph) idx1=0;
335  if (--idx2<0) idx2=nph-1;
336  }
337  }
338  data[1]=data[0];
339  rfft_backward (self->plan, &(data[1]), 1.);
340  }
341 
342 NOINLINE static void ringhelper_ring2phase (ringhelper *self,
343  const sharp_ringinfo *info, double *data, int mmax, dcmplx *phase,
344  int pstride, int flags)
345  {
346  int nph = info->nph;
347 #if 1
348  int maxidx = mmax; /* Enable this for traditional Healpix compatibility */
349 #else
350  int maxidx = IMIN(nph-1,mmax);
351 #endif
352 
353  ringhelper_update (self, nph, mmax, -info->phi0);
354  double wgt = (flags&SHARP_USE_WEIGHTS) ? info->weight : 1;
355  if (flags&SHARP_REAL_HARMONICS)
356  wgt *= sqrt_two;
357 
358  rfft_forward (self->plan, &(data[1]), 1.);
359  data[0]=data[1];
360  data[1]=data[nph+1]=0.;
361 
362  if (maxidx<=nph/2)
363  {
364  if (self->norot)
365  for (int m=0; m<=maxidx; ++m)
366  phase[m*pstride] = (data[2*m] + _Complex_I*data[2*m+1]) * wgt;
367  else
368  for (int m=0; m<=maxidx; ++m)
369  phase[m*pstride] =
370  (data[2*m] + _Complex_I*data[2*m+1]) * self->shiftarr[m] * wgt;
371  }
372  else
373  {
374  for (int m=0; m<=maxidx; ++m)
375  {
376  int idx=m%nph;
377  dcmplx val;
378  if (idx<(nph-idx))
379  val = (data[2*idx] + _Complex_I*data[2*idx+1]) * wgt;
380  else
381  val = (data[2*(nph-idx)] - _Complex_I*data[2*(nph-idx)+1]) * wgt;
382  if (!self->norot)
383  val *= self->shiftarr[m];
384  phase[m*pstride]=val;
385  }
386  }
387 
388  for (int m=maxidx+1;m<=mmax; ++m)
389  phase[m*pstride]=0.;
390  }
391 
392 NOINLINE static void clear_map (const sharp_geom_info *ginfo, void *map,
393  int flags)
394  {
395  if (flags & SHARP_NO_FFT)
396  {
397  for (int j=0;j<ginfo->npairs;++j)
398  {
399  if (flags&SHARP_DP)
400  {
401  for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
402  ((dcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
403  for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
404  ((dcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
405  }
406  else
407  {
408  for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
409  ((fcmplx *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
410  for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
411  ((fcmplx *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
412  }
413  }
414  }
415  else
416  {
417  if (flags&SHARP_DP)
418  {
419  for (int j=0;j<ginfo->npairs;++j)
420  {
421  double *dmap=(double *)map;
422  if (ginfo->pair[j].r1.stride==1)
423  memset(&dmap[ginfo->pair[j].r1.ofs],0,
424  ginfo->pair[j].r1.nph*sizeof(double));
425  else
426  for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
427  dmap[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
428  if ((ginfo->pair[j].r2.nph>0)&&(ginfo->pair[j].r2.stride==1))
429  memset(&dmap[ginfo->pair[j].r2.ofs],0,
430  ginfo->pair[j].r2.nph*sizeof(double));
431  else
432  for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
433  dmap[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
434  }
435  }
436  else
437  {
438  for (int j=0;j<ginfo->npairs;++j)
439  {
440  for (ptrdiff_t i=0;i<ginfo->pair[j].r1.nph;++i)
441  ((float *)map)[ginfo->pair[j].r1.ofs+i*ginfo->pair[j].r1.stride]=0;
442  for (ptrdiff_t i=0;i<ginfo->pair[j].r2.nph;++i)
443  ((float *)map)[ginfo->pair[j].r2.ofs+i*ginfo->pair[j].r2.stride]=0;
444  }
445  }
446  }
447  }
448 
449 NOINLINE static void clear_alm (const sharp_alm_info *ainfo, void *alm,
450  int flags)
451  {
452 #define CLEARLOOP(real_t,body) \
453  { \
454  real_t *talm = (real_t *)alm; \
455  for (int l=m;l<=ainfo->lmax;++l) \
456  body \
457  }
458 
459  for (int mi=0;mi<ainfo->nm;++mi)
460  {
461  int m=ainfo->mval[mi];
462  ptrdiff_t mvstart = ainfo->mvstart[mi];
463  ptrdiff_t stride = ainfo->stride;
464  if (!(ainfo->flags&SHARP_PACKED))
465  mvstart*=2;
466  if ((ainfo->flags&SHARP_PACKED)&&(m==0))
467  {
468  if (flags&SHARP_DP)
469  CLEARLOOP(double, talm[mvstart+l*stride] = 0.;)
470  else
471  CLEARLOOP(float, talm[mvstart+l*stride] = 0.;)
472  }
473  else
474  {
475  stride*=2;
476  if (flags&SHARP_DP)
477  CLEARLOOP(double,talm[mvstart+l*stride]=talm[mvstart+l*stride+1]=0.;)
478  else
479  CLEARLOOP(float,talm[mvstart+l*stride]=talm[mvstart+l*stride+1]=0.;)
480  }
481 
482 #undef CLEARLOOP
483  }
484  }
485 
486 NOINLINE static void init_output (sharp_job *job)
487  {
488  if (job->flags&SHARP_ADD) return;
489  if (job->type == SHARP_MAP2ALM)
490  for (int i=0; i<job->ntrans*job->nalm; ++i)
491  clear_alm (job->ainfo,job->alm[i],job->flags);
492  else
493  for (int i=0; i<job->ntrans*job->nmaps; ++i)
494  clear_map (job->ginfo,job->map[i],job->flags);
495  }
496 
497 NOINLINE static void alloc_phase (sharp_job *job, int nm, int ntheta)
498  {
499  if (job->type==SHARP_MAP2ALM)
500  {
501  job->s_m=2*job->ntrans*job->nmaps;
502  if (((job->s_m*16*nm)&1023)==0) nm+=3; // hack to avoid critical strides
503  job->s_th=job->s_m*nm;
504  }
505  else
506  {
507  job->s_th=2*job->ntrans*job->nmaps;
508  if (((job->s_th*16*ntheta)&1023)==0) ntheta+=3; // hack to avoid critical strides
509  job->s_m=job->s_th*ntheta;
510  }
511  job->phase=RALLOC(dcmplx,2*job->ntrans*job->nmaps*nm*ntheta);
512  }
513 
514 static void dealloc_phase (sharp_job *job)
515  { DEALLOC(job->phase); }
516 
517 static void alloc_almtmp (sharp_job *job, int lmax)
518  { job->almtmp=RALLOC(dcmplx,job->ntrans*job->nalm*(lmax+1)); }
519 
520 static void dealloc_almtmp (sharp_job *job)
521  { DEALLOC(job->almtmp); }
522 
523 NOINLINE static void alm2almtmp (sharp_job *job, int lmax, int mi)
524  {
525 
526 #define COPY_LOOP(real_t, source_t, expr_of_x) \
527  { \
528  for (int l=m; l<lmin; ++l) \
529  for (int i=0; i<job->ntrans*job->nalm; ++i) \
530  job->almtmp[job->ntrans*job->nalm*l+i] = 0; \
531  for (int l=lmin; l<=lmax; ++l) \
532  for (int i=0; i<job->ntrans*job->nalm; ++i) \
533  { \
534  source_t x = *(source_t *)(((real_t *)job->alm[i])+ofs+l*stride); \
535  job->almtmp[job->ntrans*job->nalm*l+i] = expr_of_x; \
536  } \
537  }
538 
539  if (job->type!=SHARP_MAP2ALM)
540  {
541  ptrdiff_t ofs=job->ainfo->mvstart[mi];
542  int stride=job->ainfo->stride;
543  int m=job->ainfo->mval[mi];
544  int lmin=(m<job->spin) ? job->spin : m;
545  /* in the case of SHARP_REAL_HARMONICS, phase2ring scales all the
546  coefficients by sqrt_one_half; here we must compensate to avoid scaling
547  m=0 */
548  double norm_m0=(job->flags&SHARP_REAL_HARMONICS) ? sqrt_two : 1.;
549  if (!(job->ainfo->flags&SHARP_PACKED))
550  ofs *= 2;
551  if (!((job->ainfo->flags&SHARP_PACKED)&&(m==0)))
552  stride *= 2;
553  if (job->spin==0)
554  {
555  if (m==0)
556  {
557  if (job->flags&SHARP_DP)
558  COPY_LOOP(double, double, x*norm_m0)
559  else
560  COPY_LOOP(float, float, x*norm_m0)
561  }
562  else
563  {
564  if (job->flags&SHARP_DP)
565  COPY_LOOP(double, dcmplx, x)
566  else
567  COPY_LOOP(float, fcmplx, x)
568  }
569  }
570  else
571  {
572  if (m==0)
573  {
574  if (job->flags&SHARP_DP)
575  COPY_LOOP(double, double, x*job->norm_l[l]*norm_m0)
576  else
577  COPY_LOOP(float, float, x*job->norm_l[l]*norm_m0)
578  }
579  else
580  {
581  if (job->flags&SHARP_DP)
582  COPY_LOOP(double, dcmplx, x*job->norm_l[l])
583  else
584  COPY_LOOP(float, fcmplx, x*job->norm_l[l])
585  }
586  }
587  }
588  else
589  memset (job->almtmp+job->ntrans*job->nalm*job->ainfo->mval[mi], 0,
590  job->ntrans*job->nalm*(lmax+1-job->ainfo->mval[mi])*sizeof(dcmplx));
591 
592 #undef COPY_LOOP
593  }
594 
595 NOINLINE static void almtmp2alm (sharp_job *job, int lmax, int mi)
596  {
597 
598 #define COPY_LOOP(real_t, target_t, expr_of_x) \
599  for (int l=lmin; l<=lmax; ++l) \
600  for (int i=0; i<job->ntrans*job->nalm; ++i) \
601  { \
602  dcmplx x = job->almtmp[job->ntrans*job->nalm*l+i]; \
603  *(target_t *)(((real_t *)job->alm[i])+ofs+l*stride) += expr_of_x; \
604  }
605 
606  if (job->type != SHARP_MAP2ALM) return;
607  ptrdiff_t ofs=job->ainfo->mvstart[mi];
608  int stride=job->ainfo->stride;
609  int m=job->ainfo->mval[mi];
610  int lmin=(m<job->spin) ? job->spin : m;
611  /* in the case of SHARP_REAL_HARMONICS, ring2phase scales all the
612  coefficients by sqrt_two; here we must compensate to avoid scaling
613  m=0 */
614  double norm_m0=(job->flags&SHARP_REAL_HARMONICS) ? sqrt_one_half : 1.;
615  if (!(job->ainfo->flags&SHARP_PACKED))
616  ofs *= 2;
617  if (!((job->ainfo->flags&SHARP_PACKED)&&(m==0)))
618  stride *= 2;
619  if (job->spin==0)
620  {
621  if (m==0)
622  {
623  if (job->flags&SHARP_DP)
624  COPY_LOOP(double, double, creal(x)*norm_m0)
625  else
626  COPY_LOOP(float, float, crealf(x)*norm_m0)
627  }
628  else
629  {
630  if (job->flags&SHARP_DP)
631  COPY_LOOP(double, dcmplx, x)
632  else
633  COPY_LOOP(float, fcmplx, (fcmplx)x)
634  }
635  }
636  else
637  {
638  if (m==0)
639  {
640  if (job->flags&SHARP_DP)
641  COPY_LOOP(double, double, creal(x)*job->norm_l[l]*norm_m0)
642  else
643  COPY_LOOP(float, fcmplx, (float)(creal(x)*job->norm_l[l]*norm_m0))
644  }
645  else
646  {
647  if (job->flags&SHARP_DP)
648  COPY_LOOP(double, dcmplx, x*job->norm_l[l])
649  else
650  COPY_LOOP(float, fcmplx, (fcmplx)(x*job->norm_l[l]))
651  }
652  }
653 
654 #undef COPY_LOOP
655  }
656 
657 NOINLINE static void ringtmp2ring (sharp_job *job, sharp_ringinfo *ri,
658  const double *ringtmp, int rstride)
659  {
660  if (job->flags & SHARP_DP)
661  {
662  double **dmap = (double **)job->map;
663  for (int i=0; i<job->ntrans*job->nmaps; ++i)
664  {
665  double *restrict p1=&dmap[i][ri->ofs];
666  const double *restrict p2=&ringtmp[i*rstride+1];
667  if (ri->stride==1)
668  {
669  if (job->flags&SHARP_ADD)
670  for (int m=0; m<ri->nph; ++m)
671  p1[m] += p2[m];
672  else
673  memcpy(p1,p2,ri->nph*sizeof(double));
674  }
675  else
676  for (int m=0; m<ri->nph; ++m)
677  p1[m*ri->stride] += p2[m];
678  }
679  }
680  else
681  {
682  float **fmap = (float **)job->map;
683  for (int i=0; i<job->ntrans*job->nmaps; ++i)
684  for (int m=0; m<ri->nph; ++m)
685  fmap[i][ri->ofs+m*ri->stride] += (float)ringtmp[i*rstride+m+1];
686  }
687  }
688 
689 NOINLINE static void ring2ringtmp (sharp_job *job, sharp_ringinfo *ri,
690  double *ringtmp, int rstride)
691  {
692  if (job->flags & SHARP_DP)
693  for (int i=0; i<job->ntrans*job->nmaps; ++i)
694  {
695  double *restrict p1=&ringtmp[i*rstride+1],
696  *restrict p2=&(((double *)(job->map[i]))[ri->ofs]);
697  if (ri->stride==1)
698  memcpy(p1,p2,ri->nph*sizeof(double));
699  else
700  for (int m=0; m<ri->nph; ++m)
701  p1[m] = p2[m*ri->stride];
702  }
703  else
704  for (int i=0; i<job->ntrans*job->nmaps; ++i)
705  for (int m=0; m<ri->nph; ++m)
706  ringtmp[i*rstride+m+1] = ((float *)(job->map[i]))[ri->ofs+m*ri->stride];
707  }
708 
709 static void ring2phase_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
710  dcmplx *phase)
711  {
712  if (ri->nph<0)
713  {
714  for (int i=0; i<job->ntrans*job->nmaps; ++i)
715  for (int m=0; m<=mmax; ++m)
716  phase[2*i+job->s_m*m]=0.;
717  }
718  else
719  {
720  UTIL_ASSERT(ri->nph==mmax+1,"bad ring size");
721  double wgt = (job->flags&SHARP_USE_WEIGHTS) ? (ri->nph*ri->weight) : 1.;
722  if (job->flags&SHARP_REAL_HARMONICS)
723  wgt *= sqrt_two;
724  for (int i=0; i<job->ntrans*job->nmaps; ++i)
725  for (int m=0; m<=mmax; ++m)
726  phase[2*i+job->s_m*m]= (job->flags & SHARP_DP) ?
727  ((dcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt :
728  ((fcmplx *)(job->map[i]))[ri->ofs+m*ri->stride]*wgt;
729  }
730  }
731 static void phase2ring_direct (sharp_job *job, sharp_ringinfo *ri, int mmax,
732  dcmplx *phase)
733  {
734  if (ri->nph<0) return;
735  UTIL_ASSERT(ri->nph==mmax+1,"bad ring size");
736  dcmplx **dmap = (dcmplx **)job->map;
737  fcmplx **fmap = (fcmplx **)job->map;
738  double wgt = (job->flags&SHARP_USE_WEIGHTS) ? (ri->nph*ri->weight) : 1.;
739  if (job->flags&SHARP_REAL_HARMONICS)
740  wgt *= sqrt_one_half;
741  for (int i=0; i<job->ntrans*job->nmaps; ++i)
742  for (int m=0; m<=mmax; ++m)
743  if (job->flags & SHARP_DP)
744  dmap[i][ri->ofs+m*ri->stride] += wgt*phase[2*i+job->s_m*m];
745  else
746  fmap[i][ri->ofs+m*ri->stride] += (fcmplx)(wgt*phase[2*i+job->s_m*m]);
747  }
748 
749 //FIXME: set phase to zero if not SHARP_MAP2ALM?
750 NOINLINE static void map2phase (sharp_job *job, int mmax, int llim, int ulim)
751  {
752  if (job->type != SHARP_MAP2ALM) return;
753  int pstride = job->s_m;
754  if (job->flags & SHARP_NO_FFT)
755  {
756  for (int ith=llim; ith<ulim; ++ith)
757  {
758  int dim2 = job->s_th*(ith-llim);
759  ring2phase_direct(job,&(job->ginfo->pair[ith].r1),mmax,
760  &(job->phase[dim2]));
761  ring2phase_direct(job,&(job->ginfo->pair[ith].r2),mmax,
762  &(job->phase[dim2+1]));
763  }
764  }
765  else
766  {
767 #pragma omp parallel if ((job->flags&SHARP_NO_OPENMP)==0)
768 {
769  ringhelper helper;
770  ringhelper_init(&helper);
771  int rstride=job->ginfo->nphmax+2;
772  double *ringtmp=RALLOC(double,job->ntrans*job->nmaps*rstride);
773 #pragma omp for schedule(dynamic,1)
774  for (int ith=llim; ith<ulim; ++ith)
775  {
776  int dim2 = job->s_th*(ith-llim);
777  ring2ringtmp(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
778  for (int i=0; i<job->ntrans*job->nmaps; ++i)
779  ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r1),
780  &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
781  if (job->ginfo->pair[ith].r2.nph>0)
782  {
783  ring2ringtmp(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
784  for (int i=0; i<job->ntrans*job->nmaps; ++i)
785  ringhelper_ring2phase (&helper,&(job->ginfo->pair[ith].r2),
786  &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
787  }
788  }
789  DEALLOC(ringtmp);
790  ringhelper_destroy(&helper);
791 } /* end of parallel region */
792  }
793  }
794 
795 NOINLINE static void phase2map (sharp_job *job, int mmax, int llim, int ulim)
796  {
797  if (job->type == SHARP_MAP2ALM) return;
798  int pstride = job->s_m;
799  if (job->flags & SHARP_NO_FFT)
800  {
801  for (int ith=llim; ith<ulim; ++ith)
802  {
803  int dim2 = job->s_th*(ith-llim);
804  phase2ring_direct(job,&(job->ginfo->pair[ith].r1),mmax,
805  &(job->phase[dim2]));
806  phase2ring_direct(job,&(job->ginfo->pair[ith].r2),mmax,
807  &(job->phase[dim2+1]));
808  }
809  }
810  else
811  {
812 #pragma omp parallel if ((job->flags&SHARP_NO_OPENMP)==0)
813 {
814  ringhelper helper;
815  ringhelper_init(&helper);
816  int rstride=job->ginfo->nphmax+2;
817  double *ringtmp=RALLOC(double,job->ntrans*job->nmaps*rstride);
818 #pragma omp for schedule(dynamic,1)
819  for (int ith=llim; ith<ulim; ++ith)
820  {
821  int dim2 = job->s_th*(ith-llim);
822  for (int i=0; i<job->ntrans*job->nmaps; ++i)
823  ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r1),
824  &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i],pstride,job->flags);
825  ringtmp2ring(job,&(job->ginfo->pair[ith].r1),ringtmp,rstride);
826  if (job->ginfo->pair[ith].r2.nph>0)
827  {
828  for (int i=0; i<job->ntrans*job->nmaps; ++i)
829  ringhelper_phase2ring (&helper,&(job->ginfo->pair[ith].r2),
830  &ringtmp[i*rstride],mmax,&job->phase[dim2+2*i+1],pstride,job->flags);
831  ringtmp2ring(job,&(job->ginfo->pair[ith].r2),ringtmp,rstride);
832  }
833  }
834  DEALLOC(ringtmp);
835  ringhelper_destroy(&helper);
836 } /* end of parallel region */
837  }
838  }
839 
840 NOINLINE static void sharp_execute_job (sharp_job *job)
841  {
842  double timer=wallTime();
843  job->opcnt=0;
844  int lmax = job->ainfo->lmax,
845  mmax=sharp_get_mmax(job->ainfo->mval, job->ainfo->nm);
846 
847  job->norm_l = (job->type==SHARP_ALM2MAP_DERIV1) ?
848  sharp_Ylmgen_get_d1norm (lmax) :
849  sharp_Ylmgen_get_norm (lmax, job->spin);
850 
851 /* clear output arrays if requested */
852  init_output (job);
853 
854  int nchunks, chunksize;
855  get_chunk_info(job->ginfo->npairs,(job->flags&SHARP_NVMAX)*VLEN,&nchunks,
856  &chunksize);
857 //FIXME: needs to be changed to "nm"
858  alloc_phase (job,mmax+1,chunksize);
859 
860 /* chunk loop */
861  for (int chunk=0; chunk<nchunks; ++chunk)
862  {
863  int llim=chunk*chunksize, ulim=IMIN(llim+chunksize,job->ginfo->npairs);
864  int *ispair = RALLOC(int,ulim-llim);
865  int *mlim = RALLOC(int,ulim-llim);
866  double *cth = RALLOC(double,ulim-llim), *sth = RALLOC(double,ulim-llim);
867  for (int i=0; i<ulim-llim; ++i)
868  {
869  ispair[i] = job->ginfo->pair[i+llim].r2.nph>0;
870  cth[i] = job->ginfo->pair[i+llim].r1.cth;
871  sth[i] = job->ginfo->pair[i+llim].r1.sth;
872  mlim[i] = sharp_get_mlim(lmax, job->spin, sth[i], cth[i]);
873  }
874 
875 /* map->phase where necessary */
876  map2phase (job, mmax, llim, ulim);
877 
878 #pragma omp parallel if ((job->flags&SHARP_NO_OPENMP)==0)
879 {
880  sharp_job ljob = *job;
881  ljob.opcnt=0;
882  sharp_Ylmgen_C generator;
883  sharp_Ylmgen_init (&generator,lmax,mmax,ljob.spin);
884  alloc_almtmp(&ljob,lmax);
885 
886 #pragma omp for schedule(dynamic,1)
887  for (int mi=0; mi<job->ainfo->nm; ++mi)
888  {
889 /* alm->alm_tmp where necessary */
890  alm2almtmp (&ljob, lmax, mi);
891 
892  inner_loop (&ljob, ispair, cth, sth, llim, ulim, &generator, mi, mlim);
893 
894 /* alm_tmp->alm where necessary */
895  almtmp2alm (&ljob, lmax, mi);
896  }
897 
898  sharp_Ylmgen_destroy(&generator);
899  dealloc_almtmp(&ljob);
900 
901 #pragma omp critical
902  job->opcnt+=ljob.opcnt;
903 } /* end of parallel region */
904 
905 /* phase->map where necessary */
906  phase2map (job, mmax, llim, ulim);
907 
908  DEALLOC(ispair);
909  DEALLOC(mlim);
910  DEALLOC(cth);
911  DEALLOC(sth);
912  } /* end of chunk loop */
913 
914  DEALLOC(job->norm_l);
915  dealloc_phase (job);
916  job->time=wallTime()-timer;
917  }
918 
919 static void sharp_build_job_common (sharp_job *job, sharp_jobtype type,
920  int spin, void *alm, void *map, const sharp_geom_info *geom_info,
921  const sharp_alm_info *alm_info, int ntrans, int flags)
922  {
923  UTIL_ASSERT((ntrans>0)&&(ntrans<=SHARP_MAXTRANS),
924  "bad number of simultaneous transforms");
925  if (type==SHARP_ALM2MAP_DERIV1) spin=1;
926  if (type==SHARP_MAP2ALM) flags|=SHARP_USE_WEIGHTS;
927  if (type==SHARP_Yt) type=SHARP_MAP2ALM;
928  if (type==SHARP_WY) { type=SHARP_ALM2MAP; flags|=SHARP_USE_WEIGHTS; }
929 
930  UTIL_ASSERT((spin>=0)&&(spin<=alm_info->lmax), "bad spin");
931  job->type = type;
932  job->spin = spin;
933  job->norm_l = NULL;
934  job->nmaps = (type==SHARP_ALM2MAP_DERIV1) ? 2 : ((spin>0) ? 2 : 1);
935  job->nalm = (type==SHARP_ALM2MAP_DERIV1) ? 1 : ((spin>0) ? 2 : 1);
936  job->ginfo = geom_info;
937  job->ainfo = alm_info;
938  job->flags = flags;
939  if ((job->flags&SHARP_NVMAX)==0)
940  job->flags|=sharp_nv_oracle (type, spin, ntrans);
941  if (alm_info->flags&SHARP_REAL_HARMONICS)
942  job->flags|=SHARP_REAL_HARMONICS;
943  job->time = 0.;
944  job->opcnt = 0;
945  job->ntrans = ntrans;
946  job->alm=alm;
947  job->map=map;
948  }
949 
950 void sharp_execute (sharp_jobtype type, int spin, void *alm, void *map,
951  const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans,
952  int flags, double *time, unsigned long long *opcnt)
953  {
954  sharp_job job;
955  sharp_build_job_common (&job, type, spin, alm, map, geom_info, alm_info,
956  ntrans, flags);
957 
958  sharp_execute_job (&job);
959  if (time!=NULL) *time = job.time;
960  if (opcnt!=NULL) *opcnt = job.opcnt;
961  }
962 
963 void sharp_set_chunksize_min(int new_chunksize_min)
964  { chunksize_min=new_chunksize_min; }
965 void sharp_set_nchunks_max(int new_nchunks_max)
966  { nchunks_max=new_nchunks_max; }
967 
968 int sharp_get_nv_max (void)
969 { return 6; }
970 
971 static int sharp_oracle (sharp_jobtype type, int spin, int ntrans)
972  {
973  int lmax=511;
974  int mmax=(lmax+1)/2;
975  int nrings=(lmax+1)/4;
976  int ppring=1;
977 
978  spin = (spin!=0) ? 2 : 0;
979 
980  ptrdiff_t npix=(ptrdiff_t)nrings*ppring;
981  sharp_geom_info *tinfo;
982  sharp_make_gauss_geom_info (nrings, ppring, 0., 1, ppring, &tinfo);
983 
984  ptrdiff_t nalms = ((mmax+1)*(mmax+2))/2 + (mmax+1)*(lmax-mmax);
985  int ncomp = ntrans*((spin==0) ? 1 : 2);
986 
987  double **map;
988  ALLOC2D(map,double,ncomp,npix);
989  SET_ARRAY(map[0],0,npix*ncomp,0.);
990 
991  sharp_alm_info *alms;
992  sharp_make_triangular_alm_info(lmax,mmax,1,&alms);
993 
994  dcmplx **alm;
995  ALLOC2D(alm,dcmplx,ncomp,nalms);
996  SET_ARRAY(alm[0],0,nalms*ncomp,0.);
997 
998  double time=1e30;
999  int nvbest=-1;
1000 
1001  for (int nv=1; nv<=sharp_get_nv_max(); ++nv)
1002  {
1003  double time_acc=0.;
1004  double jtime;
1005  int ntries=0;
1006  do
1007  {
1008  sharp_execute(type,spin,&alm[0],&map[0],tinfo,alms,ntrans,
1009  nv|SHARP_DP|SHARP_NO_OPENMP,&jtime,NULL);
1010 
1011  if (jtime<time) { time=jtime; nvbest=nv; }
1012  time_acc+=jtime;
1013  ++ntries;
1014  }
1015  while ((time_acc<0.02)&&(ntries<2));
1016  }
1017 
1018  DEALLOC2D(map);
1019  DEALLOC2D(alm);
1020 
1021  sharp_destroy_alm_info(alms);
1022  sharp_destroy_geom_info(tinfo);
1023  return nvbest;
1024  }
1025 
1026 int sharp_nv_oracle (sharp_jobtype type, int spin, int ntrans)
1027  {
1028  static const int maxtr = 6;
1029  static int nv_opt[6][2][5] = {
1030  {{0,0,0,0,0},{0,0,0,0,0}},
1031  {{0,0,0,0,0},{0,0,0,0,0}},
1032  {{0,0,0,0,0},{0,0,0,0,0}},
1033  {{0,0,0,0,0},{0,0,0,0,0}},
1034  {{0,0,0,0,0},{0,0,0,0,0}},
1035  {{0,0,0,0,0},{0,0,0,0,0}} };
1036 
1037  if (type==SHARP_ALM2MAP_DERIV1) spin=1;
1038  UTIL_ASSERT(type<5,"bad type");
1039  UTIL_ASSERT((ntrans>0),"bad number of simultaneous transforms");
1040  UTIL_ASSERT(spin>=0, "bad spin");
1041  ntrans=IMIN(ntrans,maxtr);
1042 
1043  if (nv_opt[ntrans-1][spin!=0][type]==0)
1044  nv_opt[ntrans-1][spin!=0][type]=sharp_oracle(type,spin,ntrans);
1045  return nv_opt[ntrans-1][spin!=0][type];
1046  }
#define RALLOC(type, num)
void sharp_Ylmgen_destroy(sharp_Ylmgen_C *gen)
double * sharp_Ylmgen_get_norm(int lmax, int spin)
void sharp_execute(sharp_jobtype type, int spin, void *alm, void *map, const sharp_geom_info *geom_info, const sharp_alm_info *alm_info, int ntrans, int flags, double *time, unsigned long long *opcnt)
Definition: sharp.c:950
void sharp_destroy_geom_info(sharp_geom_info *geom_info)
Definition: sharp.c:259
sharp_jobtype
void sharp_make_general_alm_info(int lmax, int nm, int stride, const int *mval, const ptrdiff_t *mstart, int flags, sharp_alm_info **alm_info)
Definition: sharp.c:143
#define SET_ARRAY(ptr, i1, i2, val)
double * sharp_Ylmgen_get_d1norm(int lmax)
void sharp_make_triangular_alm_info(int lmax, int mmax, int stride, sharp_alm_info **alm_info)
void sharp_destroy_alm_info(sharp_alm_info *info)
Definition: sharp.c:191
ptrdiff_t sharp_map_size(const sharp_geom_info *info)
Definition: sharp.c:248
void sharp_make_geom_info(int nrings, const int *nph, const ptrdiff_t *ofs, const int *stride, const double *phi0, const double *theta, const double *wgt, sharp_geom_info **geom_info)
Definition: sharp.c:198
void sharp_make_gauss_geom_info(int nrings, int nphi, double phi0, int stride_lon, int stride_lat, sharp_geom_info **geom_info)
#define DEALLOC(ptr)
ptrdiff_t sharp_alm_index(const sharp_alm_info *self, int l, int mi)
Definition: sharp.c:171
double wallTime(void)
#define UTIL_ASSERT(cond, msg)
void sharp_make_alm_info(int lmax, int mmax, int stride, const ptrdiff_t *mstart, sharp_alm_info **alm_info)
Definition: sharp.c:161
void sharp_Ylmgen_init(sharp_Ylmgen_C *gen, int l_max, int m_max, int spin)
ptrdiff_t sharp_alm_count(const sharp_alm_info *self)
Definition: sharp.c:178

Generated on Mon Dec 10 2018 10:24:20 for LevelS SHT library