Commit a573d927 authored by Nathanael Schaeffer @home's avatar Nathanael Schaeffer @home
Browse files

Bugfix : for m=0 dct analysis without sse2 appeared in 72dd5e70725d. We now...

Bugfix : for m=0 dct analysis without sse2 appeared in 72dd5e70725d. We now assume NPHI=1 <=> SHT_AXISYM defined.
parent 16da70e9
......@@ -1631,9 +1631,9 @@ double choose_best_sht(shtns_cfg shtns, int* nlp, int vector, int dct_mtr)
#endif
minc = MMAX/20 + 1; // don't test every single m.
m = -1; i = -1; t0 = 0.0; // reference = no dct.
if (sht_array[SHT_DCT][SHT_TYP_SSY] != NULL)
if (sht_func[SHT_STD][SHT_TYP_SSY][SHT_DCT] != NULL)
t0 += get_time(shtns, *nlp, 2, "s", shtns->fptr[SHT_STD][SHT_TYP_SSY], Qlm, Slm, Tlm, Qh, Sh, Th, LMAX);
if ( (sht_array[SHT_DCT][SHT_TYP_VSY] != NULL) && (vector) )
if ( (sht_func[SHT_STD][SHT_TYP_VSY][SHT_DCT] != NULL) && (vector) )
t0 += get_time(shtns, nloop, 4, "v", shtns->fptr[SHT_STD][SHT_TYP_VSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
tnodct = t0;
for (m=0; m<=MMAX; m+=minc) {
......@@ -1641,9 +1641,9 @@ double choose_best_sht(shtns_cfg shtns, int* nlp, int vector, int dct_mtr)
printf("\n\tm=%d :",m);
#endif
Set_MTR_DCT(shtns, m);
t = get_time(shtns, *nlp, 2, "sdct", sht_array[SHT_DCT][SHT_TYP_SSY], Qlm, Slm, Tlm, Qh, Sh, Th, LMAX);
t = get_time(shtns, *nlp, 2, "sdct", sht_func[SHT_STD][SHT_TYP_SSY][SHT_DCT], Qlm, Slm, Tlm, Qh, Sh, Th, LMAX);
if (vector)
t += get_time(shtns, nloop, 4, "vdct", sht_array[SHT_DCT][SHT_TYP_VSY], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
t += get_time(shtns, nloop, 4, "vdct", sht_func[SHT_STD][SHT_TYP_VSY][SHT_DCT], Slm, Tlm, Qlm, Sh, Th, Qh, LMAX);
if (t < t0) { t0 = t; i = m; PRINT_VERB("*"); }
PRINT_DOT
}
......
......@@ -507,7 +507,8 @@ V BtF[k] = vdup(0.0); BpF[k] = vdup(0.0);
Q BrF -= NLAT*(imlim+1); // restore original pointer
V BtF -= NLAT*(imlim+1); BpF -= NLAT*(imlim+1); // restore original pointer
if (NPHI>1) {
// NPHI > 1 as SHT_AXISYM is not defined.
if (shtns->ncplx_fft >= 0) {
Q fftw_execute_dft_c2r(shtns->ifft, (complex double *) BrF, Vr);
V fftw_execute_dft_c2r(shtns->ifft, (complex double *) BtF, Vt);
V fftw_execute_dft_c2r(shtns->ifft, (complex double *) BpF, Vp);
......@@ -515,14 +516,7 @@ V fftw_execute_dft_c2r(shtns->ifft, (complex double *) BpF, Vp);
Q VFREE(BrF);
VX VFREE(BtF); // this frees also BpF.
}
} else {
k=1; do { // compress complex to real
Q Vr[k] = ((double *)BrF)[2*k];
V Vt[k] = ((double *)BtF)[2*k];
V Vp[k] = ((double *)BpF)[2*k];
k++;
} while(k<NLAT);
}
}
#endif
Q #undef BR0
......
......@@ -87,10 +87,10 @@ V double por[NLAT_2+2*NWAY] SSE;
#ifdef SHT_VAR_LTR
if (MTR*MRES > (int) llim) imlim = ((int) llim)/MRES; // 32bit mul and div should be faster
#endif
Q BrF = (complex double *) Vr;
V BtF = (complex double *) Vt; BpF = (complex double *) Vp;
#ifndef SHT_AXISYM
Q BrF = (complex double *) Vr;
V BtF = (complex double *) Vt; BpF = (complex double *) Vp;
if (shtns->ncplx_fft >= 0) {
if (shtns->ncplx_fft > 0) { // alloc memory for the FFT
QX BrF = VMALLOC( shtns->ncplx_fft * sizeof(complex double) );
......@@ -110,26 +110,29 @@ V l_2 = shtns->l_2;
im = 0; // dzl.p = 0.0 : and evrything is REAL
i=0;
Q double r0 = 0.0;
#ifdef SHT_AXISYM
#define STEP 1
#define XNP *NPHI
#ifndef SHT_AXISYM
Q #define BR0(i) ((double*)BrF)[(i)*2]
S #define BT0(i) ((double*)BtF)[(i)*2]
T #define BP0(i) ((double*)BpF)[(i)*2]
#else
#define STEP 2
#define XNP
Q #define BR0(i) Vr[i]
S #define BT0(i) Vt[i]
T #define BP0(i) Vp[i]
#endif
do { // compute symmetric and antisymmetric parts.
double w = wg[i] XNP;
Q double a = ((double*)BrF)[i*STEP]; double b = ((double*)BrF)[(NLAT-1)*STEP -i*STEP];
double w = wg[i];
Q double a = BR0(i); double b = BR0(NLAT-1-i);
Q ror[i] = (a-b)*w; rer[i] = (a+b)*w;
Q r0 += ((a+b)*w);
V double c = ((double*)BtF)[i*STEP]; double d = ((double*)BtF)[(NLAT-1)*STEP -i*STEP];
V double c = BT0(i); double d = BT0(NLAT-1-i);
V ter[i] = (c+d)*w; tor[i] = (c-d)*w;
V double e = ((double*)BpF)[i*STEP]; double f = ((double*)BpF)[(NLAT-1)*STEP -i*STEP];
V double e = BP0(i); double f = BP0(NLAT-1-i);
V per[i] = (e+f)*w; por[i] = (e-f)*w;
i++;
} while(i<ni);
#undef XNP
#undef STEP
} while(i<ni);
Q #undef BR0
S #undef BT0
T #undef BP0
do {
Q rer[i] = 0.0; ror[i] = 0.0;
V ter[i] = 0.0; tor[i] = 0.0;
......@@ -140,8 +143,6 @@ V per[i] = 0.0; por[i] = 0.0;
#else
} while(i<ni+NWAY-1);
#endif
Q BrF += NLAT;
V BtF += NLAT; BpF += NLAT;
Q Qlm[0] = r0 * shtns->bl0[0]; // l=0 is done.
V Slm[0] = 0.0; Tlm[0] = 0.0; // l=0 is zero for the vector transform.
k = 0;
......@@ -218,6 +219,8 @@ V Slm[l] = 0.0; Tlm[l] = 0.0;
#ifndef SHT_AXISYM
for (im=1;im<=imlim;im++) {
Q BrF += NLAT;
V BtF += NLAT; BpF += NLAT;
i0 = shtns->tm[im];
m = im*MRES;
#if _GCC_VEC_
......@@ -257,8 +260,6 @@ V per[i] = 0.0; pei[i] = 0.0; por[i] = 0.0; poi[i] = 0.0;
#else
} while (i < ni+NWAY-1);
#endif
Q BrF += NLAT;
V BtF += NLAT; BpF += NLAT;
k=i0; // i0 must be even.
#if _GCC_VEC_
......@@ -532,8 +533,8 @@ V ((v2d*)Slm)[l] = vdup(0.0); ((v2d*)Tlm)[l] = vdup(0.0);
#endif
if (shtns->ncplx_fft > 0) { // free memory
Q VFREE(BrF - NLAT*(imlim+1));
VX VFREE(BtF - NLAT*(imlim+1)); // this frees also BpF.
Q VFREE(BrF - NLAT*imlim);
VX VFREE(BtF - NLAT*imlim); // this frees also BpF.
}
#endif
......
......@@ -517,7 +517,7 @@ V BtF[k] = vdup(0.0); BpF[k] = vdup(0.0);
Q BrF -= NLAT*(imlim+1); // restore original pointer
V BtF -= NLAT*(imlim+1); BpF -= NLAT*(imlim+1); // restore original pointer
if (NPHI>1) {
// NPHI > 1 as SHT_AXISYM is not defined.
#ifndef SHT_NO_DCT
Q fftw_execute_r2r(shtns->idct,(double *) BrF, (double *) BrF); // iDCT
V fftw_execute_r2r(shtns->idct,(double *) BtF, (double *) BtF); // iDCT
......@@ -548,6 +548,7 @@ V k+=2;
V } while(k<NLAT);
V }
#endif
if (shtns->ncplx_fft >= 0) {
Q fftw_execute_dft_c2r(shtns->ifft, (complex double *) BrF, Vr);
V fftw_execute_dft_c2r(shtns->ifft, (complex double *) BtF, Vt);
V fftw_execute_dft_c2r(shtns->ifft, (complex double *) BpF, Vp);
......@@ -555,32 +556,7 @@ V fftw_execute_dft_c2r(shtns->ifft, (complex double *) BpF, Vp);
Q VFREE(BrF);
VX VFREE(BtF); // this frees also BpF.
}
} else {
k=1; do { // compress complex to real
Q Vr[k] = ((double *)BrF)[2*k];
V Vt[k] = ((double *)BtF)[2*k];
V Vp[k] = ((double *)BpF)[2*k];
k++;
} while(k<NLAT);
#ifndef SHT_NO_DCT
Q fftw_execute_r2r(shtns->idct_r1,Vr, Vr); // iDCT m=0
S fftw_execute_r2r(shtns->idct_r1,Vt, Vt); // iDCT m=0
T fftw_execute_r2r(shtns->idct_r1,Vp, Vp); // iDCT m=0
V double* st_1 = shtns->st_1;
V k=0; do {
V #ifdef _GCC_VEC_
V v2d sin_1 = ((v2d *)st_1)[k];
S ((v2d *)Vt)[k] *= sin_1;
T ((v2d *)Vp)[k] *= sin_1;
V #else
V double sin_1 = st_1[2*k]; double sin_2 = st_1[2*k+1];
S Vt[2*k] *= sin_1; Vt[2*k+1] *= sin_2;
T Vp[2*k] *= sin_1; Vp[2*k+1] *= sin_2;
V #endif
V k++;
V } while (k<NLAT_2);
#endif
}
}
#endif
Q #undef BR0
......
......@@ -48,8 +48,6 @@ S void GEN3(spat_to_SHsph_,ID_NME,SUFFIX)(shtns_cfg shtns, double *Vt, complex d
T void GEN3(spat_to_SHtor_,ID_NME,SUFFIX)(shtns_cfg shtns, double *Vp, complex double *Tlm, long int llim) {
#endif
Q complex double *BrF; // contains the Fourier transformed data
V complex double *BtF, *BpF; // contains the Fourier transformed data
Q double *zl;
V double *dzl0;
V struct DtDp *dzl;
......@@ -59,6 +57,8 @@ Q complex double q0,q1;
S complex double s0,s1;
T complex double t0,t1;
#ifndef SHT_AXISYM
Q complex double *BrF; // contains the Fourier transformed data
V complex double *BtF, *BpF; // contains the Fourier transformed data
Q v2d reo[2*NLAT_2]; // symmetric (even) and anti-symmetric (odd) parts, interleaved.
V v2d tpeo[4*NLAT_2]; // theta and phi even and odd parts
Q #define reo0 ((double*)reo)
......@@ -99,11 +99,11 @@ Q #define ro0(i) reo0[2*(i)]
#ifdef SHT_VAR_LTR
if (MTR*MRES > (int) llim) imlim = ((int) llim)/MRES; // 32bit mul and div should be faster
#endif
Q BrF = (complex double *) Vr;
S BtF = (complex double *) Vt;
T BpF = (complex double *) Vp;
#ifndef SHT_AXISYM
Q BrF = (complex double *) Vr;
S BtF = (complex double *) Vt;
T BpF = (complex double *) Vp;
if (shtns->ncplx_fft >= 0) {
if (shtns->ncplx_fft > 0) { // alloc memory for the FFT
QX BrF = VMALLOC( shtns->ncplx_fft * sizeof(complex double) );
......@@ -125,11 +125,10 @@ V double* st_1 = shtns->st_1;
Q #define BR0 ((double *)reo)
V #define BT0 ((double *)tpeo)
V #define BP0 ((double *)tpeo + NLAT)
V l = (NPHI==1) ? 1 : 2; // stride of source data.
V i=0; i0=0; do {
V i=0; do { // we assume NPHI>1 (else SHT_AXISYM should be defined).
V double sin_1 = st_1[i];
V ((double *)BtF)[i0] *= sin_1; ((double *)BpF)[i0] *= sin_1;
V i++; i0+=l;
V ((double *)BtF)[i*2] *= sin_1; ((double *)BpF)[i*2] *= sin_1;
V i++;
V } while (i<NLAT);
Q fftw_execute_r2r(shtns->dct_m0,(double *) BrF, BR0); // DCT out-of-place.
V fftw_execute_r2r(shtns->dct_m0,(double *) BtF, BT0); // DCT out-of-place.
......@@ -138,29 +137,21 @@ V fftw_execute_r2r(shtns->dct_m0,(double *) BpF, BP0); // DCT out-of-place.
Q #define BR0 reo0
S #define BT0 teo0
T #define BP0 peo0
Q if (NPHI > 1) {
Q s2d np = vdup(NPHI);
Q i=0; do {
Q ((v2d*) BrF)[i] *= np;
Q i++;
Q } while (i<ni);
Q }
V i=0; do {
V s2d np = vdup(NPHI);
V #ifdef _GCC_VEC_
V s2d sin_1 = ((s2d *)st_1)[i] * np;
S ((v2d*) BtF)[i] *= sin_1;
T ((v2d*) BpF)[i] *= sin_1;
V s2d sin_1 = ((s2d *)st_1)[i];
S ((s2d*) Vt)[i] *= sin_1;
T ((s2d*) Vp)[i] *= sin_1;
V #else
V double sin_1 = st_1[2*i] * np; double sin_2 = st_1[2*i+1] * np;
S BT0[2*i] *= sin_1; BT0[2*i+1] *= sin_2;
T BP0[2*i] *= sin_1; BP0[2*i+1] *= sin_2;
V double sin_1 = st_1[2*i]; double sin_2 = st_1[2*i+1];
S Vt[2*i] *= sin_1; Vt[2*i+1] *= sin_2;
T Vp[2*i] *= sin_1; Vp[2*i+1] *= sin_2;
V #endif
V i++;
V } while (i<ni);
Q fftw_execute_r2r(shtns->dct_r1,(double *) BrF, BR0); // DCT out-of-place.
S fftw_execute_r2r(shtns->dct_r1,(double *) BtF, BT0); // DCT out-of-place.
T fftw_execute_r2r(shtns->dct_r1,(double *) BpF, BP0); // DCT out-of-place.
Q fftw_execute_r2r(shtns->dct_r1,Vr, BR0); // DCT out-of-place.
S fftw_execute_r2r(shtns->dct_r1,Vt, BT0); // DCT out-of-place.
T fftw_execute_r2r(shtns->dct_r1,Vp, BP0); // DCT out-of-place.
#endif
l=0;
long int klim = shtns->klim;
......@@ -254,61 +245,56 @@ Q ((complex double *) Ql)[l] = q0;
Q #undef BR0
S #undef BT0
T #undef BP0
#ifdef SHT_VAR_LTR
#ifdef SHT_VAR_LTR
while( l<=LMAX ) {
Q Ql[l] = vdup(0.0);
S Sl[l] = vdup(0.0);
T Tl[l] = vdup(0.0);
l++;
}
#endif
Q BrF += NLAT;
S BtF += NLAT;
T BpF += NLAT;
#endif
#else // ifndef SHT_NO_DCT
i=0;
QE double r0 = 0.0;
QX double r1 = 0.0;
Q zl = shtns->zlm[0];
// stride of source data. => now we assume NPHI>1 else SHT_AXISYM is defined.
// stride of source data : we assume NPHI>1 (else SHT_AXISYM should be defined).
#ifndef SHT_AXISYM
#define STEP 2
#define XNP
Q #define BR0(i) vdup(((double*)BrF)[(i)*2])
S #define BT0(i) vdup(((double*)BtF)[(i)*2])
T #define BP0(i) vdup(((double*)BpF)[(i)*2])
#else
#define STEP 1
#define XNP *np
Q #define BR0(i) vdup(Vr[i])
S #define BT0(i) vdup(Vt[i])
T #define BP0(i) vdup(Vp[i])
#endif
do { // compute symmetric and antisymmetric parts.
Q s2d a = BR0(i); s2d b = BR0(NLAT-1-i);
S s2d c = BT0(i); s2d d = BT0(NLAT-1-i);
T s2d e = BP0(i); s2d f = BP0(NLAT-1-i);
#if _GCC_VEC_ && __SSE3__
s2d np = vdup(NPHI);
Q s2d a = vdup(((double*)BrF)[i*STEP]); s2d b = vdup(((double*)BrF)[(NLAT-1-i)*STEP]);
Q a = subadd(a,b) XNP;
Q a = subadd(a,b);
Q ((s2d*) reo0)[i] = a; // assume odd is first, then even.
Q r0 += zl[i] * vhi_to_dbl(a); // even part is used.
S s2d c = vdup(((double*)BtF)[i*STEP]); s2d d = vdup(((double*)BtF)[(NLAT-1-i)*STEP]);
S c = subadd(c,d) XNP; vteo0(i) = vxchg(c);
T s2d e = vdup(((double*)BpF)[i*STEP]); s2d f = vdup(((double*)BpF)[(NLAT-1-i)*STEP]);
T e = subadd(e,f) XNP; vpeo0(i) = vxchg(e);
S c = subadd(c,d); vteo0(i) = vxchg(c);
T e = subadd(e,f); vpeo0(i) = vxchg(e);
i++;
QX s2d g = vdup(((double*)BrF)[i*STEP]); s2d h = vdup(((double*)BrF)[(NLAT-1-i)*STEP]);
QX g = subadd(g,h) XNP;
QX s2d g = BR0(i); s2d h = BR0(NLAT-1-i);
QX g = subadd(g,h);
QX ((s2d*) reo0)[i] = g; // assume odd is first, then even.
QX r1 += zl[i] * vhi_to_dbl(g); // even part is used, reduce data dependency
QX i++;
#else
double np = NPHI;
Q double a = ((double*)BrF)[i*STEP]; double b = ((double*)BrF)[(NLAT-1-i)*STEP];
Q ro0(i) = (a-b) XNP; re0(i) = (a+b) XNP;
Q r0 += zl[i] * ((a+b) XNP);
S double c = ((double*)BtF)[i*STEP]; double d = ((double*)BtF)[(NLAT-1-i)*STEP];
S te0(i) = (c+d) XNP; to0(i) = (c-d) XNP;
T double e = ((double*)BpF)[i*STEP]; double f = ((double*)BpF)[(NLAT-1-i)*STEP];
T pe0(i) = (e+f) XNP; po0(i) = (e-f) XNP;
Q ro0(i) = (a-b); re0(i) = (a+b);
Q r0 += zl[i] * (a+b);
S te0(i) = (c+d); to0(i) = (c-d);
T pe0(i) = (e+f); po0(i) = (e-f);
i++;
#endif
} while(i<ni);
#undef STEP
#undef XNP
Q #undef BR0
S #undef BT0
T #undef BP0
QX r0 += r1;
QX ro0(ni) = 0.0; re0(ni) = 0.0; // allow some overflow.
Q zl += ni + (ni&1); // SSE alignement
......@@ -317,9 +303,6 @@ Q v2d* Ql = (v2d*) Qlm; // virtual pointer for l=0 and im
S v2d* Sl = (v2d*) Slm; // virtual pointer for l=0 and im
T v2d* Tl = (v2d*) Tlm; // virtual pointer for l=0 and im
V dzl0 = (double *) shtns->dzlm[0]; // only theta derivative (d/dphi = 0 for m=0)
Q BrF += NLAT;
S BtF += NLAT;
T BpF += NLAT;
Q ((complex double *)Ql)[0] = r0;
S Sl[0] = vdup(0.0); // l=0 is zero for the vector transform.
T Tl[0] = vdup(0.0); // l=0 is zero for the vector transform.
......@@ -412,6 +395,8 @@ T Tl[l] = vdup(0.0);
#endif // ifndef SHT_NO_DCT
#ifndef SHT_AXISYM
for (im=1;im<=imlim;im++) {
Q BrF += NLAT;
V BtF += NLAT; BpF += NLAT;
i0 = shtns->tm[im];
i=i0;
do { // compute symmetric and antisymmetric parts.
......@@ -429,8 +414,6 @@ V v2d* Sl = (v2d*) &Slm[l]; v2d* Tl = (v2d*) &Tlm[l];
3 double m_1 = 1.0/l;
Q zl = shtns->zlm[im];
V dzl = shtns->dzlm[im];
Q BrF += NLAT;
V BtF += NLAT; BpF += NLAT;
while (l<llim) { // ops : NLAT/2 * (2*(LMAX-m+1) + 4) : almost twice as fast.
Q v2d q0 = vdup(0.0);
Q v2d q1 = vdup(0.0);
......@@ -503,8 +486,8 @@ V ((v2d*)Slm)[l] = vdup(0.0); ((v2d*)Tlm)[l] = vdup(0.0);
#endif
if (shtns->ncplx_fft > 0) { // free memory
Q VFREE(BrF - NLAT*(imlim+1));
VX VFREE(BtF - NLAT*(imlim+1)); // this frees also BpF.
Q VFREE(BrF - NLAT*imlim);
VX VFREE(BtF - NLAT*imlim); // this frees also BpF.
}
#endif
......
......@@ -3,19 +3,32 @@
echo "beginning test suite" > test_suite.log
for switch in "-quickinit" "-gauss" "-reg" "-fly"
for switch in "" "-oop" "-transpose" "-schmidt" "-4pi"
do
for lmax in 1 2 3 4 11 12 13 14 31 32 33 34 121 122 123 124
for mode in "-quickinit" "-gauss" "-reg" "-fly"
do
for mmax in 0 1 $lmax
for lmax in 1 2 3 4 11 12 13 14 31 32 33 34 121 122 123 124
do
c="./time_SHT $lmax -mmax=$mmax $switch -iter=1"
echo $c
echo "---" >> test_suite.log
echo "*** $c *** " >> test_suite.log
$c > tmp.out
cat tmp.out | grep ERROR
cat tmp.out >> test_suite.log
for mmax in 0 1 $lmax
do
c="./time_SHT $lmax -mmax=$mmax $mode $switch -iter=1"
echo $c
echo "---" >> test_suite.log
echo "*** $c *** " >> test_suite.log
$c > tmp.out
cat tmp.out | grep ERROR
cat tmp.out >> test_suite.log
done
done
done
done
# do also a huge transform :
c="./time_SHT 2047 -mres=15 -quickinit -iter=1"
echo $c
echo "---" >> test_suite.log
echo "*** $c *** " >> test_suite.log
$c > tmp.out
cat tmp.out | grep ERROR
cat tmp.out >> test_suite.log
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment