Skip to content

Instantly share code, notes, and snippets.

@rouault
Created June 23, 2015 14:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rouault/3bbc31c9f12391d79920 to your computer and use it in GitHub Desktop.
Save rouault/3bbc31c9f12391d79920 to your computer and use it in GitHub Desktop.
Code from PJ_tmer.c + pj_mlfn.c ported to SSE2
// Download sleef 2.80
// Put this file as testprojsse2.cpp into sleef-2.80/simd directory
// and compile with g++ -DENABLE_SSE2 -O2 testprojsse2.cpp sleefsimddp.c -I. -o testprojsse2
#include <math.h>
#include <stdio.h>
extern "C"
{
#if defined(ENABLE_SSE2)
#include "sleefsimd.h"
#if defined (__GNUC__) || defined (__INTEL_COMPILER) || defined (__clang__)
#define INLINE __attribute__((always_inline))
#elif defined(_MSC_VER)
#define INLINE __inline
#else
#define INLINE inline
#endif
#include "helpersse2.h"
#endif
}
#if defined(_MSC_VER)
#define ALIGN(x) _CRT_ALIGN(x)
#else
#define ALIGN(x) __attribute__ ((aligned (x)))
#endif
#ifndef M_PI
# define M_PI 3.14159265358979323846 /* pi */
#endif
#define HALFPI (M_PI/2)
#define EPS10 1.e-10
#define FC1 1.
#define FC2 .5
#define FC3 .16666666666666666666
#define FC4 .08333333333333333333
#define FC5 .05
#define FC6 .03333333333333333333
#define FC7 .02380952380952380952
#define FC8 .01785714285714285714
#define EN_SIZE 5
#define C00 1.
#define C02 .25
#define C04 .046875
#define C06 .01953125
#define C08 .01068115234375
#define C22 .75
#define C44 .46875
#define C46 .01302083333333333333
#define C48 .00712076822916666666
#define C66 .36458333333333333333
#define C68 .00569661458333333333
#define C88 .3076171875
#define EPS 1e-11
#define MAX_ITER 10
void compute_mlfn_coeffs(double* en, double es)
{
double t;
en[0] = C00 - es * (C02 + es * (C04 + es * (C06 + es * C08)));
en[1] = es * (C22 - es * (C04 + es * (C06 + es * C08)));
en[2] = (t = es * es) * (C44 - es * (C46 + es * C48));
en[3] = (t *= es) * (C66 - es * C68);
en[4] = t * es * C88;
}
typedef struct { double x, y; } XY;
typedef struct { double lam, phi; } LP;
#define L(x) XMMReg2Double::Load(x)
class XMMReg2Double
{
public:
__m128d xmm;
XMMReg2Double() {}
XMMReg2Double(const XMMReg2Double& other) : xmm(other.xmm) {}
static inline XMMReg2Double Load(double d)
{
XMMReg2Double reg;
reg.xmm = _mm_set1_pd(d);
return reg;
}
static inline XMMReg2Double Load(double low, double high)
{
XMMReg2Double reg;
reg.xmm = _mm_set_pd(high, low);
return reg;
}
inline const XMMReg2Double& operator= (const XMMReg2Double& other)
{
xmm = other.xmm;
return *this;
}
inline const XMMReg2Double& operator+= (const XMMReg2Double& other)
{
xmm = _mm_add_pd(xmm, other.xmm);
return *this;
}
inline XMMReg2Double operator+ (const XMMReg2Double& other) const
{
XMMReg2Double ret;
ret.xmm = _mm_add_pd(xmm, other.xmm);
return ret;
}
inline XMMReg2Double operator- (const XMMReg2Double& other) const
{
XMMReg2Double ret;
ret.xmm = _mm_sub_pd(xmm, other.xmm);
return ret;
}
inline XMMReg2Double operator* (const XMMReg2Double& other) const
{
XMMReg2Double ret;
ret.xmm = _mm_mul_pd(xmm, other.xmm);
return ret;
}
inline XMMReg2Double operator/ (const XMMReg2Double& other) const
{
XMMReg2Double ret;
ret.xmm = _mm_div_pd(xmm, other.xmm);
return ret;
}
inline const XMMReg2Double& operator-= (const XMMReg2Double& other)
{
xmm = _mm_sub_pd(xmm, other.xmm);
return *this;
}
inline const XMMReg2Double& operator*= (const XMMReg2Double& other)
{
xmm = _mm_mul_pd(xmm, other.xmm);
return *this;
}
inline const XMMReg2Double& operator*= (const double& other)
{
xmm = _mm_mul_pd(xmm, L(other).xmm);
return *this;
}
inline XMMReg2Double sqrt() const
{
XMMReg2Double ret;
ret.xmm = _mm_sqrt_pd(xmm);
return ret;
}
inline XMMReg2Double fabs() const
{
XMMReg2Double ret;
ret.xmm = _mm_andnot_pd(_mm_set_pd(-0.0,-0.0), xmm);
return ret;
}
inline double castToDouble() const
{
double val;
_mm_store_sd(&val, xmm);
return val;
}
inline double low() const
{
return castToDouble();
}
inline double high() const
{
XMMReg2Double tmp;
tmp.xmm = _mm_unpackhi_pd(xmm, xmm);
return tmp.castToDouble();
}
friend vmask operator<= (const XMMReg2Double& reg1, const XMMReg2Double& reg2);
friend vmask operator< (const XMMReg2Double& reg1, const XMMReg2Double& reg2);
friend vmask operator>= (const XMMReg2Double& reg1, const XMMReg2Double& reg2);
friend vmask operator> (const XMMReg2Double& reg1, const XMMReg2Double& reg2);
friend vmask operator<= (const XMMReg2Double& reg1, const double& reg2);
friend vmask operator< (const XMMReg2Double& reg1, const double& reg2);
friend vmask operator>= (const XMMReg2Double& reg1, const double& reg2);
friend vmask operator> (const XMMReg2Double& reg1, const double& reg2);
friend XMMReg2Double operator+ (const XMMReg2Double& reg1, double val2);
friend XMMReg2Double operator- (const XMMReg2Double& reg1, double val2);
friend XMMReg2Double operator* (const XMMReg2Double& reg1, double val2);
friend XMMReg2Double operator+ (double val1, const XMMReg2Double& reg2);
friend XMMReg2Double operator- (double val1, const XMMReg2Double& reg2);
friend XMMReg2Double operator* (double val1, const XMMReg2Double& reg2);
friend XMMReg2Double operator/ (double val1, const XMMReg2Double& reg2);
};
vmask operator<= (const XMMReg2Double& reg1, const XMMReg2Double& reg2)
{
return _mm_castpd_si128(_mm_cmple_pd(reg1.xmm, reg2.xmm));
}
vmask operator< (const XMMReg2Double& reg1, const XMMReg2Double& reg2)
{
return _mm_castpd_si128(_mm_cmplt_pd(reg1.xmm, reg2.xmm));
}
vmask operator>= (const XMMReg2Double& reg1, const XMMReg2Double& reg2)
{
return _mm_castpd_si128(_mm_cmpge_pd(reg1.xmm, reg2.xmm));
}
vmask operator> (const XMMReg2Double& reg1, const XMMReg2Double& reg2)
{
return _mm_castpd_si128(_mm_cmpgt_pd(reg1.xmm, reg2.xmm));
}
vmask operator<= (const XMMReg2Double& reg1, const double& reg2)
{
return _mm_castpd_si128(_mm_cmple_pd(reg1.xmm, L(reg2).xmm));
}
vmask operator< (const XMMReg2Double& reg1, const double& reg2)
{
return _mm_castpd_si128(_mm_cmplt_pd(reg1.xmm, L(reg2).xmm));
}
vmask operator>= (const XMMReg2Double& reg1, const double& reg2)
{
return _mm_castpd_si128(_mm_cmpge_pd(reg1.xmm, L(reg2).xmm));
}
vmask operator> (const XMMReg2Double& reg1, const double& reg2)
{
return _mm_castpd_si128(_mm_cmpgt_pd(reg1.xmm, L(reg2).xmm));
}
XMMReg2Double operator+ (const XMMReg2Double& reg1, double val2)
{
XMMReg2Double ret;
ret = reg1 + L(val2);
return ret;
}
XMMReg2Double operator- (const XMMReg2Double& reg1, double val2)
{
XMMReg2Double ret;
ret = reg1 - L(val2);
return ret;
}
XMMReg2Double operator* (const XMMReg2Double& reg1, double val2)
{
XMMReg2Double ret;
ret = reg1 * L(val2);
return ret;
}
XMMReg2Double operator+ (double val1, const XMMReg2Double& reg2)
{
XMMReg2Double ret;
ret = L(val1) + reg2;
return ret;
}
XMMReg2Double operator- (double val1, const XMMReg2Double& reg2)
{
XMMReg2Double ret;
ret = L(val1) - reg2;
return ret;
}
XMMReg2Double operator* (double val1, const XMMReg2Double& reg2)
{
XMMReg2Double ret;
ret = L(val1) * reg2;
return ret;
}
XMMReg2Double operator/ (double val1, const XMMReg2Double& reg2)
{
XMMReg2Double ret;
ret = L(val1) / reg2;
return ret;
}
static inline XMMReg2Double ternary(vmask cond, const XMMReg2Double& val_true, const XMMReg2Double& val_false)
{
XMMReg2Double ret;
ret.xmm = vsel_vd_vm_vd_vd(cond, val_true.xmm, val_false.xmm);
return ret;
}
static inline XMMReg2Double ternary(vmask cond, const XMMReg2Double& val_true, const double& val_false)
{
XMMReg2Double ret;
ret.xmm = vsel_vd_vm_vd_vd(cond, val_true.xmm, L(val_false).xmm);
return ret;
}
static inline XMMReg2Double ternary(vmask cond, const double& val_true, const XMMReg2Double& val_false)
{
XMMReg2Double ret;
ret.xmm = vsel_vd_vm_vd_vd(cond, L(val_true).xmm, val_false.xmm);
return ret;
}
static inline XMMReg2Double sqrt(const XMMReg2Double& x)
{
return x.sqrt();
}
static inline XMMReg2Double fabs(const XMMReg2Double& x)
{
return x.fabs();
}
static inline bool TrueForAllMembers(const vmask& cond)
{
return _mm_movemask_pd(vreinterpret_vd_vm(cond)) == 3;
}
double
pj_mlfn(double phi, double sphi, double cphi, double *en)
{
cphi *= sphi;
sphi *= sphi;
return(en[0] * phi - cphi * (en[1] + sphi*(en[2]
+ sphi*(en[3] + sphi*en[4]))));
}
XMMReg2Double inline
pj_mlfn(const XMMReg2Double& phi, const XMMReg2Double& sphi_, const XMMReg2Double& cphi_, const XMMReg2Double* en)
{
XMMReg2Double cphi(cphi_);
XMMReg2Double sphi(sphi_);
cphi *= sphi;
sphi *= sphi;
return(en[0] * phi - cphi * (en[1] + sphi*(en[2]
+ sphi*(en[3] + sphi*en[4]))));
}
void inline sincos(const XMMReg2Double& d, XMMReg2Double* sind, XMMReg2Double* cosd)
{
#ifdef SLOW
// x87 variant
double sinphi_low, cosphi_low, sinphi_high, cosphi_high;
sincos(d.low(), &sinphi_low, &cosphi_low);
sincos(d.high(), &sinphi_high, &cosphi_high);
*sind = XMMReg2Double::Load(sinphi_low, sinphi_high);
*cosd = XMMReg2Double::Load(cosphi_low, cosphi_high);
#else
// sleef xsincos()
vdouble2 sincosres = xsincos((vdouble)d.xmm);
sind->xmm = sincosres.x;
cosd->xmm = sincosres.y;
#endif
}
XMMReg2Double inline
pj_inv_mlfn(const XMMReg2Double* en, const XMMReg2Double& arg, const XMMReg2Double& es,
const XMMReg2Double& _1_div_1_m_es)
{
XMMReg2Double s, t, phi, k = _1_div_1_m_es;
int i;
phi = arg;
for (i = MAX_ITER; i ; --i) /* rarely goes over 2 iterations */
{
XMMReg2Double s, cphi;
sincos(phi, &s, &cphi);
t = 1 - es * s * s;
phi -= t = (pj_mlfn(phi, s, cphi, en) - arg) * (t * sqrt(t)) * k;
if( TrueForAllMembers(fabs(t) < L(EPS)) )
return phi;
}
//pj_ctx_set_errno( ctx, -17 );
return phi;
}
int main(int argc, char* argv[])
{
double y0 = 0;
double x0 = 500000.;
double lam0 = -3;
double k0 = 0.9996;
double inv_k0 = 1. / k0;
double phi0 = 0.;
double a = 6378137;
double ra = 1./a;
double invf = 298.257223563;
double f = 1 / invf;
double es = 2 *f - f*f;
double en[EN_SIZE];
double ml0, esp;
int i;
double acc = 0;
double _1_div_1_m_es = 1. / (1.-es);
compute_mlfn_coeffs(en, es);
ml0 = pj_mlfn(phi0, sin(phi0), cos(phi0), en);
esp = es / (1. - es);
XMMReg2Double ALIGN(16) v_en[5];
for(int j=0;j<5;j++)
v_en[j] = XMMReg2Double::Load(en[j]);
#define NITER (100*1000*1000)
for(i=0;i<NITER;i+=2)
{
LP lp;
XY xy;
XMMReg2Double phi, lam, n, con, cosphi, invcosphi, d, ds, sinphi, t;
XMMReg2Double xy_x, xy_y;
xy_x = XMMReg2Double::Load((1000000.*i) / NITER, (1000000.*(i+1)) / NITER);
xy_y = XMMReg2Double::Load((9000000.*i) / NITER, (9000000.*(i+1)) / NITER);
//xy_x = XMMReg2Double::Load(573142.012282725-x0,573142.012282725-x0);
//xy_y = XMMReg2Double::Load(5427937.52346616,5427937.52346616);
xy_x *= ra;
xy_y *= ra;
phi = pj_inv_mlfn(v_en, ml0 + xy_y * inv_k0, L(es), L(_1_div_1_m_es) );
vmask at_pole = fabs(phi) >= HALFPI;
sincos(phi, &sinphi, &cosphi);
invcosphi = 1.0 / cosphi;
t = ternary( fabs(cosphi) > 1e-10, sinphi * invcosphi, 0 );
n = esp * cosphi * cosphi;
d = xy_x * sqrt(con = 1. - es * sinphi * sinphi) * inv_k0;
con *= t;
t *= t;
ds = d * d;
phi -= (con * ds * _1_div_1_m_es) * FC2 * (1. -
ds * FC4 * (5. + t * (3. - 9. * n) + n * (1. - 4 * n) -
ds * FC6 * (61. + t * (90. - 252. * n +
45. * t) + 46. * n
- ds * FC8 * (1385. + t * (3633. + t * (4095. + 1574. * t)) )
)));
lam = d*(FC1 -
ds*FC3*( 1. + 2.*t + n -
ds*FC5*(5. + t*(28. + 24.*t + 8.*n) + 6.*n
- ds * FC7 * (61. + t * (662. + t * (1320. + 720. * t)) )
))) * invcosphi;
//if (fabs(lp.phi) >= HALFPI) {
// lp.phi = xy.y < 0. ? -HALFPI : HALFPI;
// lp.lam = 0.;
phi = ternary(at_pole,
ternary(xy_y < 0, L(-HALFPI), L(HALFPI)), phi);
lam = ternary(at_pole, 0, lam);
//if( (i % (NITER/100)) == 0 )
// printf("%.16g %.16g\n", lam.low() / M_PI * 180, phi.low() / M_PI * 180);
acc += phi.low() + lam.low();
acc += phi.high() + lam.high();
}
printf("Average of all longitude and latitudes: %.16g\n", acc / NITER);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment