-
-
Save rouault/3bbc31c9f12391d79920 to your computer and use it in GitHub Desktop.
Code from PJ_tmer.c + pj_mlfn.c ported to SSE2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Download sleef 2.80 | |
// Put this file as testprojsse2.cpp into sleef-2.80/simd directory | |
// and compile with g++ -DENABLE_SSE2 -O2 testprojsse2.cpp sleefsimddp.c -I. -o testprojsse2 | |
#include <math.h> | |
#include <stdio.h> | |
extern "C" | |
{ | |
#if defined(ENABLE_SSE2) | |
#include "sleefsimd.h" | |
#if defined (__GNUC__) || defined (__INTEL_COMPILER) || defined (__clang__) | |
#define INLINE __attribute__((always_inline)) | |
#elif defined(_MSC_VER) | |
#define INLINE __inline | |
#else | |
#define INLINE inline | |
#endif | |
#include "helpersse2.h" | |
#endif | |
} | |
#if defined(_MSC_VER) | |
#define ALIGN(x) _CRT_ALIGN(x) | |
#else | |
#define ALIGN(x) __attribute__ ((aligned (x))) | |
#endif | |
#ifndef M_PI | |
# define M_PI 3.14159265358979323846 /* pi */ | |
#endif | |
#define HALFPI (M_PI/2) | |
#define EPS10 1.e-10 | |
#define FC1 1. | |
#define FC2 .5 | |
#define FC3 .16666666666666666666 | |
#define FC4 .08333333333333333333 | |
#define FC5 .05 | |
#define FC6 .03333333333333333333 | |
#define FC7 .02380952380952380952 | |
#define FC8 .01785714285714285714 | |
#define EN_SIZE 5 | |
#define C00 1. | |
#define C02 .25 | |
#define C04 .046875 | |
#define C06 .01953125 | |
#define C08 .01068115234375 | |
#define C22 .75 | |
#define C44 .46875 | |
#define C46 .01302083333333333333 | |
#define C48 .00712076822916666666 | |
#define C66 .36458333333333333333 | |
#define C68 .00569661458333333333 | |
#define C88 .3076171875 | |
#define EPS 1e-11 | |
#define MAX_ITER 10 | |
void compute_mlfn_coeffs(double* en, double es) | |
{ | |
double t; | |
en[0] = C00 - es * (C02 + es * (C04 + es * (C06 + es * C08))); | |
en[1] = es * (C22 - es * (C04 + es * (C06 + es * C08))); | |
en[2] = (t = es * es) * (C44 - es * (C46 + es * C48)); | |
en[3] = (t *= es) * (C66 - es * C68); | |
en[4] = t * es * C88; | |
} | |
typedef struct { double x, y; } XY; | |
typedef struct { double lam, phi; } LP; | |
#define L(x) XMMReg2Double::Load(x) | |
class XMMReg2Double | |
{ | |
public: | |
__m128d xmm; | |
XMMReg2Double() {} | |
XMMReg2Double(const XMMReg2Double& other) : xmm(other.xmm) {} | |
static inline XMMReg2Double Load(double d) | |
{ | |
XMMReg2Double reg; | |
reg.xmm = _mm_set1_pd(d); | |
return reg; | |
} | |
static inline XMMReg2Double Load(double low, double high) | |
{ | |
XMMReg2Double reg; | |
reg.xmm = _mm_set_pd(high, low); | |
return reg; | |
} | |
inline const XMMReg2Double& operator= (const XMMReg2Double& other) | |
{ | |
xmm = other.xmm; | |
return *this; | |
} | |
inline const XMMReg2Double& operator+= (const XMMReg2Double& other) | |
{ | |
xmm = _mm_add_pd(xmm, other.xmm); | |
return *this; | |
} | |
inline XMMReg2Double operator+ (const XMMReg2Double& other) const | |
{ | |
XMMReg2Double ret; | |
ret.xmm = _mm_add_pd(xmm, other.xmm); | |
return ret; | |
} | |
inline XMMReg2Double operator- (const XMMReg2Double& other) const | |
{ | |
XMMReg2Double ret; | |
ret.xmm = _mm_sub_pd(xmm, other.xmm); | |
return ret; | |
} | |
inline XMMReg2Double operator* (const XMMReg2Double& other) const | |
{ | |
XMMReg2Double ret; | |
ret.xmm = _mm_mul_pd(xmm, other.xmm); | |
return ret; | |
} | |
inline XMMReg2Double operator/ (const XMMReg2Double& other) const | |
{ | |
XMMReg2Double ret; | |
ret.xmm = _mm_div_pd(xmm, other.xmm); | |
return ret; | |
} | |
inline const XMMReg2Double& operator-= (const XMMReg2Double& other) | |
{ | |
xmm = _mm_sub_pd(xmm, other.xmm); | |
return *this; | |
} | |
inline const XMMReg2Double& operator*= (const XMMReg2Double& other) | |
{ | |
xmm = _mm_mul_pd(xmm, other.xmm); | |
return *this; | |
} | |
inline const XMMReg2Double& operator*= (const double& other) | |
{ | |
xmm = _mm_mul_pd(xmm, L(other).xmm); | |
return *this; | |
} | |
inline XMMReg2Double sqrt() const | |
{ | |
XMMReg2Double ret; | |
ret.xmm = _mm_sqrt_pd(xmm); | |
return ret; | |
} | |
inline XMMReg2Double fabs() const | |
{ | |
XMMReg2Double ret; | |
ret.xmm = _mm_andnot_pd(_mm_set_pd(-0.0,-0.0), xmm); | |
return ret; | |
} | |
inline double castToDouble() const | |
{ | |
double val; | |
_mm_store_sd(&val, xmm); | |
return val; | |
} | |
inline double low() const | |
{ | |
return castToDouble(); | |
} | |
inline double high() const | |
{ | |
XMMReg2Double tmp; | |
tmp.xmm = _mm_unpackhi_pd(xmm, xmm); | |
return tmp.castToDouble(); | |
} | |
friend vmask operator<= (const XMMReg2Double& reg1, const XMMReg2Double& reg2); | |
friend vmask operator< (const XMMReg2Double& reg1, const XMMReg2Double& reg2); | |
friend vmask operator>= (const XMMReg2Double& reg1, const XMMReg2Double& reg2); | |
friend vmask operator> (const XMMReg2Double& reg1, const XMMReg2Double& reg2); | |
friend vmask operator<= (const XMMReg2Double& reg1, const double& reg2); | |
friend vmask operator< (const XMMReg2Double& reg1, const double& reg2); | |
friend vmask operator>= (const XMMReg2Double& reg1, const double& reg2); | |
friend vmask operator> (const XMMReg2Double& reg1, const double& reg2); | |
friend XMMReg2Double operator+ (const XMMReg2Double& reg1, double val2); | |
friend XMMReg2Double operator- (const XMMReg2Double& reg1, double val2); | |
friend XMMReg2Double operator* (const XMMReg2Double& reg1, double val2); | |
friend XMMReg2Double operator+ (double val1, const XMMReg2Double& reg2); | |
friend XMMReg2Double operator- (double val1, const XMMReg2Double& reg2); | |
friend XMMReg2Double operator* (double val1, const XMMReg2Double& reg2); | |
friend XMMReg2Double operator/ (double val1, const XMMReg2Double& reg2); | |
}; | |
vmask operator<= (const XMMReg2Double& reg1, const XMMReg2Double& reg2) | |
{ | |
return _mm_castpd_si128(_mm_cmple_pd(reg1.xmm, reg2.xmm)); | |
} | |
vmask operator< (const XMMReg2Double& reg1, const XMMReg2Double& reg2) | |
{ | |
return _mm_castpd_si128(_mm_cmplt_pd(reg1.xmm, reg2.xmm)); | |
} | |
vmask operator>= (const XMMReg2Double& reg1, const XMMReg2Double& reg2) | |
{ | |
return _mm_castpd_si128(_mm_cmpge_pd(reg1.xmm, reg2.xmm)); | |
} | |
vmask operator> (const XMMReg2Double& reg1, const XMMReg2Double& reg2) | |
{ | |
return _mm_castpd_si128(_mm_cmpgt_pd(reg1.xmm, reg2.xmm)); | |
} | |
vmask operator<= (const XMMReg2Double& reg1, const double& reg2) | |
{ | |
return _mm_castpd_si128(_mm_cmple_pd(reg1.xmm, L(reg2).xmm)); | |
} | |
vmask operator< (const XMMReg2Double& reg1, const double& reg2) | |
{ | |
return _mm_castpd_si128(_mm_cmplt_pd(reg1.xmm, L(reg2).xmm)); | |
} | |
vmask operator>= (const XMMReg2Double& reg1, const double& reg2) | |
{ | |
return _mm_castpd_si128(_mm_cmpge_pd(reg1.xmm, L(reg2).xmm)); | |
} | |
vmask operator> (const XMMReg2Double& reg1, const double& reg2) | |
{ | |
return _mm_castpd_si128(_mm_cmpgt_pd(reg1.xmm, L(reg2).xmm)); | |
} | |
XMMReg2Double operator+ (const XMMReg2Double& reg1, double val2) | |
{ | |
XMMReg2Double ret; | |
ret = reg1 + L(val2); | |
return ret; | |
} | |
XMMReg2Double operator- (const XMMReg2Double& reg1, double val2) | |
{ | |
XMMReg2Double ret; | |
ret = reg1 - L(val2); | |
return ret; | |
} | |
XMMReg2Double operator* (const XMMReg2Double& reg1, double val2) | |
{ | |
XMMReg2Double ret; | |
ret = reg1 * L(val2); | |
return ret; | |
} | |
XMMReg2Double operator+ (double val1, const XMMReg2Double& reg2) | |
{ | |
XMMReg2Double ret; | |
ret = L(val1) + reg2; | |
return ret; | |
} | |
XMMReg2Double operator- (double val1, const XMMReg2Double& reg2) | |
{ | |
XMMReg2Double ret; | |
ret = L(val1) - reg2; | |
return ret; | |
} | |
XMMReg2Double operator* (double val1, const XMMReg2Double& reg2) | |
{ | |
XMMReg2Double ret; | |
ret = L(val1) * reg2; | |
return ret; | |
} | |
XMMReg2Double operator/ (double val1, const XMMReg2Double& reg2) | |
{ | |
XMMReg2Double ret; | |
ret = L(val1) / reg2; | |
return ret; | |
} | |
static inline XMMReg2Double ternary(vmask cond, const XMMReg2Double& val_true, const XMMReg2Double& val_false) | |
{ | |
XMMReg2Double ret; | |
ret.xmm = vsel_vd_vm_vd_vd(cond, val_true.xmm, val_false.xmm); | |
return ret; | |
} | |
static inline XMMReg2Double ternary(vmask cond, const XMMReg2Double& val_true, const double& val_false) | |
{ | |
XMMReg2Double ret; | |
ret.xmm = vsel_vd_vm_vd_vd(cond, val_true.xmm, L(val_false).xmm); | |
return ret; | |
} | |
static inline XMMReg2Double ternary(vmask cond, const double& val_true, const XMMReg2Double& val_false) | |
{ | |
XMMReg2Double ret; | |
ret.xmm = vsel_vd_vm_vd_vd(cond, L(val_true).xmm, val_false.xmm); | |
return ret; | |
} | |
static inline XMMReg2Double sqrt(const XMMReg2Double& x) | |
{ | |
return x.sqrt(); | |
} | |
static inline XMMReg2Double fabs(const XMMReg2Double& x) | |
{ | |
return x.fabs(); | |
} | |
static inline bool TrueForAllMembers(const vmask& cond) | |
{ | |
return _mm_movemask_pd(vreinterpret_vd_vm(cond)) == 3; | |
} | |
double | |
pj_mlfn(double phi, double sphi, double cphi, double *en) | |
{ | |
cphi *= sphi; | |
sphi *= sphi; | |
return(en[0] * phi - cphi * (en[1] + sphi*(en[2] | |
+ sphi*(en[3] + sphi*en[4])))); | |
} | |
XMMReg2Double inline | |
pj_mlfn(const XMMReg2Double& phi, const XMMReg2Double& sphi_, const XMMReg2Double& cphi_, const XMMReg2Double* en) | |
{ | |
XMMReg2Double cphi(cphi_); | |
XMMReg2Double sphi(sphi_); | |
cphi *= sphi; | |
sphi *= sphi; | |
return(en[0] * phi - cphi * (en[1] + sphi*(en[2] | |
+ sphi*(en[3] + sphi*en[4])))); | |
} | |
void inline sincos(const XMMReg2Double& d, XMMReg2Double* sind, XMMReg2Double* cosd) | |
{ | |
#ifdef SLOW | |
// x87 variant | |
double sinphi_low, cosphi_low, sinphi_high, cosphi_high; | |
sincos(d.low(), &sinphi_low, &cosphi_low); | |
sincos(d.high(), &sinphi_high, &cosphi_high); | |
*sind = XMMReg2Double::Load(sinphi_low, sinphi_high); | |
*cosd = XMMReg2Double::Load(cosphi_low, cosphi_high); | |
#else | |
// sleef xsincos() | |
vdouble2 sincosres = xsincos((vdouble)d.xmm); | |
sind->xmm = sincosres.x; | |
cosd->xmm = sincosres.y; | |
#endif | |
} | |
XMMReg2Double inline | |
pj_inv_mlfn(const XMMReg2Double* en, const XMMReg2Double& arg, const XMMReg2Double& es, | |
const XMMReg2Double& _1_div_1_m_es) | |
{ | |
XMMReg2Double s, t, phi, k = _1_div_1_m_es; | |
int i; | |
phi = arg; | |
for (i = MAX_ITER; i ; --i) /* rarely goes over 2 iterations */ | |
{ | |
XMMReg2Double s, cphi; | |
sincos(phi, &s, &cphi); | |
t = 1 - es * s * s; | |
phi -= t = (pj_mlfn(phi, s, cphi, en) - arg) * (t * sqrt(t)) * k; | |
if( TrueForAllMembers(fabs(t) < L(EPS)) ) | |
return phi; | |
} | |
//pj_ctx_set_errno( ctx, -17 ); | |
return phi; | |
} | |
int main(int argc, char* argv[]) | |
{ | |
double y0 = 0; | |
double x0 = 500000.; | |
double lam0 = -3; | |
double k0 = 0.9996; | |
double inv_k0 = 1. / k0; | |
double phi0 = 0.; | |
double a = 6378137; | |
double ra = 1./a; | |
double invf = 298.257223563; | |
double f = 1 / invf; | |
double es = 2 *f - f*f; | |
double en[EN_SIZE]; | |
double ml0, esp; | |
int i; | |
double acc = 0; | |
double _1_div_1_m_es = 1. / (1.-es); | |
compute_mlfn_coeffs(en, es); | |
ml0 = pj_mlfn(phi0, sin(phi0), cos(phi0), en); | |
esp = es / (1. - es); | |
XMMReg2Double ALIGN(16) v_en[5]; | |
for(int j=0;j<5;j++) | |
v_en[j] = XMMReg2Double::Load(en[j]); | |
#define NITER (100*1000*1000) | |
for(i=0;i<NITER;i+=2) | |
{ | |
LP lp; | |
XY xy; | |
XMMReg2Double phi, lam, n, con, cosphi, invcosphi, d, ds, sinphi, t; | |
XMMReg2Double xy_x, xy_y; | |
xy_x = XMMReg2Double::Load((1000000.*i) / NITER, (1000000.*(i+1)) / NITER); | |
xy_y = XMMReg2Double::Load((9000000.*i) / NITER, (9000000.*(i+1)) / NITER); | |
//xy_x = XMMReg2Double::Load(573142.012282725-x0,573142.012282725-x0); | |
//xy_y = XMMReg2Double::Load(5427937.52346616,5427937.52346616); | |
xy_x *= ra; | |
xy_y *= ra; | |
phi = pj_inv_mlfn(v_en, ml0 + xy_y * inv_k0, L(es), L(_1_div_1_m_es) ); | |
vmask at_pole = fabs(phi) >= HALFPI; | |
sincos(phi, &sinphi, &cosphi); | |
invcosphi = 1.0 / cosphi; | |
t = ternary( fabs(cosphi) > 1e-10, sinphi * invcosphi, 0 ); | |
n = esp * cosphi * cosphi; | |
d = xy_x * sqrt(con = 1. - es * sinphi * sinphi) * inv_k0; | |
con *= t; | |
t *= t; | |
ds = d * d; | |
phi -= (con * ds * _1_div_1_m_es) * FC2 * (1. - | |
ds * FC4 * (5. + t * (3. - 9. * n) + n * (1. - 4 * n) - | |
ds * FC6 * (61. + t * (90. - 252. * n + | |
45. * t) + 46. * n | |
- ds * FC8 * (1385. + t * (3633. + t * (4095. + 1574. * t)) ) | |
))); | |
lam = d*(FC1 - | |
ds*FC3*( 1. + 2.*t + n - | |
ds*FC5*(5. + t*(28. + 24.*t + 8.*n) + 6.*n | |
- ds * FC7 * (61. + t * (662. + t * (1320. + 720. * t)) ) | |
))) * invcosphi; | |
//if (fabs(lp.phi) >= HALFPI) { | |
// lp.phi = xy.y < 0. ? -HALFPI : HALFPI; | |
// lp.lam = 0.; | |
phi = ternary(at_pole, | |
ternary(xy_y < 0, L(-HALFPI), L(HALFPI)), phi); | |
lam = ternary(at_pole, 0, lam); | |
//if( (i % (NITER/100)) == 0 ) | |
// printf("%.16g %.16g\n", lam.low() / M_PI * 180, phi.low() / M_PI * 180); | |
acc += phi.low() + lam.low(); | |
acc += phi.high() + lam.high(); | |
} | |
printf("Average of all longitude and latitudes: %.16g\n", acc / NITER); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment