Created
January 6, 2019 06:00
-
-
Save shibatch/70f48e153ecc9d586386dc1676d564cc to your computer and use it in GitHub Desktop.
Prototype of SleefVML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "sleef.h" | |
#define INLINE inline | |
// Below is a part of helperavx2.h | |
#define VECTLENDP 4 | |
#define VECTLENSP 8 | |
typedef __m256i vmask; | |
typedef __m256i vopmask; | |
typedef __m256d vdouble; | |
typedef __m128i vint; | |
typedef __m256 vfloat; | |
typedef __m256i vint2; | |
static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); } | |
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); } | |
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm); } | |
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); } | |
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); } | |
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm256_i32gather_pd(ptr, vi, 8); } | |
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } | |
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } | |
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); } | |
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); } | |
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); } | |
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); } | |
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); } | |
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); } | |
static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_mm256_set1_pd(-0.0), d); } | |
static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); } | |
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); } | |
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); } | |
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); } | |
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); } | |
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); } | |
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); } | |
static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); } | |
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); } | |
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); } | |
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); } | |
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); } | |
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm256_i32gather_ps(ptr, vi2, 4); } | |
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); } | |
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); } | |
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); } | |
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); } | |
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); } | |
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); } | |
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); } | |
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); } | |
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); } | |
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); } | |
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); } | |
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); } | |
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); } | |
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); } | |
// | |
#include <cmath> | |
#include <algorithm> | |
#define SleefVML_HA 0 // Use 1-ulp accuracy functions (default) | |
#define SleefVML_LA (1 << 0) // Use 3.5-ulp accuracy functions | |
#define SleefVML_MT (1 << 2) // Multithread | |
namespace SleefVML { | |
template<typename t0, typename t1> | |
class AddOp { | |
t0 const arg0; | |
t1 const arg1; | |
public: | |
AddOp(t0 const& arg0, t1 const& arg1) : arg0(arg0), arg1(arg1) {} | |
double evald1(int index, double** args) const { | |
return arg0.evald1(index, args) + arg1.evald1(index, args); | |
} | |
vdouble evalvd(int index, double** args) const { | |
return vadd_vd_vd_vd(arg0.evalvd(index, args), arg1.evalvd(index, args)); | |
} | |
float evalf1(int index, float** args) const { | |
return arg0.evalf1(index, args) + arg1.evalf1(index, args); | |
} | |
vfloat evalvf(int index, float** args) const { | |
return vadd_vf_vf_vf(arg0.evalvf(index, args), arg1.evalvf(index, args)); | |
} | |
}; | |
template<typename t0, typename t1> | |
static inline AddOp<t0, t1> add(t0 const& arg0, t1 const& arg1) { | |
return AddOp<t0, t1>(arg0, arg1); | |
} | |
// | |
template<typename t0, typename t1> | |
class SubOp { | |
t0 const arg0; | |
t1 const arg1; | |
public: | |
SubOp(t0 const& arg0, t1 const& arg1) : arg0(arg0), arg1(arg1) {} | |
double evald1(int index, double** args) const { | |
return arg0.evald1(index, args) - arg1.evald1(index, args); | |
} | |
vdouble evalvd(int index, double** args) const { | |
return vsub_vd_vd_vd(arg0.evalvd(index, args), arg1.evalvd(index, args)); | |
} | |
float evalf1(int index, float** args) const { | |
return arg0.evalf1(index, args) - arg1.evalf1(index, args); | |
} | |
vfloat evalvf(int index, float** args) const { | |
return vsub_vf_vf_vf(arg0.evalvf(index, args), arg1.evalvf(index, args)); | |
} | |
}; | |
template<typename t0, typename t1> | |
static inline SubOp<t0, t1> sub(t0 const& arg0, t1 const& arg1) { | |
return SubOp<t0, t1>(arg0, arg1); | |
} | |
// | |
template<typename t0, typename t1> | |
class MulOp { | |
t0 const arg0; | |
t1 const arg1; | |
public: | |
MulOp(t0 const& arg0, t1 const& arg1) : arg0(arg0), arg1(arg1) {} | |
double evald1(int index, double** args) const { | |
return arg0.evald1(index, args) * arg1.evald1(index, args); | |
} | |
vdouble evalvd(int index, double** args) const { | |
return vmul_vd_vd_vd(arg0.evalvd(index, args), arg1.evalvd(index, args)); | |
} | |
float evalf1(int index, float** args) const { | |
return arg0.evalf1(index, args) * arg1.evalf1(index, args); | |
} | |
vfloat evalvf(int index, float** args) const { | |
return vmul_vf_vf_vf(arg0.evalvf(index, args), arg1.evalvf(index, args)); | |
} | |
}; | |
template<typename t0, typename t1> | |
static inline MulOp<t0, t1> mul(t0 const& arg0, t1 const& arg1) { | |
return MulOp<t0, t1>(arg0, arg1); | |
} | |
// | |
template<typename t0, typename t1> | |
class DivOp { | |
t0 const arg0; | |
t1 const arg1; | |
public: | |
DivOp(t0 const& arg0, t1 const& arg1) : arg0(arg0), arg1(arg1) {} | |
double evald1(int index, double** args) const { | |
return arg0.evald1(index, args) / arg1.evald1(index, args); | |
} | |
vdouble evalvd(int index, double** args) const { | |
return vdiv_vd_vd_vd(arg0.evalvd(index, args), arg1.evalvd(index, args)); | |
} | |
float evalf1(int index, float** args) const { | |
return arg0.evalf1(index, args) / arg1.evalf1(index, args); | |
} | |
vfloat evalvf(int index, float** args) const { | |
return vdiv_vf_vf_vf(arg0.evalvf(index, args), arg1.evalvf(index, args)); | |
} | |
}; | |
template<typename t0, typename t1> | |
static inline DivOp<t0, t1> div(t0 const& arg0, t1 const& arg1) { | |
return DivOp<t0, t1>(arg0, arg1); | |
} | |
// | |
class ConstOp { | |
double c; | |
public : | |
ConstOp(const double& c) : c(c) {} | |
double evald1(int index, double** args) const { return c; } | |
vdouble evalvd(int index, double** args) const { return vcast_vd_d(c); } | |
float evalf1(int index, float** args) const { return c; } | |
vfloat evalvf(int index, float** args) const { return vcast_vf_f(c); } | |
}; | |
static inline ConstOp constant(double c) { return ConstOp(c); } | |
// | |
class LoadOp { | |
int n; | |
public : | |
LoadOp(int n) : n(n) {} | |
double evald1(int index, double** args) const { return args[n][index]; } | |
vdouble evalvd(int index, double** args) const { return vloadu_vd_p(&args[n][index]); } | |
float evalf1(int index, float** args) const { return args[n][index]; } | |
vfloat evalvf(int index, float** args) const { return vloadu_vf_p(&args[n][index]); } | |
}; | |
static inline LoadOp load(int n) { return LoadOp(n); } | |
// | |
template<typename t0, typename t1, typename t2, typename t3> | |
class SelLEOp { | |
t0 const arg0; | |
t1 const arg1; | |
t2 const arg2; | |
t3 const arg3; | |
public: | |
SelLEOp(t0 const& arg0, t1 const& arg1, t2 const& arg2, t3 const& arg3) : | |
arg0(arg0), arg1(arg1), arg2(arg2), arg3(arg3) {} | |
double evald1(int index, double** args) const { | |
return arg0.evalvd(index, args) <= arg1.evalvd(index, args) ? | |
arg2.evalvd(index, args) : arg3.evalvd(index, args); | |
} | |
vdouble evalvd(int index, double** args) const { | |
return vsel_vd_vo_vd_vd(vle_vo_vd_vd(arg0.evalvd(index, args), arg1.evalvd(index, args)), | |
arg2.evalvd(index, args), arg3.evalvd(index, args)); | |
} | |
float evalf1(int index, float** args) const { | |
return arg0.evalf1(index, args) <= arg1.evalf1(index, args) ? | |
arg2.evalf1(index, args) : arg3.evalf1(index, args); | |
} | |
vfloat evalvf(int index, float** args) const { | |
return vsel_vf_vo_vf_vf(vle_vo_vf_vf(arg0.evalvf(index, args), arg1.evalvf(index, args)), | |
arg2.evalvf(index, args), arg3.evalvf(index, args)); | |
} | |
}; | |
template<typename t0, typename t1, typename t2, typename t3> | |
static inline SelLEOp<t0, t1, t2, t3> selle(t0 const& arg0, t1 const& arg1, t2 const& arg2, t3 const& arg3) { | |
return SelLEOp<t0, t1, t2, t3>(arg0, arg1, arg2, arg3); | |
} | |
// | |
template<typename t0, typename t1, typename t2, typename t3> | |
class SelLTOp { | |
t0 const arg0; | |
t1 const arg1; | |
t2 const arg2; | |
t3 const arg3; | |
public: | |
SelLTOp(t0 const& arg0, t1 const& arg1, t2 const& arg2, t3 const& arg3) : | |
arg0(arg0), arg1(arg1), arg2(arg2), arg3(arg3) {} | |
double evald1(int index, double** args) const { | |
return arg0.evalvd(index, args) < arg1.evalvd(index, args) ? | |
arg2.evalvd(index, args) : arg3.evalvd(index, args); | |
} | |
vdouble evalvd(int index, double** args) const { | |
return vsel_vd_vo_vd_vd(vlt_vo_vd_vd(arg0.evalvd(index, args), arg1.evalvd(index, args)), | |
arg2.evalvd(index, args), arg3.evalvd(index, args)); | |
} | |
float evalf1(int index, float** args) const { | |
return arg0.evalf1(index, args) < arg1.evalf1(index, args) ? | |
arg2.evalf1(index, args) : arg3.evalf1(index, args); | |
} | |
vfloat evalvf(int index, float** args) const { | |
return vsel_vf_vo_vf_vf(vlt_vo_vf_vf(arg0.evalvf(index, args), arg1.evalvf(index, args)), | |
arg2.evalvf(index, args), arg3.evalvf(index, args)); | |
} | |
}; | |
template<typename t0, typename t1, typename t2, typename t3> | |
static inline SelLTOp<t0, t1, t2, t3> sellt(t0 const& arg0, t1 const& arg1, t2 const& arg2, t3 const& arg3) { | |
return SelLTOp<t0, t1, t2, t3>(arg0, arg1, arg2, arg3); | |
} | |
// | |
static inline float negf(float x) { return -x; } | |
static inline double neg(double x) { return -x; } | |
static inline float absf(float x) { return x < 0 ? -x : x; } | |
#define UNARYFUNC(fnd, fnf, vfnd, vfnf) \ | |
template<typename t0> class OpFunc_ ## fnd { \ | |
t0 const arg0; \ | |
public: \ | |
OpFunc_ ## fnd(t0 const& arg0) : arg0(arg0) {} \ | |
double evald1(int index, double** args) const { return fnd(arg0.evald1(index, args)); } \ | |
vdouble evalvd(int index, double** args) const { return vfnd(arg0.evalvd(index, args)); } \ | |
float evalf1(int index, float** args) const { return fnf(arg0.evalf1(index, args)); } \ | |
vfloat evalvf(int index, float** args) const { return vfnf(arg0.evalvf(index, args)); } \ | |
}; \ | |
template<typename t0> OpFunc_ ## fnd<t0> static inline fnd(t0 const& arg0) { return OpFunc_ ## fnd<t0>(arg0); } | |
UNARYFUNC(sqrt, sqrtf, vsqrt_vd_vd, vsqrt_vf_vf) | |
UNARYFUNC(abs, abs, vabs_vd_vd, vabs_vf_vf) | |
UNARYFUNC(max, maxf, vmax_vd_vd, vmax_vf_vf) | |
UNARYFUNC(min, minf, vmin_vd_vd, vmin_vf_vf) | |
UNARYFUNC(neg, negf, vneg_vd_vd, vneg_vf_vf) | |
UNARYFUNC(Sleef_exp_u10, Sleef_expf_u10, Sleef_expd4_u10avx2, Sleef_expf8_u10avx2) | |
UNARYFUNC(Sleef_log_u10, Sleef_logf_u10, Sleef_logd4_u10avx2, Sleef_logf8_u10avx2) | |
UNARYFUNC(Sleef_atan_u10, Sleef_atanf_u10, Sleef_atand4_u10avx2, Sleef_atanf8_u10avx2) | |
// | |
template<typename t0> | |
OpFunc_neg<t0> operator-(t0 const& arg0) { return neg(arg0); } | |
template<typename t0, typename t1> | |
AddOp<t0, t1> operator+(t0 const& arg0, t1 const& arg1) { return add(arg0, arg1); } | |
template<typename t0, typename t1> | |
SubOp<t0, t1> operator-(t0 const& arg0, t1 const& arg1) { return sub(arg0, arg1); } | |
template<typename t0, typename t1> | |
MulOp<t0, t1> operator*(t0 const& arg0, t1 const& arg1) { return mul(arg0, arg1); } | |
template<typename t0, typename t1> | |
DivOp<t0, t1> operator/(t0 const& arg0, t1 const& arg1) { return div(arg0, arg1); } | |
template<typename t0> | |
AddOp<ConstOp, t0> operator+(double const& arg0, t0 const& arg1) { return add(constant(arg0), arg1); } | |
template<typename t0> | |
SubOp<ConstOp, t0> operator-(double const& arg0, t0 const& arg1) { return sub(constant(arg0), arg1); } | |
template<typename t0> | |
MulOp<ConstOp, t0> operator*(double const& arg0, t0 const& arg1) { return mul(constant(arg0), arg1); } | |
template<typename t0> | |
DivOp<ConstOp, t0> operator/(double const& arg0, t0 const& arg1) { return div(constant(arg0), arg1); } | |
template<typename t0> | |
AddOp<t0, ConstOp> operator+(t0 const& arg0, double const& arg1) { return add(arg0, constant(arg1)); } | |
template<typename t0> | |
SubOp<t0, ConstOp> operator-(t0 const& arg0, double const& arg1) { return sub(arg0, constant(arg1)); } | |
template<typename t0> | |
MulOp<t0, ConstOp> operator*(t0 const& arg0, double const& arg1) { return mul(arg0, constant(arg1)); } | |
template<typename t0> | |
DivOp<t0, ConstOp> operator/(t0 const& arg0, double const& arg1) { return div(arg0, constant(arg1)); } | |
// | |
template<typename t0> | |
INLINE void executed(uint64_t mode, const size_t n, double **args, double *ret, t0 op) { | |
int i; | |
#ifdef _OPENMP | |
if ((mode & SleefVML_MT) != 0) { | |
#pragma omp parallel for | |
for(i=0;i<=n-VECTLENDP;i += VECTLENDP) vstoreu_v_p_vd(&ret[i], op.evalvd(i, args)); | |
} else | |
#endif | |
{ | |
for(i=0;i<=n-VECTLENDP;i += VECTLENDP) vstoreu_v_p_vd(&ret[i], op.evalvd(i, args)); | |
} | |
for(;i<n;i++) ret[i] = op.evald1(i, args); | |
} | |
template<typename t0> | |
INLINE void executef(uint64_t mode, const size_t n, float **args, float *ret, t0 op) { | |
int i; | |
#ifdef _OPENMP | |
if ((mode & SleefVML_MT) != 0) { | |
#pragma omp parallel for | |
for(i=0;i<=n-VECTLENSP;i += VECTLENSP) vstoreu_v_p_vf(&ret[i], op.evalvf(i, args)); | |
} else | |
#endif | |
{ | |
for(i=0;i<=n-VECTLENSP;i += VECTLENSP) vstoreu_v_p_vf(&ret[i], op.evalvf(i, args)); | |
} | |
for(;i<n;i++) ret[i] = op.evalf1(i, args); | |
} | |
} | |
// | |
using namespace SleefVML; | |
extern "C" void SleefVML_vmsBinaryStep(size_t n, float *arg0, float *ret, uint64_t mode) { | |
float *args[] = { arg0 }; | |
executef(mode, n, args, ret, sellt(load(0), constant(0), constant(0), constant(1))); | |
} | |
extern "C" void SleefVML_vmsSigmoid(size_t n, float *arg0, float *ret, uint64_t mode) { | |
float *args[] = { arg0 }; | |
executef(mode, n, args, ret, 1.0 / (1.0 + Sleef_exp_u10(-load(0)))); | |
} | |
extern "C" void SleefVML_vmsSoftSign(size_t n, float *arg0, float *ret, uint64_t mode) { | |
float *args[] = { arg0 }; | |
executef(mode, n, args, ret, load(0) / (1.0 + abs(load(0)))); | |
} | |
extern "C" void SleefVML_vmsISRU(size_t n, float *arg0, double alpha, float *ret, uint64_t mode) { | |
float *args[] = { arg0 }; | |
executef(mode, n, args, ret, load(0) / sqrt(1.0 + alpha * (load(0) * load(0)))); | |
} | |
extern "C" void SleefVML_vmsSoftExponential(size_t n, float *arg0, double alpha, float *ret, uint64_t mode) { | |
float *args[] = { arg0 }; | |
if (alpha < 0) { | |
executef(mode, n, args, ret, -Sleef_log_u10(1.0-alpha*(load(0)+alpha))/alpha); | |
} else { | |
executef(mode, n, args, ret, (Sleef_exp_u10(alpha*load(0))-1.0)/alpha + alpha); | |
} | |
} | |
extern "C" void SleefVML_vmsSoftClipping(size_t n, float *arg0, double alpha, float *ret, uint64_t mode) { | |
float *args[] = { arg0 }; | |
executef(mode, n, args, ret, Sleef_log_u10((1.0+Sleef_exp_u10(alpha*load(0)))/(1.0+Sleef_exp_u10(alpha*(load(0)-1.0))))/alpha); | |
} | |
// | |
#include <stdlib.h> | |
#include <iostream> | |
double sigmoid(double x) { return 1.0 / (1.0 + exp(-x)); } | |
int main(int argc, char **argv) { | |
const int N = 100; | |
float content[2][N], ret[N]; | |
for(int j=0;j<2;j++) { | |
for(int i=0;i<N;i++) content[j][i] = (2.0 * rand() / RAND_MAX) - 1.0; | |
} | |
float *args[] = { &content[0][0], &content[1][0] }; | |
SleefVML_vmsSigmoid(N, args[0], ret, 0); | |
for(int i=0;i<N;i++) std::cout << ret[i] << ", " << sigmoid(args[0][i]) << "\n"; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment