Prototype of SleefVML
@shibatch, created January 6, 2019
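// This prototype fuses elementwise array math via expression templates:
// every operator and function below builds a small expression class at
// compile time, and execute*() then walks the input arrays once, using
// AVX2 vector lanes for the bulk of the data and a scalar tail for the
// remainder. No intermediate arrays are materialized.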
#include "sleef.h"
#define INLINE inline
// Below is a part of helperavx2.h
#define VECTLENDP 4
#define VECTLENSP 8
typedef __m256i vmask;
typedef __m256i vopmask;
typedef __m256d vdouble;
typedef __m128i vint;
typedef __m256 vfloat;
typedef __m256i vint2;
static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm256_i32gather_pd(ptr, vi, 8); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); }
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); }
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); }
static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_mm256_set1_pd(-0.0), d); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); }
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); }
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm256_i32gather_ps(ptr, vi2, 4); }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); }
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); }
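// Helper naming follows SLEEF's convention v<op>_<result>_<operands>:
// e.g. vadd_vd_vd_vd adds two vdoubles, and vsel_vd_vo_vd_vd picks lanewise
// between its last two operands under the opmask.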
//
#include <cmath>
#include <algorithm>
#define SleefVML_HA 0 // Use 1-ulp accuracy functions (default)
#define SleefVML_LA (1 << 0) // Use 3.5-ulp accuracy functions
#define SleefVML_MT (1 << 2) // Multithread
namespace SleefVML {
template<typename t0, typename t1>
class AddOp {
  t0 const arg0;
  t1 const arg1;
public:
  AddOp(t0 const& arg0, t1 const& arg1) : arg0(arg0), arg1(arg1) {}
  double evald1(int index, double** args) const {
    return arg0.evald1(index, args) + arg1.evald1(index, args);
  }
  vdouble evalvd(int index, double** args) const {
    return vadd_vd_vd_vd(arg0.evalvd(index, args), arg1.evalvd(index, args));
  }
  float evalf1(int index, float** args) const {
    return arg0.evalf1(index, args) + arg1.evalf1(index, args);
  }
  vfloat evalvf(int index, float** args) const {
    return vadd_vf_vf_vf(arg0.evalvf(index, args), arg1.evalvf(index, args));
  }
};
template<typename t0, typename t1>
static inline AddOp<t0, t1> add(t0 const& arg0, t1 const& arg1) {
  return AddOp<t0, t1>(arg0, arg1);
}
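// Illustration (hypothetical expression): add(load(0), constant(1.0)) has type
// AddOp<LoadOp, ConstOp>; calling its evalvd() fuses the array load and the
// vector add into a single step, with no temporary array in between.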
//
template<typename t0, typename t1>
class SubOp {
  t0 const arg0;
  t1 const arg1;
public:
  SubOp(t0 const& arg0, t1 const& arg1) : arg0(arg0), arg1(arg1) {}
  double evald1(int index, double** args) const {
    return arg0.evald1(index, args) - arg1.evald1(index, args);
  }
  vdouble evalvd(int index, double** args) const {
    return vsub_vd_vd_vd(arg0.evalvd(index, args), arg1.evalvd(index, args));
  }
  float evalf1(int index, float** args) const {
    return arg0.evalf1(index, args) - arg1.evalf1(index, args);
  }
  vfloat evalvf(int index, float** args) const {
    return vsub_vf_vf_vf(arg0.evalvf(index, args), arg1.evalvf(index, args));
  }
};
template<typename t0, typename t1>
static inline SubOp<t0, t1> sub(t0 const& arg0, t1 const& arg1) {
  return SubOp<t0, t1>(arg0, arg1);
}
//
template<typename t0, typename t1>
class MulOp {
  t0 const arg0;
  t1 const arg1;
public:
  MulOp(t0 const& arg0, t1 const& arg1) : arg0(arg0), arg1(arg1) {}
  double evald1(int index, double** args) const {
    return arg0.evald1(index, args) * arg1.evald1(index, args);
  }
  vdouble evalvd(int index, double** args) const {
    return vmul_vd_vd_vd(arg0.evalvd(index, args), arg1.evalvd(index, args));
  }
  float evalf1(int index, float** args) const {
    return arg0.evalf1(index, args) * arg1.evalf1(index, args);
  }
  vfloat evalvf(int index, float** args) const {
    return vmul_vf_vf_vf(arg0.evalvf(index, args), arg1.evalvf(index, args));
  }
};
template<typename t0, typename t1>
static inline MulOp<t0, t1> mul(t0 const& arg0, t1 const& arg1) {
  return MulOp<t0, t1>(arg0, arg1);
}
//
template<typename t0, typename t1>
class DivOp {
  t0 const arg0;
  t1 const arg1;
public:
  DivOp(t0 const& arg0, t1 const& arg1) : arg0(arg0), arg1(arg1) {}
  double evald1(int index, double** args) const {
    return arg0.evald1(index, args) / arg1.evald1(index, args);
  }
  vdouble evalvd(int index, double** args) const {
    return vdiv_vd_vd_vd(arg0.evalvd(index, args), arg1.evalvd(index, args));
  }
  float evalf1(int index, float** args) const {
    return arg0.evalf1(index, args) / arg1.evalf1(index, args);
  }
  vfloat evalvf(int index, float** args) const {
    return vdiv_vf_vf_vf(arg0.evalvf(index, args), arg1.evalvf(index, args));
  }
};
template<typename t0, typename t1>
static inline DivOp<t0, t1> div(t0 const& arg0, t1 const& arg1) {
  return DivOp<t0, t1>(arg0, arg1);
}
//
class ConstOp {
  double c;
public:
  ConstOp(const double& c) : c(c) {}
  double evald1(int index, double** args) const { return c; }
  vdouble evalvd(int index, double** args) const { return vcast_vd_d(c); }
  float evalf1(int index, float** args) const { return (float)c; }
  vfloat evalvf(int index, float** args) const { return vcast_vf_f((float)c); }
};
static inline ConstOp constant(double c) { return ConstOp(c); }
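// constant(c) broadcasts a scalar across all lanes (vcast_vd_d / vcast_vf_f),
// so mixed scalar/array expressions need no special casing downstream.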
//
class LoadOp {
  int n;
public:
  LoadOp(int n) : n(n) {}
  double evald1(int index, double** args) const { return args[n][index]; }
  vdouble evalvd(int index, double** args) const { return vloadu_vd_p(&args[n][index]); }
  float evalf1(int index, float** args) const { return args[n][index]; }
  vfloat evalvf(int index, float** args) const { return vloadu_vf_p(&args[n][index]); }
};
static inline LoadOp load(int n) { return LoadOp(n); }
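// load(n) reads element `index` of the n-th input array in the args table;
// the vector path does an unaligned load of VECTLENDP/VECTLENSP elements.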
//
template<typename t0, typename t1, typename t2, typename t3>
class SelLEOp {
  t0 const arg0;
  t1 const arg1;
  t2 const arg2;
  t3 const arg3;
public:
  SelLEOp(t0 const& arg0, t1 const& arg1, t2 const& arg2, t3 const& arg3) :
    arg0(arg0), arg1(arg1), arg2(arg2), arg3(arg3) {}
  double evald1(int index, double** args) const {
    return arg0.evald1(index, args) <= arg1.evald1(index, args) ?
      arg2.evald1(index, args) : arg3.evald1(index, args);
  }
  vdouble evalvd(int index, double** args) const {
    return vsel_vd_vo_vd_vd(vle_vo_vd_vd(arg0.evalvd(index, args), arg1.evalvd(index, args)),
			    arg2.evalvd(index, args), arg3.evalvd(index, args));
  }
  float evalf1(int index, float** args) const {
    return arg0.evalf1(index, args) <= arg1.evalf1(index, args) ?
      arg2.evalf1(index, args) : arg3.evalf1(index, args);
  }
  vfloat evalvf(int index, float** args) const {
    return vsel_vf_vo_vf_vf(vle_vo_vf_vf(arg0.evalvf(index, args), arg1.evalvf(index, args)),
			    arg2.evalvf(index, args), arg3.evalvf(index, args));
  }
};
template<typename t0, typename t1, typename t2, typename t3>
static inline SelLEOp<t0, t1, t2, t3> selle(t0 const& arg0, t1 const& arg1, t2 const& arg2, t3 const& arg3) {
  return SelLEOp<t0, t1, t2, t3>(arg0, arg1, arg2, arg3);
}
//
template<typename t0, typename t1, typename t2, typename t3>
class SelLTOp {
  t0 const arg0;
  t1 const arg1;
  t2 const arg2;
  t3 const arg3;
public:
  SelLTOp(t0 const& arg0, t1 const& arg1, t2 const& arg2, t3 const& arg3) :
    arg0(arg0), arg1(arg1), arg2(arg2), arg3(arg3) {}
  double evald1(int index, double** args) const {
    return arg0.evald1(index, args) < arg1.evald1(index, args) ?
      arg2.evald1(index, args) : arg3.evald1(index, args);
  }
  vdouble evalvd(int index, double** args) const {
    return vsel_vd_vo_vd_vd(vlt_vo_vd_vd(arg0.evalvd(index, args), arg1.evalvd(index, args)),
			    arg2.evalvd(index, args), arg3.evalvd(index, args));
  }
  float evalf1(int index, float** args) const {
    return arg0.evalf1(index, args) < arg1.evalf1(index, args) ?
      arg2.evalf1(index, args) : arg3.evalf1(index, args);
  }
  vfloat evalvf(int index, float** args) const {
    return vsel_vf_vo_vf_vf(vlt_vo_vf_vf(arg0.evalvf(index, args), arg1.evalvf(index, args)),
			    arg2.evalvf(index, args), arg3.evalvf(index, args));
  }
};
template<typename t0, typename t1, typename t2, typename t3>
static inline SelLTOp<t0, t1, t2, t3> sellt(t0 const& arg0, t1 const& arg1, t2 const& arg2, t3 const& arg3) {
  return SelLTOp<t0, t1, t2, t3>(arg0, arg1, arg2, arg3);
}
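// selle(a, b, c, d) evaluates to (a <= b ? c : d) and sellt(a, b, c, d) to
// (a < b ? c : d); the vector paths perform the same selection lanewise via
// compare masks and blendv.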
//
// Scalar fallbacks for the remainder loops.
static inline float negf(float x) { return -x; }
static inline double neg(double x) { return -x; }
static inline float absf(float x) { return x < 0 ? -x : x; }
static inline double abs(double x) { return x < 0 ? -x : x; }
static inline float maxf(float x, float y) { return x > y ? x : y; }
static inline float minf(float x, float y) { return x < y ? x : y; }
static inline double max(double x, double y) { return x > y ? x : y; }
static inline double min(double x, double y) { return x < y ? x : y; }
#define UNARYFUNC(fnd, fnf, vfnd, vfnf) \
template<typename t0> class OpFunc_ ## fnd { \
  t0 const arg0; \
public: \
  OpFunc_ ## fnd(t0 const& arg0) : arg0(arg0) {} \
  double evald1(int index, double** args) const { return fnd(arg0.evald1(index, args)); } \
  vdouble evalvd(int index, double** args) const { return vfnd(arg0.evalvd(index, args)); } \
  float evalf1(int index, float** args) const { return fnf(arg0.evalf1(index, args)); } \
  vfloat evalvf(int index, float** args) const { return vfnf(arg0.evalvf(index, args)); } \
}; \
template<typename t0> static inline OpFunc_ ## fnd<t0> fnd(t0 const& arg0) { return OpFunc_ ## fnd<t0>(arg0); }
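// max and min take two operands, so the unary macro above cannot generate
// them. BINARYFUNC below is a two-argument counterpart added here following
// the same pattern (a sketch, not part of SLEEF itself): fnd/fnf are the
// scalar double/float fallbacks, vfnd/vfnf the vector implementations.
#define BINARYFUNC(fnd, fnf, vfnd, vfnf) \
template<typename t0, typename t1> class OpFunc2_ ## fnd { \
  t0 const arg0; \
  t1 const arg1; \
public: \
  OpFunc2_ ## fnd(t0 const& arg0, t1 const& arg1) : arg0(arg0), arg1(arg1) {} \
  double evald1(int index, double** args) const { return fnd(arg0.evald1(index, args), arg1.evald1(index, args)); } \
  vdouble evalvd(int index, double** args) const { return vfnd(arg0.evalvd(index, args), arg1.evalvd(index, args)); } \
  float evalf1(int index, float** args) const { return fnf(arg0.evalf1(index, args), arg1.evalf1(index, args)); } \
  vfloat evalvf(int index, float** args) const { return vfnf(arg0.evalvf(index, args), arg1.evalvf(index, args)); } \
}; \
template<typename t0, typename t1> static inline OpFunc2_ ## fnd<t0, t1> fnd(t0 const& arg0, t1 const& arg1) { return OpFunc2_ ## fnd<t0, t1>(arg0, arg1); }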
UNARYFUNC(sqrt, sqrtf, vsqrt_vd_vd, vsqrt_vf_vf)
UNARYFUNC(abs, absf, vabs_vd_vd, vabs_vf_vf)
BINARYFUNC(max, maxf, vmax_vd_vd, vmax_vf_vf)
BINARYFUNC(min, minf, vmin_vd_vd, vmin_vf_vf)
UNARYFUNC(neg, negf, vneg_vd_vd, vneg_vf_vf)
UNARYFUNC(Sleef_exp_u10, Sleef_expf_u10, Sleef_expd4_u10avx2, Sleef_expf8_u10avx2)
UNARYFUNC(Sleef_log_u10, Sleef_logf_u10, Sleef_logd4_u10avx2, Sleef_logf8_u10avx2)
UNARYFUNC(Sleef_atan_u10, Sleef_atanf_u10, Sleef_atand4_u10avx2, Sleef_atanf8_u10avx2)
//
template<typename t0>
OpFunc_neg<t0> operator-(t0 const& arg0) { return neg(arg0); }
template<typename t0, typename t1>
AddOp<t0, t1> operator+(t0 const& arg0, t1 const& arg1) { return add(arg0, arg1); }
template<typename t0, typename t1>
SubOp<t0, t1> operator-(t0 const& arg0, t1 const& arg1) { return sub(arg0, arg1); }
template<typename t0, typename t1>
MulOp<t0, t1> operator*(t0 const& arg0, t1 const& arg1) { return mul(arg0, arg1); }
template<typename t0, typename t1>
DivOp<t0, t1> operator/(t0 const& arg0, t1 const& arg1) { return div(arg0, arg1); }
template<typename t0>
AddOp<ConstOp, t0> operator+(double const& arg0, t0 const& arg1) { return add(constant(arg0), arg1); }
template<typename t0>
SubOp<ConstOp, t0> operator-(double const& arg0, t0 const& arg1) { return sub(constant(arg0), arg1); }
template<typename t0>
MulOp<ConstOp, t0> operator*(double const& arg0, t0 const& arg1) { return mul(constant(arg0), arg1); }
template<typename t0>
DivOp<ConstOp, t0> operator/(double const& arg0, t0 const& arg1) { return div(constant(arg0), arg1); }
template<typename t0>
AddOp<t0, ConstOp> operator+(t0 const& arg0, double const& arg1) { return add(arg0, constant(arg1)); }
template<typename t0>
SubOp<t0, ConstOp> operator-(t0 const& arg0, double const& arg1) { return sub(arg0, constant(arg1)); }
template<typename t0>
MulOp<t0, ConstOp> operator*(t0 const& arg0, double const& arg1) { return mul(arg0, constant(arg1)); }
template<typename t0>
DivOp<t0, ConstOp> operator/(t0 const& arg0, double const& arg1) { return div(arg0, constant(arg1)); }
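// With these overloads, an expression such as 1.0 / (1.0 + Sleef_exp_u10(-load(0)))
// builds, at compile time, a nested type along the lines of
// DivOp<ConstOp, AddOp<ConstOp, OpFunc_Sleef_exp_u10<OpFunc_neg<LoadOp>>>>;
// executing it fuses the whole computation into a single pass over the data.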
//
template<typename t0>
INLINE void executed(uint64_t mode, const size_t n, double **args, double *ret, t0 op) {
  int i = 0;
#ifdef _OPENMP
  if ((mode & SleefVML_MT) != 0) {
#pragma omp parallel for
    for(int j = 0; j <= (int)n - VECTLENDP; j += VECTLENDP)
      vstoreu_v_p_vd(&ret[j], op.evalvd(j, args));
    i = ((int)n / VECTLENDP) * VECTLENDP; // the parallel loop's counter is private, so recompute the tail start
  } else
#endif
  {
    for(; i <= (int)n - VECTLENDP; i += VECTLENDP)
      vstoreu_v_p_vd(&ret[i], op.evalvd(i, args));
  }
  // Scalar tail for the remaining n % VECTLENDP elements
  for(; i < (int)n; i++) ret[i] = op.evald1(i, args);
}
template<typename t0>
INLINE void executef(uint64_t mode, const size_t n, float **args, float *ret, t0 op) {
  int i = 0;
#ifdef _OPENMP
  if ((mode & SleefVML_MT) != 0) {
#pragma omp parallel for
    for(int j = 0; j <= (int)n - VECTLENSP; j += VECTLENSP)
      vstoreu_v_p_vf(&ret[j], op.evalvf(j, args));
    i = ((int)n / VECTLENSP) * VECTLENSP;
  } else
#endif
  {
    for(; i <= (int)n - VECTLENSP; i += VECTLENSP)
      vstoreu_v_p_vf(&ret[i], op.evalvf(i, args));
  }
  for(; i < (int)n; i++) ret[i] = op.evalf1(i, args);
}
} // namespace SleefVML
//
using namespace SleefVML;
extern "C" void SleefVML_vmsBinaryStep(size_t n, float *arg0, float *ret, uint64_t mode) {
float *args[] = { arg0 };
executef(mode, n, args, ret, sellt(load(0), constant(0), constant(0), constant(1)));
}
extern "C" void SleefVML_vmsSigmoid(size_t n, float *arg0, float *ret, uint64_t mode) {
float *args[] = { arg0 };
executef(mode, n, args, ret, 1.0 / (1.0 + Sleef_exp_u10(-load(0))));
}
extern "C" void SleefVML_vmsSoftSign(size_t n, float *arg0, float *ret, uint64_t mode) {
float *args[] = { arg0 };
executef(mode, n, args, ret, load(0) / (1.0 + abs(load(0))));
}
extern "C" void SleefVML_vmsISRU(size_t n, float *arg0, double alpha, float *ret, uint64_t mode) {
float *args[] = { arg0 };
executef(mode, n, args, ret, load(0) / sqrt(1.0 + alpha * (load(0) * load(0))));
}
extern "C" void SleefVML_vmsSoftExponential(size_t n, float *arg0, double alpha, float *ret, uint64_t mode) {
float *args[] = { arg0 };
if (alpha < 0) {
executef(mode, n, args, ret, -Sleef_log_u10(1.0-alpha*(load(0)+alpha))/alpha);
} else {
executef(mode, n, args, ret, (Sleef_exp_u10(alpha*load(0))-1.0)/alpha + alpha);
}
}
extern "C" void SleefVML_vmsSoftClipping(size_t n, float *arg0, double alpha, float *ret, uint64_t mode) {
float *args[] = { arg0 };
executef(mode, n, args, ret, Sleef_log_u10((1.0+Sleef_exp_u10(alpha*load(0)))/(1.0+Sleef_exp_u10(alpha*(load(0)-1.0))))/alpha);
}
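// A hypothetical extra entry point in the same style (not part of the original
// prototype): ReLU, f(x) = max(0, x), expressed with the sellt selector.
extern "C" void SleefVML_vmsReLU(size_t n, float *arg0, float *ret, uint64_t mode) {
  float *args[] = { arg0 };
  executef(mode, n, args, ret, sellt(load(0), constant(0), constant(0), load(0)));
}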
//
#include <stdlib.h>
#include <iostream>
double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }
int main(int argc, char **argv) {
  const int N = 100;
  float content[2][N], ret[N];
  for(int j = 0; j < 2; j++) {
    for(int i = 0; i < N; i++) content[j][i] = (float)((2.0 * rand() / RAND_MAX) - 1.0);
  }
  float *args[] = { &content[0][0], &content[1][0] };
  SleefVML_vmsSigmoid(N, args[0], ret, 0);
  // Print the vectorized result next to a scalar reference for comparison
  for(int i = 0; i < N; i++) std::cout << ret[i] << ", " << sigmoid(args[0][i]) << "\n";
}