Prototype of SleefVML
@shibatch, created January 6, 2019
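// This prototype fuses elementwise array math via expression templates:
// every operator and function below builds a small expression class at
// compile time, and execute*() then walks the input arrays once, using
// AVX2 vector lanes for the bulk of the data and a scalar tail for the
// remainder. No intermediate arrays are materialized.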
#include "sleef.h"
#define INLINE inline
// Below is a part of helperavx2.h
#define VECTLENDP 4
#define VECTLENSP 8
typedef __m256i vmask;
typedef __m256i vopmask;
typedef __m256d vdouble;
typedef __m128i vint;
typedef __m256 vfloat;
typedef __m256i vint2;
static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm256_i32gather_pd(ptr, vi, 8); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); }
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); }
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); }
static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_mm256_set1_pd(-0.0), d); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); }
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); }
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm256_i32gather_ps(ptr, vi2, 4); }
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); }
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); }
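// Helper naming follows SLEEF's convention v<op>_<result>_<operands>:
// e.g. vadd_vd_vd_vd adds two vdoubles, and vsel_vd_vo_vd_vd picks lanewise
// between its last two operands under the opmask.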
//
#include <cmath>
#include <algorithm>
#define SleefVML_HA 0 // Use 1-ulp accuracy functions (default)
#define SleefVML_LA (1 << 0) // Use 3.5-ulp accuracy functions
#define SleefVML_MT (1 << 2) // Multithread
namespace SleefVML {
template<typename t0, typename t1>
class AddOp {
  t0 const arg0;
  t1 const arg1;
public:
  AddOp(t0 const& arg0, t1 const& arg1) : arg0(arg0), arg1(arg1) {}
  double evald1(int index, double** args) const {
    return arg0.evald1(index, args) + arg1.evald1(index, args);
  }
  vdouble evalvd(int index, double** args) const {
    return vadd_vd_vd_vd(arg0.evalvd(index, args), arg1.evalvd(index, args));
  }
  float evalf1(int index, float** args) const {
    return arg0.evalf1(index, args) + arg1.evalf1(index, args);
  }
  vfloat evalvf(int index, float** args) const {
    return vadd_vf_vf_vf(arg0.evalvf(index, args), arg1.evalvf(index, args));
  }
};
template<typename t0, typename t1>
static inline AddOp<t0, t1> add(t0 const& arg0, t1 const& arg1) {
  return AddOp<t0, t1>(arg0, arg1);
}
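// Illustration (hypothetical expression): add(load(0), constant(1.0)) has type
// AddOp<LoadOp, ConstOp>; calling its evalvd() fuses the array load and the
// vector add into a single step, with no temporary array in between.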
//
template<typename t0, typename t1>
class SubOp {
  t0 const arg0;
  t1 const arg1;
public:
  SubOp(t0 const& arg0, t1 const& arg1) : arg0(arg0), arg1(arg1) {}
  double evald1(int index, double** args) const {
    return arg0.evald1(index, args) - arg1.evald1(index, args);
  }
  vdouble evalvd(int index, double** args) const {
    return vsub_vd_vd_vd(arg0.evalvd(index, args), arg1.evalvd(index, args));
  }
  float evalf1(int index, float** args) const {
    return arg0.evalf1(index, args) - arg1.evalf1(index, args);
  }
  vfloat evalvf(int index, float** args) const {
    return vsub_vf_vf_vf(arg0.evalvf(index, args), arg1.evalvf(index, args));
  }
};
template<typename t0, typename t1>
static inline SubOp<t0, t1> sub(t0 const& arg0, t1 const& arg1) {
  return SubOp<t0, t1>(arg0, arg1);
}
//
template<typename t0, typename t1>
class MulOp {
  t0 const arg0;
  t1 const arg1;
public:
  MulOp(t0 const& arg0, t1 const& arg1) : arg0(arg0), arg1(arg1) {}
  double evald1(int index, double** args) const {
    return arg0.evald1(index, args) * arg1.evald1(index, args);
  }
  vdouble evalvd(int index, double** args) const {
    return vmul_vd_vd_vd(arg0.evalvd(index, args), arg1.evalvd(index, args));
  }
  float evalf1(int index, float** args) const {
    return arg0.evalf1(index, args) * arg1.evalf1(index, args);
  }
  vfloat evalvf(int index, float** args) const {
    return vmul_vf_vf_vf(arg0.evalvf(index, args), arg1.evalvf(index, args));
  }
};
template<typename t0, typename t1>
static inline MulOp<t0, t1> mul(t0 const& arg0, t1 const& arg1) {
  return MulOp<t0, t1>(arg0, arg1);
}
//
template<typename t0, typename t1>
class DivOp {
  t0 const arg0;
  t1 const arg1;
public:
  DivOp(t0 const& arg0, t1 const& arg1) : arg0(arg0), arg1(arg1) {}
  double evald1(int index, double** args) const {
    return arg0.evald1(index, args) / arg1.evald1(index, args);
  }
  vdouble evalvd(int index, double** args) const {
    return vdiv_vd_vd_vd(arg0.evalvd(index, args), arg1.evalvd(index, args));
  }
  float evalf1(int index, float** args) const {
    return arg0.evalf1(index, args) / arg1.evalf1(index, args);
  }
  vfloat evalvf(int index, float** args) const {
    return vdiv_vf_vf_vf(arg0.evalvf(index, args), arg1.evalvf(index, args));
  }
};
template<typename t0, typename t1>
static inline DivOp<t0, t1> div(t0 const& arg0, t1 const& arg1) {
  return DivOp<t0, t1>(arg0, arg1);
}
//
class ConstOp {
  double c;
public:
  ConstOp(const double& c) : c(c) {}
  double evald1(int index, double** args) const { return c; }
  vdouble evalvd(int index, double** args) const { return vcast_vd_d(c); }
  float evalf1(int index, float** args) const { return (float)c; }
  vfloat evalvf(int index, float** args) const { return vcast_vf_f((float)c); }
};
static inline ConstOp constant(double c) { return ConstOp(c); }
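// constant(c) broadcasts a scalar across all lanes (vcast_vd_d / vcast_vf_f),
// so mixed scalar/array expressions need no special casing downstream.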
//
class LoadOp {
  int n;
public:
  LoadOp(int n) : n(n) {}
  double evald1(int index, double** args) const { return args[n][index]; }
  vdouble evalvd(int index, double** args) const { return vloadu_vd_p(&args[n][index]); }
  float evalf1(int index, float** args) const { return args[n][index]; }
  vfloat evalvf(int index, float** args) const { return vloadu_vf_p(&args[n][index]); }
};
static inline LoadOp load(int n) { return LoadOp(n); }
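// load(n) reads element `index` of the n-th input array in the args table;
// the vector path does an unaligned load of VECTLENDP/VECTLENSP elements.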
//
template<typename t0, typename t1, typename t2, typename t3>
class SelLEOp {
  t0 const arg0;
  t1 const arg1;
  t2 const arg2;
  t3 const arg3;
public:
  SelLEOp(t0 const& arg0, t1 const& arg1, t2 const& arg2, t3 const& arg3) :
    arg0(arg0), arg1(arg1), arg2(arg2), arg3(arg3) {}
  double evald1(int index, double** args) const {
    return arg0.evald1(index, args) <= arg1.evald1(index, args) ?
      arg2.evald1(index, args) : arg3.evald1(index, args);
  }
  vdouble evalvd(int index, double** args) const {
    return vsel_vd_vo_vd_vd(vle_vo_vd_vd(arg0.evalvd(index, args), arg1.evalvd(index, args)),
			    arg2.evalvd(index, args), arg3.evalvd(index, args));
  }
  float evalf1(int index, float** args) const {
    return arg0.evalf1(index, args) <= arg1.evalf1(index, args) ?
      arg2.evalf1(index, args) : arg3.evalf1(index, args);
  }
  vfloat evalvf(int index, float** args) const {
    return vsel_vf_vo_vf_vf(vle_vo_vf_vf(arg0.evalvf(index, args), arg1.evalvf(index, args)),
			    arg2.evalvf(index, args), arg3.evalvf(index, args));
  }
};
template<typename t0, typename t1, typename t2, typename t3>
static inline SelLEOp<t0, t1, t2, t3> selle(t0 const& arg0, t1 const& arg1, t2 const& arg2, t3 const& arg3) {
  return SelLEOp<t0, t1, t2, t3>(arg0, arg1, arg2, arg3);
}
//
template<typename t0, typename t1, typename t2, typename t3>
class SelLTOp {
  t0 const arg0;
  t1 const arg1;
  t2 const arg2;
  t3 const arg3;
public:
  SelLTOp(t0 const& arg0, t1 const& arg1, t2 const& arg2, t3 const& arg3) :
    arg0(arg0), arg1(arg1), arg2(arg2), arg3(arg3) {}
  double evald1(int index, double** args) const {
    return arg0.evald1(index, args) < arg1.evald1(index, args) ?
      arg2.evald1(index, args) : arg3.evald1(index, args);
  }
  vdouble evalvd(int index, double** args) const {
    return vsel_vd_vo_vd_vd(vlt_vo_vd_vd(arg0.evalvd(index, args), arg1.evalvd(index, args)),
			    arg2.evalvd(index, args), arg3.evalvd(index, args));
  }
  float evalf1(int index, float** args) const {
    return arg0.evalf1(index, args) < arg1.evalf1(index, args) ?
      arg2.evalf1(index, args) : arg3.evalf1(index, args);
  }
  vfloat evalvf(int index, float** args) const {
    return vsel_vf_vo_vf_vf(vlt_vo_vf_vf(arg0.evalvf(index, args), arg1.evalvf(index, args)),
			    arg2.evalvf(index, args), arg3.evalvf(index, args));
  }
};
template<typename t0, typename t1, typename t2, typename t3>
static inline SelLTOp<t0, t1, t2, t3> sellt(t0 const& arg0, t1 const& arg1, t2 const& arg2, t3 const& arg3) {
  return SelLTOp<t0, t1, t2, t3>(arg0, arg1, arg2, arg3);
}
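// selle(a, b, c, d) evaluates to (a <= b ? c : d) and sellt(a, b, c, d) to
// (a < b ? c : d); the vector paths perform the same selection lanewise via
// compare masks and blendv.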
//
// Scalar fallbacks for the remainder loops.
static inline float negf(float x) { return -x; }
static inline double neg(double x) { return -x; }
static inline float absf(float x) { return x < 0 ? -x : x; }
static inline double abs(double x) { return x < 0 ? -x : x; }
static inline float maxf(float x, float y) { return x > y ? x : y; }
static inline float minf(float x, float y) { return x < y ? x : y; }
static inline double max(double x, double y) { return x > y ? x : y; }
static inline double min(double x, double y) { return x < y ? x : y; }
#define UNARYFUNC(fnd, fnf, vfnd, vfnf) \
template<typename t0> class OpFunc_ ## fnd { \
  t0 const arg0; \
public: \
  OpFunc_ ## fnd(t0 const& arg0) : arg0(arg0) {} \
  double evald1(int index, double** args) const { return fnd(arg0.evald1(index, args)); } \
  vdouble evalvd(int index, double** args) const { return vfnd(arg0.evalvd(index, args)); } \
  float evalf1(int index, float** args) const { return fnf(arg0.evalf1(index, args)); } \
  vfloat evalvf(int index, float** args) const { return vfnf(arg0.evalvf(index, args)); } \
}; \
template<typename t0> static inline OpFunc_ ## fnd<t0> fnd(t0 const& arg0) { return OpFunc_ ## fnd<t0>(arg0); }
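// max and min take two operands, so the unary macro above cannot generate
// them. BINARYFUNC below is a two-argument counterpart added here following
// the same pattern (a sketch, not part of SLEEF itself): fnd/fnf are the
// scalar double/float fallbacks, vfnd/vfnf the vector implementations.
#define BINARYFUNC(fnd, fnf, vfnd, vfnf) \
template<typename t0, typename t1> class OpFunc2_ ## fnd { \
  t0 const arg0; \
  t1 const arg1; \
public: \
  OpFunc2_ ## fnd(t0 const& arg0, t1 const& arg1) : arg0(arg0), arg1(arg1) {} \
  double evald1(int index, double** args) const { return fnd(arg0.evald1(index, args), arg1.evald1(index, args)); } \
  vdouble evalvd(int index, double** args) const { return vfnd(arg0.evalvd(index, args), arg1.evalvd(index, args)); } \
  float evalf1(int index, float** args) const { return fnf(arg0.evalf1(index, args), arg1.evalf1(index, args)); } \
  vfloat evalvf(int index, float** args) const { return vfnf(arg0.evalvf(index, args), arg1.evalvf(index, args)); } \
}; \
template<typename t0, typename t1> static inline OpFunc2_ ## fnd<t0, t1> fnd(t0 const& arg0, t1 const& arg1) { return OpFunc2_ ## fnd<t0, t1>(arg0, arg1); }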
UNARYFUNC(sqrt, sqrtf, vsqrt_vd_vd, vsqrt_vf_vf)
UNARYFUNC(abs, absf, vabs_vd_vd, vabs_vf_vf)
BINARYFUNC(max, maxf, vmax_vd_vd, vmax_vf_vf)
BINARYFUNC(min, minf, vmin_vd_vd, vmin_vf_vf)
UNARYFUNC(neg, negf, vneg_vd_vd, vneg_vf_vf)
UNARYFUNC(Sleef_exp_u10, Sleef_expf_u10, Sleef_expd4_u10avx2, Sleef_expf8_u10avx2)
UNARYFUNC(Sleef_log_u10, Sleef_logf_u10, Sleef_logd4_u10avx2, Sleef_logf8_u10avx2)
UNARYFUNC(Sleef_atan_u10, Sleef_atanf_u10, Sleef_atand4_u10avx2, Sleef_atanf8_u10avx2)
//
template<typename t0>
OpFunc_neg<t0> operator-(t0 const& arg0) { return neg(arg0); }
template<typename t0, typename t1>
AddOp<t0, t1> operator+(t0 const& arg0, t1 const& arg1) { return add(arg0, arg1); }
template<typename t0, typename t1>
SubOp<t0, t1> operator-(t0 const& arg0, t1 const& arg1) { return sub(arg0, arg1); }
template<typename t0, typename t1>
MulOp<t0, t1> operator*(t0 const& arg0, t1 const& arg1) { return mul(arg0, arg1); }
template<typename t0, typename t1>
DivOp<t0, t1> operator/(t0 const& arg0, t1 const& arg1) { return div(arg0, arg1); }
template<typename t0>
AddOp<ConstOp, t0> operator+(double const& arg0, t0 const& arg1) { return add(constant(arg0), arg1); }
template<typename t0>
SubOp<ConstOp, t0> operator-(double const& arg0, t0 const& arg1) { return sub(constant(arg0), arg1); }
template<typename t0>
MulOp<ConstOp, t0> operator*(double const& arg0, t0 const& arg1) { return mul(constant(arg0), arg1); }
template<typename t0>
DivOp<ConstOp, t0> operator/(double const& arg0, t0 const& arg1) { return div(constant(arg0), arg1); }
template<typename t0>
AddOp<t0, ConstOp> operator+(t0 const& arg0, double const& arg1) { return add(arg0, constant(arg1)); }
template<typename t0>
SubOp<t0, ConstOp> operator-(t0 const& arg0, double const& arg1) { return sub(arg0, constant(arg1)); }
template<typename t0>
MulOp<t0, ConstOp> operator*(t0 const& arg0, double const& arg1) { return mul(arg0, constant(arg1)); }
template<typename t0>
DivOp<t0, ConstOp> operator/(t0 const& arg0, double const& arg1) { return div(arg0, constant(arg1)); }
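// With these overloads, an expression such as 1.0 / (1.0 + Sleef_exp_u10(-load(0)))
// builds, at compile time, a nested type along the lines of
// DivOp<ConstOp, AddOp<ConstOp, OpFunc_Sleef_exp_u10<OpFunc_neg<LoadOp>>>>;
// executing it fuses the whole computation into a single pass over the data.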
//
template<typename t0>
INLINE void executed(uint64_t mode, const size_t n, double **args, double *ret, t0 op) {
  int i = 0;
#ifdef _OPENMP
  if ((mode & SleefVML_MT) != 0) {
#pragma omp parallel for
    for(int j = 0; j <= (int)n - VECTLENDP; j += VECTLENDP)
      vstoreu_v_p_vd(&ret[j], op.evalvd(j, args));
    i = ((int)n / VECTLENDP) * VECTLENDP; // the parallel loop's counter is private, so recompute the tail start
  } else
#endif
  {
    for(; i <= (int)n - VECTLENDP; i += VECTLENDP)
      vstoreu_v_p_vd(&ret[i], op.evalvd(i, args));
  }
  // Scalar tail for the remaining n % VECTLENDP elements
  for(; i < (int)n; i++) ret[i] = op.evald1(i, args);
}
template<typename t0>
INLINE void executef(uint64_t mode, const size_t n, float **args, float *ret, t0 op) {
  int i = 0;
#ifdef _OPENMP
  if ((mode & SleefVML_MT) != 0) {
#pragma omp parallel for
    for(int j = 0; j <= (int)n - VECTLENSP; j += VECTLENSP)
      vstoreu_v_p_vf(&ret[j], op.evalvf(j, args));
    i = ((int)n / VECTLENSP) * VECTLENSP;
  } else
#endif
  {
    for(; i <= (int)n - VECTLENSP; i += VECTLENSP)
      vstoreu_v_p_vf(&ret[i], op.evalvf(i, args));
  }
  for(; i < (int)n; i++) ret[i] = op.evalf1(i, args);
}
} // namespace SleefVML
//
using namespace SleefVML;
extern "C" void SleefVML_vmsBinaryStep(size_t n, float *arg0, float *ret, uint64_t mode) {
float *args[] = { arg0 };
executef(mode, n, args, ret, sellt(load(0), constant(0), constant(0), constant(1)));
}
extern "C" void SleefVML_vmsSigmoid(size_t n, float *arg0, float *ret, uint64_t mode) {
float *args[] = { arg0 };
executef(mode, n, args, ret, 1.0 / (1.0 + Sleef_exp_u10(-load(0))));
}
extern "C" void SleefVML_vmsSoftSign(size_t n, float *arg0, float *ret, uint64_t mode) {
float *args[] = { arg0 };
executef(mode, n, args, ret, load(0) / (1.0 + abs(load(0))));
}
extern "C" void SleefVML_vmsISRU(size_t n, float *arg0, double alpha, float *ret, uint64_t mode) {
float *args[] = { arg0 };
executef(mode, n, args, ret, load(0) / sqrt(1.0 + alpha * (load(0) * load(0))));
}
extern "C" void SleefVML_vmsSoftExponential(size_t n, float *arg0, double alpha, float *ret, uint64_t mode) {
float *args[] = { arg0 };
if (alpha < 0) {
executef(mode, n, args, ret, -Sleef_log_u10(1.0-alpha*(load(0)+alpha))/alpha);
} else {
executef(mode, n, args, ret, (Sleef_exp_u10(alpha*load(0))-1.0)/alpha + alpha);
}
}
extern "C" void SleefVML_vmsSoftClipping(size_t n, float *arg0, double alpha, float *ret, uint64_t mode) {
float *args[] = { arg0 };
executef(mode, n, args, ret, Sleef_log_u10((1.0+Sleef_exp_u10(alpha*load(0)))/(1.0+Sleef_exp_u10(alpha*(load(0)-1.0))))/alpha);
}
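// A hypothetical extra entry point in the same style (not part of the original
// prototype): ReLU, f(x) = max(0, x), expressed with the sellt selector.
extern "C" void SleefVML_vmsReLU(size_t n, float *arg0, float *ret, uint64_t mode) {
  float *args[] = { arg0 };
  executef(mode, n, args, ret, sellt(load(0), constant(0), constant(0), load(0)));
}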
//
#include <stdlib.h>
#include <iostream>
double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }
int main(int argc, char **argv) {
  const int N = 100;
  float content[2][N], ret[N];
  for(int j = 0; j < 2; j++) {
    for(int i = 0; i < N; i++) content[j][i] = (float)((2.0 * rand() / RAND_MAX) - 1.0);
  }
  float *args[] = { &content[0][0], &content[1][0] };
  SleefVML_vmsSigmoid(N, args[0], ret, 0);
  // Print the vectorized result next to a scalar reference for comparison
  for(int i = 0; i < N; i++) std::cout << ret[i] << ", " << sigmoid(args[0][i]) << "\n";
}