Skip to content

Instantly share code, notes, and snippets.

@nemequ
Created July 7, 2020 02:38
Show Gist options
  • Save nemequ/727e4da1116779ba89369f47b534ce89 to your computer and use it in GitHub Desktop.
Save nemequ/727e4da1116779ba89369f47b534ce89 to your computer and use it in GitHub Desktop.
/* This is the process I used to vectorize _mm_cdfnorminv_ps in
* SIMDe. The first function is the most direct port. The second
* function merges a couple of the branches together. The third
* merges the else branch and is the fastest, but also the most
* complex.
*
* Hopefully seeing the different versions will help make it a bit
* clearer how I got to the final version. */
/* First iteration: the most direct lane-wise port of the scalar
 * if / else-if chain.
 *
 * Each branch computes a lane mask, strips the lanes an earlier branch
 * already claimed, and ORs its masked result into the (initially zero)
 * return value.  Lanes with a < 0 or a > 1 therefore stay at 0.0,
 * a == 0 yields -infinity and a == 1 yields +infinity; the remaining
 * lanes are evaluated with rational polynomial approximations (two
 * tails and a central region). */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_cdfnorminv_ps (simde__m128 a) {
  #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE)
    return _mm_cdfnorminv_ps(a);
  #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
    /* Tail numerator coefficients, highest order first (c_c in the
     * scalar code). */
    const simde__m128 tail_num[] = {
      simde_mm_set1_ps(SIMDE_FLOAT32_C(-7.784894002430293e-03)),
      simde_mm_set1_ps(SIMDE_FLOAT32_C(-3.223964580411365e-01)),
      simde_mm_set1_ps(SIMDE_FLOAT32_C(-2.400758277161838e+00)),
      simde_mm_set1_ps(SIMDE_FLOAT32_C(-2.549732539343734e+00)),
      simde_mm_set1_ps(SIMDE_FLOAT32_C( 4.374664141464968e+00)),
      simde_mm_set1_ps(SIMDE_FLOAT32_C( 2.938163982698783e+00))
    };
    /* Tail denominator coefficients (c_d); the constant term 1 is
     * applied separately after the loop. */
    const simde__m128 tail_den[] = {
      simde_mm_set1_ps(SIMDE_FLOAT32_C( 7.784695709041462e-03)),
      simde_mm_set1_ps(SIMDE_FLOAT32_C( 3.224671290700398e-01)),
      simde_mm_set1_ps(SIMDE_FLOAT32_C( 2.445134137142996e+00)),
      simde_mm_set1_ps(SIMDE_FLOAT32_C( 3.754408661907416e+00)),
    };
    const simde__m128 zero = simde_mm_set1_ps(SIMDE_FLOAT32_C(0.0));
    const simde__m128 one = simde_mm_set1_ps(SIMDE_FLOAT32_C(1.0));
    const simde__m128 minus_two = simde_mm_set1_ps(SIMDE_FLOAT32_C(-2.0));

    simde__m128 result = simde_mm_setzero_ps();

    /* if (a < 0 || a > 1)
     *
     * Nothing to store: result already holds 0.0 for these lanes.  We
     * only record them as handled. */
    simde__m128 claimed = simde_mm_or_ps(simde_mm_cmplt_ps(a, zero), simde_mm_cmpgt_ps(a, one));

    { /* else if (a == 0) */
      const simde__m128 lanes = simde_mm_andnot_ps(claimed, simde_mm_cmpeq_ps(a, zero));
      claimed = simde_mm_or_ps(claimed, lanes);
      const simde__m128 value = simde_mm_set1_ps(-SIMDE_MATH_INFINITYF);
      result = simde_mm_or_ps(result, simde_mm_and_ps(lanes, value));
    }

    { /* else if (a == 1) */
      const simde__m128 lanes = simde_mm_andnot_ps(claimed, simde_mm_cmpeq_ps(a, one));
      claimed = simde_mm_or_ps(claimed, lanes);
      const simde__m128 value = simde_mm_set1_ps(SIMDE_MATH_INFINITYF);
      result = simde_mm_or_ps(result, simde_mm_and_ps(lanes, value));
    }

    { /* else if (a < 0.02425) */
      const simde__m128 lanes = simde_mm_andnot_ps(claimed, simde_mm_cmplt_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.02425))));

      /* Skip the whole branch when no lane needs it. */
      if (!simde_mm_test_all_zeros(simde_mm_castps_si128(lanes), simde_x_mm_setone_si128())) {
        claimed = simde_mm_or_ps(claimed, lanes);

        /* q = simde_math_sqrtf(-2.0f * simde_math_logf(a)); */
        const simde__m128 q = simde_mm_sqrt_ps(simde_mm_mul_ps(simde_mm_log_ps(a), minus_two));

        /* numerator = (((((c_c[0] * q + c_c[1]) * q + c_c[2]) * q + c_c[3]) * q + c_c[4]) * q + c_c[5]) */
        simde__m128 num = tail_num[0];
        for (size_t i = 1 ; i < (sizeof(tail_num) / sizeof(tail_num[0])) ; i++) {
          num = simde_mm_fmadd_ps(num, q, tail_num[i]);
        }

        /* denominator = ((((c_d[0] * q + c_d[1]) * q + c_d[2]) * q + c_d[3]) * q + 1) */
        simde__m128 den = tail_den[0];
        for (size_t i = 1 ; i < (sizeof(tail_den) / sizeof(tail_den[0])) ; i++) {
          den = simde_mm_fmadd_ps(den, q, tail_den[i]);
        }
        den = simde_mm_fmadd_ps(den, q, one);

        result = simde_mm_or_ps(result, simde_mm_and_ps(lanes, simde_mm_div_ps(num, den)));
      }
    }

    { /* else if (a > 0.97575) */
      const simde__m128 lanes = simde_mm_andnot_ps(claimed, simde_mm_cmpgt_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.97575))));

      if (!simde_mm_test_all_zeros(simde_mm_castps_si128(lanes), simde_x_mm_setone_si128())) {
        claimed = simde_mm_or_ps(claimed, lanes);

        /* q = simde_math_sqrtf(-2.0f * simde_math_logf(1.0f - a)); */
        const simde__m128 q = simde_mm_sqrt_ps(simde_mm_mul_ps(simde_mm_log_ps(simde_mm_sub_ps(one, a)), minus_two));

        /* Same numerator polynomial as the low tail, then negated. */
        simde__m128 num = tail_num[0];
        for (size_t i = 1 ; i < (sizeof(tail_num) / sizeof(tail_num[0])) ; i++) {
          num = simde_mm_fmadd_ps(num, q, tail_num[i]);
        }
        num = simde_x_mm_negate_ps(num);

        simde__m128 den = tail_den[0];
        for (size_t i = 1 ; i < (sizeof(tail_den) / sizeof(tail_den[0])) ; i++) {
          den = simde_mm_fmadd_ps(den, q, tail_den[i]);
        }
        den = simde_mm_fmadd_ps(den, q, one);

        result = simde_mm_or_ps(result, simde_mm_and_ps(lanes, simde_mm_div_ps(num, den)));
      }
    }

    /* else: every lane not claimed above uses the central approximation. */
    if (!simde_mm_test_all_ones(simde_mm_castps_si128(claimed))) {
      const simde__m128 lanes = simde_x_mm_not_ps(claimed);

      /* Central numerator coefficients (c_a in the scalar code). */
      const simde__m128 mid_num[] = {
        simde_mm_set1_ps(SIMDE_FLOAT32_C(-3.969683028665376e+01)),
        simde_mm_set1_ps(SIMDE_FLOAT32_C( 2.209460984245205e+02)),
        simde_mm_set1_ps(SIMDE_FLOAT32_C(-2.759285104469687e+02)),
        simde_mm_set1_ps(SIMDE_FLOAT32_C( 1.383577518672690e+02)),
        simde_mm_set1_ps(SIMDE_FLOAT32_C(-3.066479806614716e+01)),
        simde_mm_set1_ps(SIMDE_FLOAT32_C( 2.506628277459239e+00))
      };
      /* Central denominator coefficients (c_b); the constant term 1 is
       * applied separately after the loop. */
      const simde__m128 mid_den[] = {
        simde_mm_set1_ps(SIMDE_FLOAT32_C(-5.447609879822406e+01)),
        simde_mm_set1_ps(SIMDE_FLOAT32_C( 1.615858368580409e+02)),
        simde_mm_set1_ps(SIMDE_FLOAT32_C(-1.556989798598866e+02)),
        simde_mm_set1_ps(SIMDE_FLOAT32_C( 6.680131188771972e+01)),
        simde_mm_set1_ps(SIMDE_FLOAT32_C(-1.328068155288572e+01))
      };

      /* q = a - 0.5f, r = q * q */
      const simde__m128 q = simde_mm_sub_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.5)));
      const simde__m128 r = simde_mm_mul_ps(q, q);

      /* numerator = (((((c_a[0] * r + c_a[1]) * r + c_a[2]) * r + c_a[3]) * r + c_a[4]) * r + c_a[5]) * q */
      simde__m128 num = mid_num[0];
      for (size_t i = 1 ; i < (sizeof(mid_num) / sizeof(mid_num[0])) ; i++) {
        num = simde_mm_fmadd_ps(num, r, mid_num[i]);
      }
      num = simde_mm_mul_ps(num, q);

      /* denominator = (((((c_b[0] * r + c_b[1]) * r + c_b[2]) * r + c_b[3]) * r + c_b[4]) * r + 1) */
      simde__m128 den = mid_den[0];
      for (size_t i = 1 ; i < (sizeof(mid_den) / sizeof(mid_den[0])) ; i++) {
        den = simde_mm_fmadd_ps(den, r, mid_den[i]);
      }
      den = simde_mm_fmadd_ps(den, r, one);

      result = simde_mm_or_ps(result, simde_mm_and_ps(lanes, simde_mm_div_ps(num, den)));
    }

    return result;
  #else
    /* No suitable vector width: evaluate each lane with the scalar routine. */
    simde__m128_private
      r_,
      a_ = simde__m128_to_private(a);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
      r_.f32[i] = simde_math_cdfnorminvf(a_.f32[i]);
    }

    return simde__m128_from_private(r_);
  #endif
}
/* When native aliases are enabled, drop any existing _mm_cdfnorminv_ps
 * macro and forward that name to simde_mm_cdfnorminv_ps. */
#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES)
#undef _mm_cdfnorminv_ps
#define _mm_cdfnorminv_ps(a) simde_mm_cdfnorminv_ps(a)
#endif
/* Second iteration: the two tail branches (a < 0.02425 and a > 0.97575)
 * are evaluated together, since they only differ in how q is derived
 * and in the sign of the numerator.
 *
 * Per lane: a < 0 or a > 1 yields 0.0, a == 0 yields -infinity,
 * a == 1 yields +infinity; the remaining lanes use the tail or central
 * rational approximation.  Every branch ORs its masked result into
 * retval, so a lane must be claimed by exactly one branch. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_cdfnorminv_ps (simde__m128 a) {
  #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE)
    return _mm_cdfnorminv_ps(a);
  #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
    simde__m128 matched, retval = simde_mm_setzero_ps();

    { /* if (a < 0 || a > 1) */
      matched = simde_mm_or_ps(simde_mm_cmplt_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.0))), simde_mm_cmpgt_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(1.0))));

      /* We don't actually need to do anything here since we initialize
       * retval to 0.0. */
    }

    { /* else if (a == 0) */
      simde__m128 mask = simde_mm_cmpeq_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.0)));
      mask = simde_mm_andnot_ps(matched, mask);
      matched = simde_mm_or_ps(matched, mask);

      simde__m128 res = simde_mm_set1_ps(-SIMDE_MATH_INFINITYF);

      retval = simde_mm_or_ps(retval, simde_mm_and_ps(mask, res));
    }

    { /* else if (a == 1) */
      simde__m128 mask = simde_mm_cmpeq_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(1.0)));
      mask = simde_mm_andnot_ps(matched, mask);
      matched = simde_mm_or_ps(matched, mask);

      simde__m128 res = simde_mm_set1_ps(SIMDE_MATH_INFINITYF);

      retval = simde_mm_or_ps(retval, simde_mm_and_ps(mask, res));
    }

    { /* The next two conditions share a lot of code; they only differ
       * when calculating q, and possibly negating the numerator.  We
       * handle them together to avoid duplicating work. */

      /* else if (a < 0.02425)
       *
       * The "else" matters: lanes with a < 0 or a == 0 also compare
       * less than 0.02425, so lanes claimed by an earlier branch must
       * be removed.  Otherwise the tail result for those lanes (NaN,
       * since it is derived from log of a non-positive value) would be
       * OR'ed on top of the 0.0 / -infinity already stored in retval. */
      simde__m128 mask_lo = simde_mm_cmplt_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.02425)));
      mask_lo = simde_mm_andnot_ps(matched, mask_lo);

      /* else if (a > 0.97575)
       *
       * Likewise, lanes with a > 1 or a == 1 must be excluded. */
      simde__m128 mask_hi = simde_mm_cmpgt_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.97575)));
      mask_hi = simde_mm_andnot_ps(matched, mask_hi);

      simde__m128 mask = simde_mm_or_ps(mask_lo, mask_hi);

      /* Skip the tail computation entirely when no lane needs it. */
      if (!simde_mm_test_all_zeros(simde_mm_castps_si128(mask), simde_x_mm_setone_si128())) {
        matched = simde_mm_or_ps(matched, mask);

        /* lo: q = a
         * hi: q = (1.0 - a)
         *
         * Lanes in neither tail also receive q = a; whatever they
         * compute is discarded by the final AND with mask. */
        simde__m128 q = simde_mm_sub_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(1.0)), a);
        q = simde_mm_and_ps(mask_hi, q);
        q = simde_mm_or_ps(q, simde_mm_andnot_ps(mask_hi, a));

        /* q = simde_math_sqrtf(-2.0f * simde_math_logf(q)) */
        q = simde_mm_log_ps(q);
        q = simde_mm_mul_ps(q, simde_mm_set1_ps(SIMDE_FLOAT32_C(-2.0)));
        q = simde_mm_sqrt_ps(q);

        /* float numerator = (((((c_c[0] * q + c_c[1]) * q + c_c[2]) * q + c_c[3]) * q + c_c[4]) * q + c_c[5]); */
        simde__m128 numerator = simde_mm_set1_ps(SIMDE_FLOAT32_C(-7.784894002430293e-03));
        numerator = simde_mm_fmadd_ps(numerator, q, simde_mm_set1_ps(SIMDE_FLOAT32_C(-3.223964580411365e-01)));
        numerator = simde_mm_fmadd_ps(numerator, q, simde_mm_set1_ps(SIMDE_FLOAT32_C(-2.400758277161838e+00)));
        numerator = simde_mm_fmadd_ps(numerator, q, simde_mm_set1_ps(SIMDE_FLOAT32_C(-2.549732539343734e+00)));
        numerator = simde_mm_fmadd_ps(numerator, q, simde_mm_set1_ps(SIMDE_FLOAT32_C( 4.374664141464968e+00)));
        numerator = simde_mm_fmadd_ps(numerator, q, simde_mm_set1_ps(SIMDE_FLOAT32_C( 2.938163982698783e+00)));
        { /* Negate numerator for hi branch: multiply by -1.0 in the hi
           * lanes and by 1.0 everywhere else. */
          simde__m128 multiplier = simde_mm_or_ps(
            simde_mm_andnot_ps(mask_hi, simde_mm_set1_ps(SIMDE_FLOAT32_C( 1.0))),
            simde_mm_and_ps   (mask_hi, simde_mm_set1_ps(SIMDE_FLOAT32_C(-1.0))));
          numerator = simde_mm_mul_ps(multiplier, numerator);
        }

        /* float denominator = (((((c_d[0] * q + c_d[1]) * q + c_d[2]) * q + c_d[3]) * q + 1)); */
        simde__m128 denominator = simde_mm_set1_ps(SIMDE_FLOAT32_C( 7.784695709041462e-03));
        denominator = simde_mm_fmadd_ps(denominator, q, simde_mm_set1_ps(SIMDE_FLOAT32_C( 3.224671290700398e-01)));
        denominator = simde_mm_fmadd_ps(denominator, q, simde_mm_set1_ps(SIMDE_FLOAT32_C( 2.445134137142996e+00)));
        denominator = simde_mm_fmadd_ps(denominator, q, simde_mm_set1_ps(SIMDE_FLOAT32_C( 3.754408661907416e+00)));
        denominator = simde_mm_fmadd_ps(denominator, q, simde_mm_set1_ps(SIMDE_FLOAT32_C(1.0)));

        /* res = numerator / denominator; */
        simde__m128 res = simde_mm_div_ps(numerator, denominator);

        retval = simde_mm_or_ps(retval, simde_mm_and_ps(mask, res));
      }
    }

    { /* else */
      if (!simde_mm_test_all_ones(simde_mm_castps_si128(matched))) {
        /* Everything no earlier branch claimed. */
        simde__m128 mask = simde_x_mm_not_ps(matched);

        /* q = a - 0.5f */
        simde__m128 q = simde_mm_sub_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.5)));

        /* r = q * q */
        simde__m128 r = simde_mm_mul_ps(q, q);

        /* float numerator = ((((((c_a[0] * r + c_a[1]) * r + c_a[2]) * r + c_a[3]) * r + c_a[4]) * r + c_a[5]) * q) */
        simde__m128 numerator = simde_mm_set1_ps(SIMDE_FLOAT32_C(-3.969683028665376e+01));
        numerator = simde_mm_fmadd_ps(numerator, r, simde_mm_set1_ps(SIMDE_FLOAT32_C( 2.209460984245205e+02)));
        numerator = simde_mm_fmadd_ps(numerator, r, simde_mm_set1_ps(SIMDE_FLOAT32_C(-2.759285104469687e+02)));
        numerator = simde_mm_fmadd_ps(numerator, r, simde_mm_set1_ps(SIMDE_FLOAT32_C( 1.383577518672690e+02)));
        numerator = simde_mm_fmadd_ps(numerator, r, simde_mm_set1_ps(SIMDE_FLOAT32_C(-3.066479806614716e+01)));
        numerator = simde_mm_fmadd_ps(numerator, r, simde_mm_set1_ps(SIMDE_FLOAT32_C( 2.506628277459239e+00)));
        numerator = simde_mm_mul_ps(numerator, q);

        /* float denominator = (((((c_b[0] * r + c_b[1]) * r + c_b[2]) * r + c_b[3]) * r + c_b[4]) * r + 1) */
        simde__m128 denominator = simde_mm_set1_ps(SIMDE_FLOAT32_C(-5.447609879822406e+01));
        denominator = simde_mm_fmadd_ps(denominator, r, simde_mm_set1_ps(SIMDE_FLOAT32_C( 1.615858368580409e+02)));
        denominator = simde_mm_fmadd_ps(denominator, r, simde_mm_set1_ps(SIMDE_FLOAT32_C(-1.556989798598866e+02)));
        denominator = simde_mm_fmadd_ps(denominator, r, simde_mm_set1_ps(SIMDE_FLOAT32_C( 6.680131188771972e+01)));
        denominator = simde_mm_fmadd_ps(denominator, r, simde_mm_set1_ps(SIMDE_FLOAT32_C(-1.328068155288572e+01)));
        denominator = simde_mm_fmadd_ps(denominator, r, simde_mm_set1_ps(SIMDE_FLOAT32_C(1.0)));

        /* res = numerator / denominator */
        simde__m128 res = simde_mm_div_ps(numerator, denominator);

        retval = simde_mm_or_ps(retval, simde_mm_and_ps(mask, res));
      }
    }

    return retval;
  #else
    /* No suitable vector width: evaluate each lane with the scalar routine. */
    simde__m128_private
      r_,
      a_ = simde__m128_to_private(a);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
      r_.f32[i] = simde_math_cdfnorminvf(a_.f32[i]);
    }

    return simde__m128_from_private(r_);
  #endif
}
/* When native aliases are enabled, drop any existing _mm_cdfnorminv_ps
 * macro and forward that name to simde_mm_cdfnorminv_ps. */
#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES)
#undef _mm_cdfnorminv_ps
#define _mm_cdfnorminv_ps(a) simde_mm_cdfnorminv_ps(a)
#endif
/* Third iteration: both tails and the central "else" region are
 * evaluated in a single pass, selecting per-lane coefficients instead
 * of running separate polynomial evaluations.
 *
 * Per lane: a < 0 or a > 1 yields 0.0, a == 0 yields -infinity,
 * a == 1 yields +infinity; the remaining lanes use the tail or central
 * rational approximation.  Every branch ORs its masked result into
 * retval, so a lane must be claimed by exactly one branch. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128
simde_mm_cdfnorminv_ps (simde__m128 a) {
  #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE)
    return _mm_cdfnorminv_ps(a);
  #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
    simde__m128 matched, retval = simde_mm_setzero_ps();

    { /* if (a < 0 || a > 1) */
      matched = simde_mm_or_ps(simde_mm_cmplt_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.0))), simde_mm_cmpgt_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(1.0))));

      /* We don't actually need to do anything here since we initialize
       * retval to 0.0. */
    }

    { /* else if (a == 0) */
      simde__m128 mask = simde_mm_cmpeq_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.0)));
      mask = simde_mm_andnot_ps(matched, mask);
      matched = simde_mm_or_ps(matched, mask);

      simde__m128 res = simde_mm_set1_ps(-SIMDE_MATH_INFINITYF);

      retval = simde_mm_or_ps(retval, simde_mm_and_ps(mask, res));
    }

    { /* else if (a == 1) */
      simde__m128 mask = simde_mm_cmpeq_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(1.0)));
      mask = simde_mm_andnot_ps(matched, mask);
      matched = simde_mm_or_ps(matched, mask);

      simde__m128 res = simde_mm_set1_ps(SIMDE_MATH_INFINITYF);

      retval = simde_mm_or_ps(retval, simde_mm_and_ps(mask, res));
    }

    { /* Remaining conditions.
       *
       * Including the else case in this complicates things a lot, but
       * we're using cheap operations to get rid of expensive multiply
       * and add functions.  This should be a small improvement on SSE
       * prior to 4.1.  On SSE 4.1 we can use _mm_blendv_ps which is
       * very fast and this becomes a huge win.  NEON, AltiVec, and
       * WASM also have blend operations, so this should be a big win
       * there, too. */

      /* else if (a < 0.02425)
       *
       * The "else" matters: lanes with a < 0 or a == 0 also compare
       * less than 0.02425, so lanes claimed by an earlier branch must
       * be removed.  Without this the final mask would cover them and
       * NaN bits would be OR'ed on top of the 0.0 / -infinity already
       * stored in retval. */
      simde__m128 mask_lo = simde_mm_cmplt_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.02425)));
      mask_lo = simde_mm_andnot_ps(matched, mask_lo);

      /* else if (a > 0.97575)
       *
       * Likewise, lanes with a > 1 or a == 1 must be excluded. */
      simde__m128 mask_hi = simde_mm_cmpgt_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.97575)));
      mask_hi = simde_mm_andnot_ps(matched, mask_hi);

      simde__m128 mask = simde_mm_or_ps(mask_lo, mask_hi);
      matched = simde_mm_or_ps(matched, mask);

      /* else */
      simde__m128 mask_el = simde_x_mm_not_ps(matched);

      /* mask now covers exactly the lanes no earlier branch claimed. */
      mask = simde_mm_or_ps(mask, mask_el);

      /* r = a - 0.5f */
      simde__m128 r = simde_mm_sub_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.5)));

      /* lo: q = a
       * hi: q = (1.0 - a)
       *
       * All other lanes get q = 0 here; the el lanes are overwritten
       * below, and previously claimed lanes are discarded by the final
       * AND with mask. */
      simde__m128 q = simde_mm_and_ps(mask_lo, a);
      q = simde_mm_or_ps(q, simde_mm_and_ps(mask_hi, simde_mm_sub_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(1.0)), a)));

      /* q = simde_math_sqrtf(-2.0f * simde_math_logf(q)) */
      q = simde_mm_log_ps(q);
      q = simde_mm_mul_ps(q, simde_mm_set1_ps(SIMDE_FLOAT32_C(-2.0)));
      q = simde_mm_sqrt_ps(q);

      /* el: q = r * r
       *
       * simde_x_mm_select_ps(x, y, m) is used throughout as "y where m
       * is set, x elsewhere". */
      q = simde_x_mm_select_ps(q, simde_mm_mul_ps(r, r), mask_el);

      /* lo: float numerator = ((((((c_c[0] * q + c_c[1]) * q + c_c[2]) * q + c_c[3]) * q + c_c[4]) * q + c_c[5]) *  1.0f); */
      /* hi: float numerator = ((((((c_c[0] * q + c_c[1]) * q + c_c[2]) * q + c_c[3]) * q + c_c[4]) * q + c_c[5]) * -1.0f); */
      /* el: float numerator = ((((((c_a[0] * q + c_a[1]) * q + c_a[2]) * q + c_a[3]) * q + c_a[4]) * q + c_a[5]) *     r); */
      simde__m128 numerator = simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(-7.784894002430293e-03)), simde_mm_set1_ps(SIMDE_FLOAT32_C(-3.969683028665376e+01)), mask_el);
      numerator = simde_mm_fmadd_ps(numerator, q, simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(-3.223964580411365e-01)), simde_mm_set1_ps(SIMDE_FLOAT32_C( 2.209460984245205e+02)), mask_el));
      numerator = simde_mm_fmadd_ps(numerator, q, simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(-2.400758277161838e+00)), simde_mm_set1_ps(SIMDE_FLOAT32_C(-2.759285104469687e+02)), mask_el));
      numerator = simde_mm_fmadd_ps(numerator, q, simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(-2.549732539343734e+00)), simde_mm_set1_ps(SIMDE_FLOAT32_C( 1.383577518672690e+02)), mask_el));
      numerator = simde_mm_fmadd_ps(numerator, q, simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C( 4.374664141464968e+00)), simde_mm_set1_ps(SIMDE_FLOAT32_C(-3.066479806614716e+01)), mask_el));
      numerator = simde_mm_fmadd_ps(numerator, q, simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C( 2.938163982698783e+00)), simde_mm_set1_ps(SIMDE_FLOAT32_C( 2.506628277459239e+00)), mask_el));
      { /* Final per-branch factor: 1.0 (lo), -1.0 (hi) or r (el). */
        simde__m128 multiplier;
        multiplier =                            simde_mm_and_ps(mask_lo, simde_mm_set1_ps(SIMDE_FLOAT32_C( 1.0)));
        multiplier = simde_mm_or_ps(multiplier, simde_mm_and_ps(mask_hi, simde_mm_set1_ps(SIMDE_FLOAT32_C(-1.0))));
        multiplier = simde_mm_or_ps(multiplier, simde_mm_and_ps(mask_el, r));
        numerator = simde_mm_mul_ps(numerator, multiplier);
      }

      /* lo/hi: float denominator = (((((c_d[0] * q + c_d[1]) * q + c_d[2]) * q + c_d[3]) * 1 +   0.0f) * q + 1); */
      /*    el: float denominator = (((((c_b[0] * q + c_b[1]) * q + c_b[2]) * q + c_b[3]) * q + c_b[4]) * q + 1); */
      simde__m128 denominator = simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C( 7.784695709041462e-03)), simde_mm_set1_ps(SIMDE_FLOAT32_C(-5.447609879822406e+01)), mask_el);
      denominator = simde_mm_fmadd_ps(denominator, q, simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C( 3.224671290700398e-01)), simde_mm_set1_ps(SIMDE_FLOAT32_C( 1.615858368580409e+02)), mask_el));
      denominator = simde_mm_fmadd_ps(denominator, q, simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C( 2.445134137142996e+00)), simde_mm_set1_ps(SIMDE_FLOAT32_C(-1.556989798598866e+02)), mask_el));
      denominator = simde_mm_fmadd_ps(denominator, q, simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C( 3.754408661907416e+00)), simde_mm_set1_ps(SIMDE_FLOAT32_C( 6.680131188771972e+01)), mask_el));
      /* The tail polynomial has one term fewer than the central one,
       * so this step is a no-op (x * 1 + 0) for the lo/hi lanes. */
      denominator = simde_mm_fmadd_ps(denominator, simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(  1.0)), q, mask_el),
                                      simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(  0.0)), simde_mm_set1_ps(SIMDE_FLOAT32_C(-1.328068155288572e+01)), mask_el));
      denominator = simde_mm_fmadd_ps(denominator, q, simde_mm_set1_ps(SIMDE_FLOAT32_C(1.0)));

      /* res = numerator / denominator; */
      simde__m128 res = simde_mm_div_ps(numerator, denominator);

      retval = simde_mm_or_ps(retval, simde_mm_and_ps(mask, res));
    }

    return retval;
  #else
    /* No suitable vector width: evaluate each lane with the scalar routine. */
    simde__m128_private
      r_,
      a_ = simde__m128_to_private(a);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
      r_.f32[i] = simde_math_cdfnorminvf(a_.f32[i]);
    }

    return simde__m128_from_private(r_);
  #endif
}
/* When native aliases are enabled, drop any existing _mm_cdfnorminv_ps
 * macro and forward that name to simde_mm_cdfnorminv_ps. */
#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES)
#undef _mm_cdfnorminv_ps
#define _mm_cdfnorminv_ps(a) simde_mm_cdfnorminv_ps(a)
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment