Skip to content

Instantly share code, notes, and snippets.

@rikusalminen
Created July 3, 2012 14:55
Show Gist options
  • Save rikusalminen/3040241 to your computer and use it in GitHub Desktop.
Save rikusalminen/3040241 to your computer and use it in GitHub Desktop.
SIMD dot products: ARM NEON, SSE3, SSE
/*
 * vdot(x, y): 4-component dot product of two float vectors.
 *
 * The scalar result x0*y0 + x1*y1 + x2*y2 + x3*y3 is returned broadcast
 * into every lane of the result vector, so callers can keep working in
 * SIMD registers without extracting a scalar.
 *
 * One implementation is selected at compile time:
 *   - ARM NEON : multiply, then two swap-and-add steps
 *   - SSE3     : multiply, then two horizontal adds
 *   - otherwise: multiply, then two shuffle-and-add steps (plain SSE)
 *
 * NOTE(review): `vec4` and `vshuffle` are declared outside this section.
 * `vec4` is assumed to be the native 4 x float vector type of the target
 * (float32x4_t / __m128 or a compatible GCC vector type) - confirm.
 */
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
static inline vec4 vdot(vec4 x, vec4 y)
{
    /* prod = [p0, p1, p2, p3] */
    vec4 prod = vmulq_f32(x, y);
    /* vrev64q swaps the lanes inside each 64-bit half:
     * sum1 = [p0+p1, p0+p1, p2+p3, p2+p3] */
    vec4 sum1 = vaddq_f32(prod, vrev64q_f32(prod));
    /* Swap the two 64-bit halves and add: every lane = p0+p1+p2+p3 */
    vec4 sum2 = vaddq_f32(sum1, vcombine_f32(vget_high_f32(sum1), vget_low_f32(sum1)));
    return sum2;
}

/* Original NEON entry point, kept under its old name (and linkage) so
 * existing callers of dot() keep working; new code should call vdot(),
 * which is available on every target. */
vec4 dot(vec4 a, vec4 b)
{
    return vdot(a, b);
}
#elif defined(__SSE3__)
static inline vec4 vdot(vec4 x, vec4 y)
{
    /* prod = [p0, p1, p2, p3] */
    vec4 prod = x * y;
    /* hadd adds adjacent pairs: sum1 = [p0+p1, p2+p3, p0+p1, p2+p3] */
    vec4 sum1 = _mm_hadd_ps(prod, prod);
    /* Second hadd collapses the pairs: every lane = p0+p1+p2+p3 */
    vec4 sum2 = _mm_hadd_ps(sum1, sum1);
    return sum2;
}
#else // SSE
static inline vec4 vdot(vec4 x, vec4 y)
{
    /* prod = [p0, p1, p2, p3] */
    vec4 prod = x * y;
    /* NOTE(review): assumes vshuffle(a, b, i, j, k, l) yields lanes
     * [a[i], a[j], b[k], b[l]] (lane-order indices) - confirm against its
     * definition. Under that assumption:
     * sum1 = [p0+p1, p0+p1, p2+p3, p2+p3] */
    vec4 sum1 = prod + vshuffle(prod, prod, 1, 0, 3, 2);
    /* Add the opposite half's pair sum: every lane = p0+p1+p2+p3 */
    vec4 sum2 = sum1 + vshuffle(sum1, sum1, 2, 2, 0, 0);
    return sum2;
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment