Skip to content

Instantly share code, notes, and snippets.

@goldshtn
Last active March 23, 2018 13:28
Show Gist options
  • Save goldshtn/a1cf8a1d3007bf731224 to your computer and use it in GitHub Desktop.
Save goldshtn/a1cf8a1d3007bf731224 to your computer and use it in GitHub Desktop.
Vectorized dot product of float arrays in C# and C++
; C++ core loop; the key intrinsic is _mm_dp_ps, which appears below as a single dpps instruction
00007fff`548b10d5 0f100c0a movups xmm1,xmmword ptr [rdx+rcx] ; LOOP
00007fff`548b10d9 0f1011 movups xmm2,xmmword ptr [rcx]
00007fff`548b10dc 4883c110 add rcx,10h
00007fff`548b10e0 660f3a40d1f1 dpps xmm2,xmm1,0F1h
00007fff`548b10e6 f30f58c2 addss xmm0,xmm2
00007fff`548b10ea 4983e801 sub r8,1
00007fff`548b10ee 75e5 jne 00007fff`548b10d5 ; LOOP
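; C# core loop; the key intrinsic is Vector.Dot(Vector<float>, Vector<float>).
; Runs 70% slower than the C++ version. In the listing below, each iteration performs
; two range checks (cmp/jae RANGE_FAIL) and expands the dot product into mulps followed
; by two shufps/addps pairs instead of a single dpps.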
00007ffe`fc663686 448d5003 lea r10d,[rax+3] ; LOOP
00007ffe`fc66368a 453bd0 cmp r10d,r8d
00007ffe`fc66368d 7337 jae RANGE_FAIL
00007ffe`fc66368f 0f104c8110 movups xmm1,xmmword ptr [rcx+rax*4+10h]
00007ffe`fc663694 453bd1 cmp r10d,r9d
00007ffe`fc663697 732d jae RANGE_FAIL
00007ffe`fc663699 0f10548210 movups xmm2,xmmword ptr [rdx+rax*4+10h]
00007ffe`fc66369e 0f59ca mulps xmm1,xmm2
00007ffe`fc6636a1 0f28d9 movaps xmm3,xmm1
00007ffe`fc6636a4 0fc6dbb1 shufps xmm3,xmm3,0B1h
00007ffe`fc6636a8 0f58cb addps xmm1,xmm3
00007ffe`fc6636ab 0f28d9 movaps xmm3,xmm1
00007ffe`fc6636ae 0fc6db1b shufps xmm3,xmm3,1Bh
00007ffe`fc6636b2 0f58cb addps xmm1,xmm3
00007ffe`fc6636b5 f30f58c1 addss xmm0,xmm1
00007ffe`fc6636b9 83c004 add eax,4
00007ffe`fc6636bc 443bc0 cmp r8d,eax
00007ffe`fc6636bf 7fc5 jg 00007ffe`fc663686 ; LOOP
/// @brief Computes the dot product of two float arrays using the SSE4.1
///        `_mm_dp_ps` intrinsic, four elements at a time.
///
/// @param a      Pointer to the first array; must reference at least `length` floats.
/// @param b      Pointer to the second array; must reference at least `length` floats.
/// @param length Number of elements to process. Values <= 0 yield 0.
/// @return Sum of a[i] * b[i] for i in [0, length).
///
/// Neither pointer needs to be 16-byte aligned: unaligned loads are used.
extern "C" __declspec(dllexport) float vectorized_native_same_type_float(float* a, float* b, int length)
{
    float aux = 0;

    // Only whole groups of four go through the SIMD path. Loading a partial
    // group with _mm_loadu_ps would read past the end of both arrays.
    const int vectorized_end = length > 0 ? length - (length % 4) : 0;

    int i = 0;
    for (; i < vectorized_end; i += 4)
    {
        const __m128 va = _mm_loadu_ps(a + i);
        const __m128 vb = _mm_loadu_ps(b + i);
        // Mask 0xF1: multiply all four lanes (high nibble 0xF) and store the
        // horizontal sum in lane 0 only (low nibble 0x1).
        const __m128 dp = _mm_dp_ps(va, vb, 0xF1);
        // Portable extraction of lane 0 (m128_f32 is an MSVC-specific member).
        aux += _mm_cvtss_f32(dp);
    }

    // Scalar tail for the remaining 0-3 elements.
    for (; i < length; ++i)
    {
        aux += a[i] * b[i];
    }

    return aux;
}
/// <summary>
/// Computes the dot product of two float arrays using <c>Vector&lt;float&gt;</c>,
/// processing <c>Vector&lt;float&gt;.Count</c> elements per iteration.
/// </summary>
/// <param name="a">First array; its length determines how many elements are processed.</param>
/// <param name="b">Second array; must contain at least <c>a.Length</c> elements.</param>
/// <returns>Sum of <c>a[i] * b[i]</c> for every index of <paramref name="a"/>.</returns>
static float VectorizedSameTypeFloat(float[] a, float[] b)
{
    float aux = 0;
    int vecSize = Vector<float>.Count;

    // Only whole vectors go through the SIMD path: constructing a Vector<float>
    // at an index with fewer than vecSize elements remaining would throw.
    int vectorizedEnd = a.Length - (a.Length % vecSize);

    int i = 0;
    for (; i < vectorizedEnd; i += vecSize)
    {
        Vector<float> va = new Vector<float>(a, i);
        Vector<float> vb = new Vector<float>(b, i);
        aux += Vector.Dot(va, vb);
    }

    // Scalar tail for the elements that do not fill a whole vector.
    for (; i < a.Length; i++)
    {
        aux += a[i] * b[i];
    }

    return aux;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment