This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; Inner loop of a vdpps-based kernel (NOTE(review): appears to be the compiled
; body of dot3_aos_vector_dp -- confirm against the full listing).
; rcx = input cursor, rax = output cursor, edx = remaining iteration count.
; Step the input cursor back by 0Ch = 12 bytes (presumably one 3-float element -- confirm).
lea rcx,[rcx-0Ch] | |
; Step the output cursor back by 4 bytes (one float).
lea rax,[rax-4] | |
; Unaligned 16-byte load: four consecutive floats starting at the input cursor.
vmovups xmm0,xmmword ptr [rcx] | |
; Dot product of xmm0 with itself; imm8 71h = multiply lanes 0..2 (high nibble 0111),
; write the sum to lane 0 only (low nibble 0001).
vdpps xmm0,xmm0,xmm0,71h | |
; Store the low float (the dot product) at the output cursor.
vmovss dword ptr [rax],xmm0 | |
; Decrement the counter; `sub` sets ZF for the branch below.
sub edx,1 | |
; Loop back while the counter is non-zero.
jne benchmark<<lambda_d1ac89d5e59a169233af7a419374e043>,<lambda_c092a5680821d9f5b8bc5a7043f59100> >+0C0h (07FF7B83F2100h) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void dot3_aos_vector_dp(const vector<Vec3f>& vs, vector<float>& dp) { | |
// 0000 0000 0111 0001: mul lower three components, store sum in lowest component | |
static const auto mask = 0x71; | |
for (auto j = 0; j < reps; ++j) { | |
const auto pvs = (float*)vs.data(); | |
auto pdp = (float*)dp.data(); | |
auto i = vector_len; | |
while (i--) { | |
// load 16 bytes (xyz|x) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
lea rax,[rax-0Ch] | |
lea rdx,[rdx-4] | |
vmovss xmm0,dword ptr [rax-8] | |
vmovss xmm2,dword ptr [rax-4] | |
vmovss xmm3,dword ptr [rax] | |
vmulss xmm1,xmm0,xmm0 | |
vmulss xmm0,xmm2,xmm2 | |
vaddss xmm2,xmm1,xmm0 | |
vmulss xmm1,xmm3,xmm3 | |
vaddss xmm2,xmm2,xmm1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void dot3_aos_scalar(const vector<Vec3f>& vs, vector<float>& dp) { | |
for (auto j = 0; j < reps; ++j) { | |
auto i = vector_len; | |
while (i--) { | |
dp[i] = vs[i].x * vs[i].x + vs[i].y * vs[i].y + vs[i].z * vs[i].z; | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
lea ebp,[r11+7] | |
cmp ebp,esi | |
jae 00007FF98D06828A | |
vmovupd ymm0,ymmword ptr [rcx+r11*4+10h] | |
cmp ebp,edi | |
jae 00007FF98D06828A | |
vmovupd ymm1,ymmword ptr [rdx+r11*4+10h] | |
cmp ebp,ebx | |
jae 00007FF98D06828A | |
vmovupd ymm2,ymmword ptr [r8+r11*4+10h] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
static void Dot3SoaVectorized(float[] xs, float[] ys, float[] zs, float[] dp) { | |
for (var j = 0; j < reps; ++j) { | |
for (var i = 0; i < dp.Length; i += laneWidth) { | |
var x = new Vector<float>(xs, i); | |
var y = new Vector<float>(ys, i); | |
var z = new Vector<float>(zs, i); | |
var d = x * x + y * y + z * z; | |
d.CopyTo(dp, i); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
movsxd rsi,r11d | |
vmovss xmm0,dword ptr [rcx+rsi*4+10h] | |
vmulss xmm0,xmm0,xmm0 | |
movsxd rsi,r11d | |
vmovss xmm1,dword ptr [rdx+rsi*4+10h] | |
vmulss xmm1,xmm1,xmm1 | |
vaddss xmm0,xmm0,xmm1 | |
movsxd rsi,r11d | |
vmovss xmm1,dword ptr [r8+rsi*4+10h] | |
vmulss xmm1,xmm1,xmm1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Scalar struct-of-arrays kernel: for every index of <paramref name="dp"/>,
/// stores xs[i]*xs[i] + ys[i]*ys[i] + zs[i]*zs[i]. The full pass over the
/// arrays is repeated <c>reps</c> times.
/// </summary>
/// <param name="xs">X components, read at indices [0, dp.Length).</param>
/// <param name="ys">Y components, read at indices [0, dp.Length).</param>
/// <param name="zs">Z components, read at indices [0, dp.Length).</param>
/// <param name="dp">Output array; every element is overwritten.</param>
static void Dot3SoaScalar(float[] xs, float[] ys, float[] zs, float[] dp) {
    var pass = 0;
    while (pass < reps) {
        // One element per iteration; the loop bound is the output length.
        for (var n = 0; n < dp.Length; n++) {
            dp[n] = xs[n] * xs[n]
                  + ys[n] * ys[n]
                  + zs[n] * zs[n];
        }
        ++pass;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xor r9d,r9d | |
mov r10d,dword ptr [rbx+8] | |
movsxd r10,r10d | |
cmp r10,8 | |
setge r10b | |
movzx r10d,r10b | |
mov r11d,dword ptr [rbp+8] | |
movsxd r11,r11d | |
cmp r11,8 | |
setge r11b |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
static void Dot3AosGather(Vector3[] vs, float[] dp) { | |
var xtmp = new float[laneWidth]; | |
var ytmp = new float[laneWidth]; | |
var ztmp = new float[laneWidth]; | |
for (var j = 0; j < reps; ++j) { | |
for (var i = 0; i < dp.Length; i += laneWidth) { | |
for (var k = 0; k < laneWidth; ++k) { | |
xtmp[k] = vs[i + k].X; | |
ytmp[k] = vs[i + k].Y; | |
ztmp[k] = vs[i + k].Z; |