Skip to content

Instantly share code, notes, and snippets.

View FrankNiemeyer's full-sized avatar

Frank Niemeyer FrankNiemeyer

View GitHub Profile
; Disassembly listing of a counted loop: each pass loads 16 bytes, takes a
; masked dot product of that vector with itself, and stores one float.
; NOTE(review): presumably the compiled inner loop of the vdpps-based
; array-of-structs kernel -- confirm against the originating source.
;
; Move the source pointer in rcx back by 0Ch (12) bytes. lea leaves the
; flags untouched.
lea rcx,[rcx-0Ch]
; Move the destination pointer in rax back by 4 bytes (one float).
lea rax,[rax-4]
; Unaligned 16-byte load from [rcx] into xmm0.
vmovups xmm0,xmmword ptr [rcx]
; Dot product of xmm0 with itself. Mask 71h: the high nibble (0111b) selects
; the lower three lanes for the multiply, the low nibble (0001b) writes the
; sum to lane 0 only.
vdpps xmm0,xmm0,xmm0,71h
; Store the low float of xmm0 to [rax].
vmovss dword ptr [rax],xmm0
; Decrement the counter in edx; this sets the flags tested by the branch.
sub edx,1
; Branch back to the listed target while the counter is non-zero.
jne benchmark<<lambda_d1ac89d5e59a169233af7a419374e043>,<lambda_c092a5680821d9f5b8bc5a7043f59100> >+0C0h (07FF7B83F2100h)
void dot3_aos_vector_dp(const vector<Vec3f>& vs, vector<float>& dp) {
// 0000 0000 0111 0001: mul lower three components, store sum in lowest component
static const auto mask = 0x71;
for (auto j = 0; j < reps; ++j) {
const auto pvs = (float*)vs.data();
auto pdp = (float*)dp.data();
auto i = vector_len;
while (i--) {
// load 16 bytes (xyz|x)
; Disassembly listing fragment: loads three consecutive floats and accumulates
; the sum of their squares in xmm2 using scalar AVX instructions.
; NOTE(review): the listing stops before any store or loop branch; presumably
; the inner loop of the scalar array-of-structs kernel -- confirm.
;
; Move the pointer in rax back by 0Ch (12) bytes.
lea rax,[rax-0Ch]
; Move the pointer in rdx back by 4 bytes.
lea rdx,[rdx-4]
; Load the three floats at [rax-8], [rax-4] and [rax].
vmovss xmm0,dword ptr [rax-8]
vmovss xmm2,dword ptr [rax-4]
vmovss xmm3,dword ptr [rax]
; xmm1 = square of the first float.
vmulss xmm1,xmm0,xmm0
; xmm0 = square of the second float.
vmulss xmm0,xmm2,xmm2
; xmm2 = first square + second square.
vaddss xmm2,xmm1,xmm0
; xmm1 = square of the third float.
vmulss xmm1,xmm3,xmm3
; xmm2 = sum of all three squares.
vaddss xmm2,xmm2,xmm1
void dot3_aos_scalar(const vector<Vec3f>& vs, vector<float>& dp) {
for (auto j = 0; j < reps; ++j) {
auto i = vector_len;
while (i--) {
dp[i] = vs[i].x * vs[i].x + vs[i].y * vs[i].y + vs[i].z * vs[i].z;
}
}
}
; Disassembly listing fragment: three guarded 32-byte loads that share one
; index register (r11) and use three different base registers (rcx, rdx, r8).
; NOTE(review): presumably JIT output for the Vector<float> loads in
; Dot3SoaVectorized, where the guards would be array range checks and 10h
; the offset of the first element in a managed float[] -- confirm.
;
; ebp = r11 + 7, computed in 32 bits.
lea ebp,[r11+7]
; Unsigned compare: jump to 00007FF98D06828A when ebp >= esi.
cmp ebp,esi
jae 00007FF98D06828A
; Unaligned 32-byte load from rcx + r11*4 + 10h into ymm0.
vmovupd ymm0,ymmword ptr [rcx+r11*4+10h]
; Same guard against edi, then the load relative to rdx into ymm1.
cmp ebp,edi
jae 00007FF98D06828A
vmovupd ymm1,ymmword ptr [rdx+r11*4+10h]
; Same guard against ebx, then the load relative to r8 into ymm2.
cmp ebp,ebx
jae 00007FF98D06828A
vmovupd ymm2,ymmword ptr [r8+r11*4+10h]
static void Dot3SoaVectorized(float[] xs, float[] ys, float[] zs, float[] dp) {
for (var j = 0; j < reps; ++j) {
for (var i = 0; i < dp.Length; i += laneWidth) {
var x = new Vector<float>(xs, i);
var y = new Vector<float>(ys, i);
var z = new Vector<float>(zs, i);
var d = x * x + y * y + z * z;
d.CopyTo(dp, i);
}
}
; Disassembly listing fragment: scalar sum of squares over three float arrays
; addressed as base + index*4 + 10h (bases rcx, rdx and r8; index in r11d).
; NOTE(review): presumably JIT output for the loop body of Dot3SoaScalar;
; the listing stops before the last add and the store -- confirm.
;
; Sign-extend the 32-bit index in r11d into rsi.
movsxd rsi,r11d
; Load one float relative to rcx and square it.
vmovss xmm0,dword ptr [rcx+rsi*4+10h]
vmulss xmm0,xmm0,xmm0
; The same sign-extension is repeated before each of the following loads.
movsxd rsi,r11d
; Load one float relative to rdx and square it.
vmovss xmm1,dword ptr [rdx+rsi*4+10h]
vmulss xmm1,xmm1,xmm1
; xmm0 = first square + second square.
vaddss xmm0,xmm0,xmm1
movsxd rsi,r11d
; Load one float relative to r8 and square it.
vmovss xmm1,dword ptr [r8+rsi*4+10h]
vmulss xmm1,xmm1,xmm1
// Scalar reference kernel for the structure-of-arrays layout.
// For each index i in [0, dp.Length), stores
// xs[i]*xs[i] + ys[i]*ys[i] + zs[i]*zs[i] into dp[i].
// The full pass over dp is repeated `reps` times.
static void Dot3SoaScalar(float[] xs, float[] ys, float[] zs, float[] dp) {
    for (var rep = 0; rep < reps; ++rep) {
        var idx = 0;
        while (idx < dp.Length) {
            // Read the three components in the same order as the sum uses them.
            float x = xs[idx];
            float y = ys[idx];
            float z = zs[idx];
            dp[idx] = x * x + y * y + z * z;
            ++idx;
        }
    }
}
; Disassembly listing fragment: zeroes r9d, then turns two ">= 8" comparisons
; into 0/1 values in r10 and r11.
; NOTE(review): presumably JIT output near the start of Dot3AosGather, where
; [reg+8] would be the length field of a managed array and 8 the vector lane
; count -- confirm.
;
; r9d = 0 (writing the 32-bit register also clears the upper half of r9).
xor r9d,r9d
; Load the 32-bit value at [rbx+8] and sign-extend it to 64 bits.
mov r10d,dword ptr [rbx+8]
movsxd r10,r10d
; r10b = 1 if that value is >= 8 (signed compare), otherwise 0.
cmp r10,8
setge r10b
; Zero-extend the flag byte so r10d holds exactly 0 or 1.
movzx r10d,r10b
; Same load, sign-extension and compare for the value at [rbp+8].
mov r11d,dword ptr [rbp+8]
movsxd r11,r11d
cmp r11,8
setge r11b
static void Dot3AosGather(Vector3[] vs, float[] dp) {
var xtmp = new float[laneWidth];
var ytmp = new float[laneWidth];
var ztmp = new float[laneWidth];
for (var j = 0; j < reps; ++j) {
for (var i = 0; i < dp.Length; i += laneWidth) {
for (var k = 0; k < laneWidth; ++k) {
xtmp[k] = vs[i + k].X;
ytmp[k] = vs[i + k].Y;
ztmp[k] = vs[i + k].Z;