Skip to content

Instantly share code, notes, and snippets.

View FrankNiemeyer's full-sized avatar

Frank Niemeyer FrankNiemeyer

View GitHub Profile
; Loop-body excerpt from a disassembly listing. Each pass handles one 32-byte
; chunk (8 packed single-precision floats): three inputs are squared and summed.
; Move the running address in rax back by 20h (32) bytes, one ymm register.
lea rax,[rax-20h]
; Unaligned 256-bit loads of the three inputs. rax is used directly as an
; address and is also added to rbx and rdi to reach the other two inputs.
vmovups ymm0,ymmword ptr [rbx+rax]
vmovups ymm2,ymmword ptr [rax]
vmovups ymm3,ymmword ptr [rdi+rax]
; ymm0 = ymm0 * ymm0
vmulps ymm0,ymm0,ymm0
; ymm0 = ymm2 * ymm2 + ymm0 (fused multiply-add)
vfmadd231ps ymm0,ymm2,ymm2
; ymm0 = ymm3 * ymm3 + ymm0 (fused multiply-add)
vfmadd231ps ymm0,ymm3,ymm3
; Unaligned 256-bit store of the sum of squares to [rsi+rax].
vmovups ymmword ptr [rsi+rax],ymm0
; Decrement the 32-bit counter in r11d and jump back while it is non-zero.
sub r11d,1
jne dot3_soa_vectorized+52h (07FF7DF9913D2h)
// Hand-vectorized variant built on FMA intrinsics: for every group of
// lane_width floats it stores x*x + y*y + z*z into dp, and repeats the whole
// pass `reps` times. reps, vector_len and lane_width are declared outside this
// excerpt.
// NOTE(review): the excerpt stops after the while body; the closing braces of
// the for loop and of the function are not shown here.
void dot3_soa_vectorized_fma(const vector<float>& xs, const vector<float>& ys, const vector<float>& zs, vector<float>& dp) {
for (auto j = 0; j < reps; ++j) {
// View the float storage as packed __m256 values (8 floats each). The C-style
// casts also strip the const qualifier that data() carries for the const inputs.
// NOTE(review): accessing memory through __m256* presumes 32-byte-aligned
// storage; presumably `vector` is std::vector with its default allocator, which
// does not promise that alignment - confirm how the callers allocate.
const auto px = (__m256*)xs.data();
const auto py = (__m256*)ys.data();
const auto pz = (__m256*)zs.data();
auto pd = (__m256*)dp.data();
// Number of whole vectors to process. Presumably an integer division, in which
// case a remainder of vector_len % lane_width elements is never written - TODO confirm.
auto i = vector_len / lane_width;
// Visit the vectors from the highest index down to index 0.
while (i--) {
// Innermost term is x*x; y*y is fused onto it, then z*z onto that result.
pd[i] = _mm256_fmadd_ps(pz[i], pz[i], _mm256_fmadd_ps(py[i], py[i], _mm256_mul_ps(px[i], px[i])));
}
; Loop-body excerpt from a disassembly listing. Each pass squares three packed
; single-precision inputs (8 floats each) and sums them with separate
; multiplies and adds; the loop counter and branch are not part of this excerpt.
; Move the running address in rax back by 20h (32) bytes, one ymm register.
lea rax,[rax-20h]
; Unaligned 256-bit loads of the three inputs. rax is used directly as an
; address and is also added to rbx and rdi to reach the other two inputs.
vmovups ymm0,ymmword ptr [rbx+rax]
vmovups ymm2,ymmword ptr [rax]
vmovups ymm3,ymmword ptr [rdi+rax]
; ymm1 = ymm0 * ymm0 (square of the first input)
vmulps ymm1,ymm0,ymm0
; ymm0 = ymm2 * ymm2 (square of the second input)
vmulps ymm0,ymm2,ymm2
; ymm2 = ymm1 + ymm0 (sum of the first two squares)
vaddps ymm2,ymm1,ymm0
; ymm1 = ymm3 * ymm3 (square of the third input)
vmulps ymm1,ymm3,ymm3
; ymm2 = ymm2 + ymm1 (sum of all three squares)
vaddps ymm2,ymm2,ymm1
; Unaligned 256-bit store of the result to [rsi+rax].
vmovups ymmword ptr [rsi+rax],ymm2
// Hand-vectorized variant using separate AVX multiplies and adds: for every
// group of lane_width floats it stores x*x + y*y + z*z into dp, and repeats
// the whole pass `reps` times. reps, vector_len and lane_width are declared
// outside this excerpt.
// NOTE(review): the excerpt stops after the while body; the closing braces of
// the for loop and of the function are not shown here.
void dot3_soa_vectorized(const vector<float>& xs, const vector<float>& ys, const vector<float>& zs, vector<float>& dp) {
for (auto j = 0; j < reps; ++j) {
// View the float storage as packed __m256 values (8 floats each). The C-style
// casts also strip the const qualifier that data() carries for the const inputs.
// NOTE(review): accessing memory through __m256* presumes 32-byte-aligned
// storage; presumably `vector` is std::vector with its default allocator, which
// does not promise that alignment - confirm how the callers allocate.
const auto px = (__m256*)xs.data();
const auto py = (__m256*)ys.data();
const auto pz = (__m256*)zs.data();
auto pd = (__m256*)dp.data();
// Number of whole vectors to process. Presumably an integer division, in which
// case a remainder of vector_len % lane_width elements is never written - TODO confirm.
auto i = vector_len / lane_width;
// Visit the vectors from the highest index down to index 0.
while (i--) {
// Evaluated as (x*x + y*y) + z*z, each product from its own multiply.
pd[i] = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(px[i], px[i]), _mm256_mul_ps(py[i], py[i])), _mm256_mul_ps(pz[i], pz[i]));
}
; Loop-body excerpt from a disassembly listing. Each pass squares three packed
; single-precision inputs (8 floats each) and sums them with one multiply and
; two fused multiply-adds, walking forward through memory. The branch that
; closes the loop is not part of this excerpt.
; rax = rcx + rbx, computed before rcx is advanced; used for the store below.
lea rax,[rcx+rbx]
; Unaligned 256-bit loads of the three inputs. rcx is used directly as an
; address and is also added to r10 and r11 to reach the other two inputs.
vmovups ymm2,ymmword ptr [r10+rcx]
vmovups ymm3,ymmword ptr [rcx]
vmovups ymm0,ymmword ptr [r11+rcx]
; Advance rcx by 20h (32) bytes, one ymm register, for the next pass.
lea rcx,[rcx+20h]
; ymm1 = ymm2 * ymm2
vmulps ymm1,ymm2,ymm2
; ymm1 = ymm3 * ymm3 + ymm1 (fused multiply-add)
vfmadd231ps ymm1,ymm3,ymm3
; ymm1 = ymm0 * ymm0 + ymm1 (fused multiply-add)
vfmadd231ps ymm1,ymm0,ymm0
; Unaligned 256-bit store of the sum of squares to [rax+r8].
vmovups ymmword ptr [rax+r8],ymm1
; Decrement the 64-bit counter in r9; the flags feed a branch outside this excerpt.
sub r9,1
void dot3_soa_autovec(const vector<float>& xs, const vector<float>& ys, const vector<float>& zs, vector<float>& dp) {
for (auto j = 0; j < reps; ++j) {
for (auto i = 0; i < vector_len; ++i) {
dp[i] = xs[i] * xs[i] + ys[i] * ys[i] + zs[i] * zs[i];
}
}
}
; Loop-body excerpt from a disassembly listing. Each pass handles a single
; float: three scalar inputs are squared and summed. The loop counter and
; branch are not part of this excerpt.
; Move the running address in rax back by 4 bytes, one float.
lea rax,[rax-4]
; Scalar single-precision loads of the three inputs. rax is used directly as an
; address and is also added to rdx and r8 to reach the other two inputs.
vmovss xmm0,dword ptr [rax]
vmovss xmm2,dword ptr [rdx+rax]
vmovss xmm3,dword ptr [r8+rax]
; xmm1 = xmm0 * xmm0 (square of the first input)
vmulss xmm1,xmm0,xmm0
; xmm0 = xmm2 * xmm2 (square of the second input)
vmulss xmm0,xmm2,xmm2
; xmm2 = xmm1 + xmm0 (sum of the first two squares)
vaddss xmm2,xmm1,xmm0
; xmm1 = xmm3 * xmm3 (square of the third input)
vmulss xmm1,xmm3,xmm3
; xmm2 = xmm2 + xmm1 (sum of all three squares)
vaddss xmm2,xmm2,xmm1
; Scalar store of the result to [r9+rax].
vmovss dword ptr [r9+rax],xmm2
void dot3_soa_scalar(const vector<float>& xs, const vector<float>& ys, const vector<float>& zs, vector<float>& dp) {
for (auto j = 0; j < reps; ++j) {
auto i = vector_len;
while(i--) {
dp[i] = xs[i] * xs[i] + ys[i] * ys[i] + zs[i] * zs[i];
}
}
}
; Excerpt from a disassembly listing showing the start of a gather-based loop
; body: two complete 8-lane float gathers and the setup for a third. The rest
; of the loop is not part of this excerpt.
; Move the gather base in rax back by 60h (96) bytes - presumably 8 elements of
; 3 floats each; confirm against the element type.
lea rax,[rax-60h]
; Move the second running address in r8 back by 20h (32) bytes, one ymm register.
lea r8,[r8-20h]
; Comparing a register with itself sets every bit: all 8 gather lanes enabled.
; vgatherdps clears its mask register as it completes, so the mask is rebuilt
; before every gather.
vpcmpeqb ymm2,ymm2,ymm2
; Seed the gather destination with the contents of ymm0.
vmovups ymm5,ymm0
; ymm5[k] = float at rax + 4 * ymm6[k], for each of the 8 dword indices in ymm6.
vgatherdps ymm5,dword ptr [rax+ymm6*4],ymm2
; Rebuild the all-ones mask for the next gather.
vpcmpeqb ymm2,ymm2,ymm2
; Seed the second gather destination with the contents of ymm0.
vmovups ymm4,ymm0
; ymm4[k] = float at rax + 4 * ymm7[k], for each of the 8 dword indices in ymm7.
vgatherdps ymm4,dword ptr [rax+ymm7*4],ymm2
; Rebuild the all-ones mask and seed ymm1 from ymm0; the instruction that
; consumes them lies beyond this excerpt.
vpcmpeqb ymm2,ymm2,ymm2
vmovups ymm1,ymm0
void dot3_aos_vector_gather(const vector<Vec3f>& vs, vector<float>& dp) {
static const auto epi32_one = _mm256_set1_epi32(1);
static const auto x_offsets = _mm256_setr_epi32(0, 3, 6, 9, 12, 15, 18, 21);
static const auto y_offsets = _mm256_add_epi32(x_offsets, epi32_one);
static const auto z_offsets = _mm256_add_epi32(y_offsets, epi32_one);
for (auto j = 0; j < reps; ++j) {
const auto pvs = (float*)vs.data();
auto pdp = (__m256*)dp.data();
auto i = vector_len / lane_width;