Skip to content

Instantly share code, notes, and snippets.

@FrankNiemeyer
Created August 28, 2015 17:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save FrankNiemeyer/1e727dc50bfa15fde6b8 to your computer and use it in GitHub Desktop.
Save FrankNiemeyer/1e727dc50bfa15fde6b8 to your computer and use it in GitHub Desktop.
/// Computes, for each 3-component vector in `vs`, the dot product of the
/// vector with itself (x*x + y*y + z*z) and writes the result to `dp`.
///
/// The input is read as a flat float array in array-of-structs order
/// (x0 y0 z0 x1 y1 z1 ...). Eight vectors are processed per iteration: three
/// AVX2 gathers with a stride of 3 floats pull the x, y and z components into
/// separate registers, which are then combined with one multiply and two FMAs.
///
/// The whole pass is repeated `reps` times, and each pass handles
/// `vector_len / lane_width` groups; any remainder elements are not written.
///
/// NOTE(review): assumes Vec3f is exactly three contiguous floats and that
/// lane_width == 8 (the gather offsets are hard-coded for 8 lanes) — confirm
/// against their definitions. Also assumes `vs` holds at least
/// 3 * lane_width * (vector_len / lane_width) floats and `dp` at least
/// 8 * (vector_len / lane_width) floats; no bounds are checked here.
///
/// @param vs input vectors (read-only).
/// @param dp output buffer receiving one float per processed input vector.
void dot3_aos_vector_gather(const vector<Vec3f>& vs, vector<float>& dp) {
    // Per-lane float indices of the x components of 8 consecutive vectors;
    // y and z sit one and two floats further along.
    static const auto epi32_one = _mm256_set1_epi32(1);
    static const auto x_offsets = _mm256_setr_epi32(0, 3, 6, 9, 12, 15, 18, 21);
    static const auto y_offsets = _mm256_add_epi32(x_offsets, epi32_one);
    static const auto z_offsets = _mm256_add_epi32(y_offsets, epi32_one);

    // Number of floats written by one 256-bit store.
    constexpr auto floats_per_store = sizeof(__m256) / sizeof(float);

    for (auto j = 0; j < reps; ++j) {
        // Keep const-ness of the input: the data is only ever read.
        const auto pvs = reinterpret_cast<const float*>(vs.data());
        const auto pdp = dp.data();
        auto i = vector_len / lane_width;
        while (i--) {
            // xyz|xyz|xyz|xyz|xyz|xyz|xyz|xyz -> load 8 * 3 = 24 scattered floats
            const auto base = pvs + i * 3 * lane_width;
            const auto xs = _mm256_i32gather_ps(base, x_offsets, sizeof(float));
            const auto ys = _mm256_i32gather_ps(base, y_offsets, sizeof(float));
            const auto zs = _mm256_i32gather_ps(base, z_offsets, sizeof(float));
            const auto xx = _mm256_mul_ps(xs, xs);
            const auto yyxx = _mm256_fmadd_ps(ys, ys, xx);
            const auto zzyyxx = _mm256_fmadd_ps(zs, zs, yyxx);
            // Unaligned store: the output buffer is not known to be 32-byte
            // aligned, so an aligned __m256 store through a cast pointer
            // could fault. Same destination offset as before.
            _mm256_storeu_ps(pdp + i * floats_per_store, zzyyxx);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment