Created
August 28, 2015 17:07
-
-
Save FrankNiemeyer/1e727dc50bfa15fde6b8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void dot3_aos_vector_gather(const vector<Vec3f>& vs, vector<float>& dp) { | |
static const auto epi32_one = _mm256_set1_epi32(1); | |
static const auto x_offsets = _mm256_setr_epi32(0, 3, 6, 9, 12, 15, 18, 21); | |
static const auto y_offsets = _mm256_add_epi32(x_offsets, epi32_one); | |
static const auto z_offsets = _mm256_add_epi32(y_offsets, epi32_one); | |
for (auto j = 0; j < reps; ++j) { | |
const auto pvs = (float*)vs.data(); | |
auto pdp = (__m256*)dp.data(); | |
auto i = vector_len / lane_width; | |
while(i--) { | |
// xyz|xyz|xyz|xyz|xyz|xyz|xyz|xyz -> load 8 * 3 = 24 scattered floats | |
const auto base = pvs + i * 3 * lane_width; | |
const auto xs = _mm256_i32gather_ps(base, x_offsets, sizeof(float)); | |
const auto ys = _mm256_i32gather_ps(base, y_offsets, sizeof(float)); | |
const auto zs = _mm256_i32gather_ps(base, z_offsets, sizeof(float)); | |
const auto xx = _mm256_mul_ps(xs, xs); | |
const auto yyxx = _mm256_fmadd_ps(ys, ys, xx); | |
const auto zzyyxx = _mm256_fmadd_ps(zs, zs, yyxx); | |
pdp[i] = zzyyxx; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment