Skip to content

Instantly share code, notes, and snippets.

@Rod-Persky
Created October 6, 2014 11:05
Show Gist options
  • Save Rod-Persky/5019f95630e0fede6629 to your computer and use it in GitHub Desktop.
Save Rod-Persky/5019f95630e0fede6629 to your computer and use it in GitHub Desktop.
Pure C vector reduce
/*
 * Reduce (sum) the first `size` elements of `values` into `value`, starting
 * from `inital_value`.
 *
 * The element count is split by chunk width: as many 8-element chunks as
 * fit (AVX), then at most one 4-element chunk (SSE3), at most one pair, and
 * at most one trailing element.
 *
 * Lane values are read back with SSE2 intrinsics instead of the
 * `.m128d_f64` union member, which only exists in MSVC's definition of
 * `__m128d`; this keeps the fragment buildable with GCC and Clang as well.
 */
int step_8 = size / 8; int step_8_r = size % 8;
int step_4 = step_8_r / 4; int step_4_r = step_8_r % 4;
int step_2 = step_4_r / 2; int step_2_r = step_4_r % 2;
double value = inital_value;

/* Full chunks of 8 elements: two 256-bit vectors per iteration. */
for (int step_n = 0; step_n < step_8; step_n++) {
    int offset = 8 * step_n;
    __m256d a = _mm256_set_pd(values[offset], values[offset + 1], values[offset + 2], values[offset + 3]);
    __m256d b = _mm256_set_pd(values[offset + 4], values[offset + 5], values[offset + 6], values[offset + 7]);

    /*
     * hadd produces (a0+a1, b0+b1, a2+a3, b2+b3); adding the upper 128 bits
     * onto the lower 128 bits leaves (sum of a, sum of b) in two lanes.
     */
    __m256d sum = _mm256_hadd_pd(a, b);
    __m128d sum_high = _mm256_extractf128_pd(sum, 1);
    __m128d result = _mm_add_pd(sum_high, _mm256_castpd256_pd128(sum));

    /* Lane 0 + lane 1, in the same order as before, then accumulate. */
    value += _mm_cvtsd_f64(result) + _mm_cvtsd_f64(_mm_unpackhi_pd(result, result));
}

/* At most one chunk of 4 elements (step_4 is 0 or 1 for a non-negative size). */
for (int step_n = 0; step_n < step_4; step_n++) {
    int offset = 8 * step_8 + 4 * step_n;
    __m128d a = _mm_set_pd(values[offset], values[offset + 1]);
    __m128d b = _mm_set_pd(values[offset + 2], values[offset + 3]);

    /* hadd produces (a0+a1, b0+b1): one partial sum per lane. */
    __m128d sum = _mm_hadd_pd(a, b);
    value += _mm_cvtsd_f64(sum) + _mm_cvtsd_f64(_mm_unpackhi_pd(sum, sum));
}

/*
 * Scalar tail.  `/` and `%` truncate toward zero, so a negative `size`
 * gives negative quotients/remainders here; testing `> 0` (rather than
 * `!= 0`) keeps such a size from reading `values` at all.
 */
if (step_2 > 0) {
    int offset = 8 * step_8 + 4 * step_4;
    value += values[offset] + values[offset + 1];
}
if (step_2_r > 0) {
    value += values[8 * step_8 + 4 * step_4 + 2 * step_2];
}
@Rod-Persky
Copy link
Author

The step_8 loop requires AVX (the `_mm256_*` intrinsics);
the step_4 loop requires SSE3 (`_mm_hadd_pd`).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment