Skip to content

Instantly share code, notes, and snippets.

@dlandahl
Last active October 12, 2021 11:06
Show Gist options
  • Save dlandahl/21df941f2ba3d4fe9f3bf94ab4da17d6 to your computer and use it in GitHub Desktop.
Save dlandahl/21df941f2ba3d4fe9f3bf94ab4da17d6 to your computer and use it in GitHub Desktop.
Fast lookup table based oscillator using AVX2 Gather
#if AVX2_ENABLED
/*
 * Read eight linearly interpolated samples from sin_table in one call.
 *
 * phase: eight normalized table positions. Each lane is multiplied by
 *        TABLE_SIZE to obtain a fractional table index. The caller visible
 *        in this file wraps its phases before calling, so lanes are expected
 *        to lie in [0, 1); negative or larger values are not handled here.
 *
 * Returns, per lane, sin_table[a] * (1 - t) + sin_table[b] * t, where a is
 * the floor of the scaled phase, t is its fractional part, and b is the entry
 * after a, wrapping back to entry 0 at the end of the table.
 */
__m256 read_vector(__m256 phase) {
    const __m256 one = _mm256_set1_ps(1);

    // Scale the normalized phase to a fractional index into the table.
    const __m256 position = _mm256_mul_ps(phase, _mm256_set1_ps(TABLE_SIZE));
    const __m256 base = _mm256_floor_ps(position);

    // Interpolation weight: distance from the lower table entry.
    const __m256 t = _mm256_sub_ps(position, base);

    const __m256i a = _mm256_cvtps_epi32(base);

    // The upper neighbour is simply a + 1. For a phase in the last table cell
    // that would be index TABLE_SIZE, one past a TABLE_SIZE-entry table, so
    // lanes that ran off the end are wrapped to index 0 (the table is treated
    // as one period of a periodic waveform).
    // NOTE(review): sin_table's length is not visible here; if it already
    // carries a guard entry equal to entry 0, this wrap is merely redundant.
    __m256i b = _mm256_add_epi32(a, _mm256_set1_epi32(1));
    const __m256i past_end =
        _mm256_cmpgt_epi32(b, _mm256_set1_epi32((int)(TABLE_SIZE) - 1));
    b = _mm256_andnot_si256(past_end, b); // (~past_end) & b: zero wrapped lanes

    // Gather scale is 4 bytes per index, i.e. one float per table entry.
    const __m256 lower = _mm256_i32gather_ps(sin_table, a, 4);
    const __m256 upper = _mm256_i32gather_ps(sin_table, b, 4);

    // Linear blend of the two neighbours.
    const __m256 upper_part = _mm256_mul_ps(upper, t);
    const __m256 lower_part = _mm256_mul_ps(lower, _mm256_sub_ps(one, t));
    return _mm256_add_ps(lower_part, upper_part);
}
/*
 * Fill data[0..count) with a sum of harmonics read from the lookup table,
 * processing eight samples per iteration.
 *
 * The buffer is cleared first, then harmonics p = 1..10 of a base frequency
 * of 50 / SAMPLE_RATE cycles per sample are accumulated into it, harmonic p
 * being scaled by 1 / p. (Presumably SAMPLE_RATE is in Hz, making the
 * fundamental 50 Hz; confirm against its definition.)
 *
 * data:  output buffer of at least count floats. No particular alignment is
 *        required; unaligned loads/stores are used.
 * count: number of samples to generate; must be non-negative and a multiple
 *        of 8 (checked with assert). If asserts are compiled out and count is
 *        not a multiple of 8, the trailing count % 8 samples are left at zero
 *        rather than writing past the end of the buffer.
 */
void vector256_table_additive_synthesis(f32* data, int count) {
    // Check the preconditions before touching the buffer.
    assert(count >= 0);
    assert(count % 8 == 0);

    memset(data, 0, count * sizeof(f32));

    const f64 fundamental = 50.0;  // numerator of the base frequency
    const int harmonic_count = 10; // harmonics 1..10 are summed
    const f64 mul = fundamental / SAMPLE_RATE; // cycles advanced per sample

    // Per-lane sample offset within one group of eight samples.
    const __m256 indices = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
    const __m256 max = _mm256_set1_ps(1);

    for (int p = 1; p <= harmonic_count; p++) {
        const f64 step_size = p * mul; // phase increment per sample for harmonic p
        const __m256 amplitude = _mm256_set1_ps((f32)(1.0 / p));
        // Phase offset of each lane relative to the first sample of the group.
        const __m256 offset = _mm256_mul_ps(_mm256_set1_ps((f32)step_size), indices);

        // The running phase is kept in double precision; only the per-group
        // vector is narrowed to float.
        f64 phase = 0;

        // "n <= count - 8" guarantees a full group of eight fits in the buffer.
        for (int n = 0; n <= count - 8; n += 8) {
            __m256 vphase = _mm256_add_ps(_mm256_set1_ps((f32)phase), offset);

            // Subtract 1 from every lane that is not less than 1. A single
            // subtraction is enough as long as phase + 7 * step_size < 2.
            const __m256 mask = _mm256_cmp_ps(vphase, max, _CMP_NLT_UQ);
            vphase = _mm256_sub_ps(vphase, _mm256_and_ps(max, mask));

            __m256 sine = read_vector(vphase);
            sine = _mm256_mul_ps(sine, amplitude);

            // Accumulate this harmonic on top of what is already in the buffer.
            const __m256 vdata = _mm256_loadu_ps(data + n);
            _mm256_storeu_ps(data + n, _mm256_add_ps(sine, vdata));

            // Advance by eight samples and wrap, using the same ">= 1" rule
            // as the vector wrap above.
            phase += step_size * 8;
            if (phase >= 1.0) {
                phase -= 1.0;
            }
        }
    }
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment