Last active
July 18, 2018 15:30
-
-
Save astojanov/7e70e2faa746a9cb6802f4d642faafd0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <immintrin.h>

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <vector>
// | |
// Transpose 8x8 registers | |
// | |
//
// Transpose an 8x8 matrix of 32-bit elements held in eight AVX registers.
// r0..r7 are the rows on input and the columns on output (in-place).
//
static inline void _mm256_transpose8_epi32 (
    __m256i &r0, __m256i &r1, __m256i &r2, __m256i &r3,
    __m256i &r4, __m256i &r5, __m256i &r6, __m256i &r7
){
    __m256 u0, u1, u2, u3, u4, u5, u6, u7;
    __m256 s0, s1, s2, s3, s4, s5, s6, s7;
    //
    // Stage 1: interleave 32-bit elements of adjacent row pairs.
    // _mm256_castsi256_ps / _mm256_castps_si256 are zero-cost
    // reinterpretations (no instruction emitted); the original C-style
    // (__m256)/( __m256i) casts are a GNU vector extension and fail to
    // compile on MSVC.
    //
    u0 = _mm256_castsi256_ps(_mm256_unpacklo_epi32(r0, r1));
    u1 = _mm256_castsi256_ps(_mm256_unpackhi_epi32(r0, r1));
    u2 = _mm256_castsi256_ps(_mm256_unpacklo_epi32(r2, r3));
    u3 = _mm256_castsi256_ps(_mm256_unpackhi_epi32(r2, r3));
    u4 = _mm256_castsi256_ps(_mm256_unpacklo_epi32(r4, r5));
    u5 = _mm256_castsi256_ps(_mm256_unpackhi_epi32(r4, r5));
    u6 = _mm256_castsi256_ps(_mm256_unpacklo_epi32(r6, r7));
    u7 = _mm256_castsi256_ps(_mm256_unpackhi_epi32(r6, r7));
    //
    // Stage 2: gather 64-bit pairs across the unpacked registers so each
    // 128-bit lane holds four elements of one output column.
    //
    s0 = _mm256_shuffle_ps(u0, u2, _MM_SHUFFLE(1,0,1,0));
    s1 = _mm256_shuffle_ps(u0, u2, _MM_SHUFFLE(3,2,3,2));
    s2 = _mm256_shuffle_ps(u1, u3, _MM_SHUFFLE(1,0,1,0));
    s3 = _mm256_shuffle_ps(u1, u3, _MM_SHUFFLE(3,2,3,2));
    s4 = _mm256_shuffle_ps(u4, u6, _MM_SHUFFLE(1,0,1,0));
    s5 = _mm256_shuffle_ps(u4, u6, _MM_SHUFFLE(3,2,3,2));
    s6 = _mm256_shuffle_ps(u5, u7, _MM_SHUFFLE(1,0,1,0));
    s7 = _mm256_shuffle_ps(u5, u7, _MM_SHUFFLE(3,2,3,2));
    //
    // Stage 3: exchange 128-bit halves between register pairs.
    // 0x20 selects (low(a), low(b)); 0x31 selects (high(a), high(b)).
    //
    r0 = _mm256_castps_si256(_mm256_permute2f128_ps(s0, s4, 0x20));
    r1 = _mm256_castps_si256(_mm256_permute2f128_ps(s1, s5, 0x20));
    r2 = _mm256_castps_si256(_mm256_permute2f128_ps(s2, s6, 0x20));
    r3 = _mm256_castps_si256(_mm256_permute2f128_ps(s3, s7, 0x20));
    r4 = _mm256_castps_si256(_mm256_permute2f128_ps(s0, s4, 0x31));
    r5 = _mm256_castps_si256(_mm256_permute2f128_ps(s1, s5, 0x31));
    r6 = _mm256_castps_si256(_mm256_permute2f128_ps(s2, s6, 0x31));
    r7 = _mm256_castps_si256(_mm256_permute2f128_ps(s3, s7, 0x31));
}
//
// Packs n 32-bit integers from u into n/2 bytes at r, two 4-bit values per
// byte. Each input is truncated to its low 4 bits (the left shift by 28
// discards the upper 28 bits), so values are assumed to fit in a signed
// nibble, i.e. in [-8, 7]. n must be a multiple of 64.
//
void pack_vector (uint64_t n, const int32_t * u, int8_t * r)
{
    //
    // Make sure that the vector size that is a multiple of 64
    //
    const uint64_t blocks = n / 64;
    assert(n % 64 == 0);
    for (uint64_t b = 0; b < blocks; b += 1) {
        const uint64_t offset = b * 64;
        const int32_t * u1 = u + offset;
        //
        // Get 64 values of 32-bit integers into 8 AVX registers
        //
        __m256i q_1 = _mm256_loadu_si256((__m256i *)(u1 + 0));
        __m256i q_2 = _mm256_loadu_si256((__m256i *)(u1 + 8));
        __m256i q_3 = _mm256_loadu_si256((__m256i *)(u1 + 16));
        __m256i q_4 = _mm256_loadu_si256((__m256i *)(u1 + 24));
        __m256i q_5 = _mm256_loadu_si256((__m256i *)(u1 + 32));
        __m256i q_6 = _mm256_loadu_si256((__m256i *)(u1 + 40));
        __m256i q_7 = _mm256_loadu_si256((__m256i *)(u1 + 48));
        __m256i q_8 = _mm256_loadu_si256((__m256i *)(u1 + 56));
        //
        // Transpose the 8x8 registers, so that after the transpose the
        // i-th register holds element i of each original 8-element group.
        //
        _mm256_transpose8_epi32(q_1, q_2, q_3, q_4, q_5, q_6, q_7, q_8);
        //
        // Shift values left by 28 bits: this moves the low nibble of each
        // 32-bit value into the top 4 bits, discarding everything else.
        //
        q_1 = _mm256_slli_epi32(q_1, 28);
        q_2 = _mm256_slli_epi32(q_2, 28);
        q_3 = _mm256_slli_epi32(q_3, 28);
        q_4 = _mm256_slli_epi32(q_4, 28);
        q_5 = _mm256_slli_epi32(q_5, 28);
        q_6 = _mm256_slli_epi32(q_6, 28);
        q_7 = _mm256_slli_epi32(q_7, 28);
        q_8 = _mm256_slli_epi32(q_8, 28);
        //
        // Shift values right (zero-extend): register k's nibble lands at
        // bit position 4*(k-1) of each 32-bit lane, so the 8 registers
        // occupy 8 disjoint nibble slots and can be OR-ed together.
        //
        q_1 = _mm256_srli_epi32(q_1, 7 * 4);
        q_2 = _mm256_srli_epi32(q_2, 6 * 4);
        q_3 = _mm256_srli_epi32(q_3, 5 * 4);
        q_4 = _mm256_srli_epi32(q_4, 4 * 4);
        q_5 = _mm256_srli_epi32(q_5, 3 * 4);
        q_6 = _mm256_srli_epi32(q_6, 2 * 4);
        q_7 = _mm256_srli_epi32(q_7, 1 * 4);
        q_8 = _mm256_srli_epi32(q_8, 0 * 4);
        //
        // Pack together: OR-reduce the 8 registers into one register
        // holding all 64 nibbles of this block.
        //
        const __m256i t1 = _mm256_or_si256(q_1, q_2);
        const __m256i t2 = _mm256_or_si256(q_3, q_4);
        const __m256i t3 = _mm256_or_si256(q_5, q_6);
        const __m256i t4 = _mm256_or_si256(q_7, q_8);
        const __m256i t5 = _mm256_or_si256(t1, t2);
        const __m256i t6 = _mm256_or_si256(t3, t4);
        const __m256i t7 = _mm256_or_si256(t5, t6);
        //
        // Store the result; output advances at half the input byte rate
        // (offset >> 1 elements of int8_t per 64 int32_t inputs... note
        // 64 inputs -> 32 output bytes).
        //
        _mm256_storeu_si256((__m256i *)(r + (offset >> 1)), t7);
    }
}
//
// Unpacks n 4-bit signed values (two per byte in u, n/2 bytes total) into
// n sign-extended 32-bit integers in r. Inverse of pack_vector.
// n must be a multiple of 64.
//
inline void restore_vector(uint64_t n, int8_t * u, int32_t * r)
{
    //
    // Make sure that the vector size that is a multiple of 64
    //
    assert(n % 64 == 0);
    const uint64_t blocks = n / 64;
    for (uint64_t b = 0; b < blocks; b += 1) {
        //
        // Calculate the offsets: 64 output int32 values per block,
        // consuming 32 packed input bytes.
        //
        const uint64_t offset0 = b * 64;
        const uint64_t offset1 = b * 32;
        //
        // Load 64 4-bit values
        //
        const __m256i qu_64 = _mm256_loadu_si256((__m256i *) (u + offset1));
        //
        // Shift values left so that nibble slot k of each 32-bit lane
        // (the slot pack_vector assigned to register k) ends up in the
        // top 4 bits of the lane.
        //
        const __m256i qu_1 = _mm256_slli_epi32(qu_64, 4 * 7);
        const __m256i qu_2 = _mm256_slli_epi32(qu_64, 4 * 6);
        const __m256i qu_3 = _mm256_slli_epi32(qu_64, 4 * 5);
        const __m256i qu_4 = _mm256_slli_epi32(qu_64, 4 * 4);
        const __m256i qu_5 = _mm256_slli_epi32(qu_64, 4 * 3);
        const __m256i qu_6 = _mm256_slli_epi32(qu_64, 4 * 2);
        const __m256i qu_7 = _mm256_slli_epi32(qu_64, 4 * 1);
        const __m256i qu_8 = _mm256_slli_epi32(qu_64, 4 * 0);
        //
        // Shift values right arithmetically (sign-extend) by 28 bits,
        // turning each top nibble into a full signed 32-bit value;
        // yields 8x8 32-bit values.
        //
        __m256i q_1 = _mm256_srai_epi32(qu_1, 28);
        __m256i q_2 = _mm256_srai_epi32(qu_2, 28);
        __m256i q_3 = _mm256_srai_epi32(qu_3, 28);
        __m256i q_4 = _mm256_srai_epi32(qu_4, 28);
        __m256i q_5 = _mm256_srai_epi32(qu_5, 28);
        __m256i q_6 = _mm256_srai_epi32(qu_6, 28);
        __m256i q_7 = _mm256_srai_epi32(qu_7, 28);
        __m256i q_8 = _mm256_srai_epi32(qu_8, 28);
        //
        // Transpose the 8x8 values back to the original element order
        // (undoes the transpose done by pack_vector).
        //
        _mm256_transpose8_epi32(q_1, q_2, q_3, q_4, q_5, q_6, q_7, q_8);
        //
        // Store the result in the output array
        //
        int32_t * u1 = r + offset0;
        _mm256_storeu_si256((__m256i *)(u1 + 0), q_1);
        _mm256_storeu_si256((__m256i *)(u1 + 8), q_2);
        _mm256_storeu_si256((__m256i *)(u1 + 16), q_3);
        _mm256_storeu_si256((__m256i *)(u1 + 24), q_4);
        _mm256_storeu_si256((__m256i *)(u1 + 32), q_5);
        _mm256_storeu_si256((__m256i *)(u1 + 40), q_6);
        _mm256_storeu_si256((__m256i *)(u1 + 48), q_7);
        _mm256_storeu_si256((__m256i *)(u1 + 56), q_8);
    }
}
//
// Returns a pseudo-random int32_t in [-max_range, max_range], truncated
// toward zero. Consumes exactly two rand() draws: the first gives the
// magnitude in [0, 1], the second flips the sign with probability ~0.5.
//
inline int32_t rnd_int32t(int32_t max_range)
{
    const float magnitude = rand() / static_cast<float>(RAND_MAX);
    const float coin      = rand() / static_cast<float>(RAND_MAX);
    float sign;
    if (coin > 0.5f) {
        sign = 1;
    } else {
        sign = -1;
    }
    const float scaled = magnitude * sign * max_range;
    return (int32_t) scaled;
}
//
// Adds each pair of adjacent 4-bit values in v (n nibbles, two per byte)
// and writes the n/2 4-bit sums, packed two per byte, to r. Overflow
// outside [-8, 7] is undefined. n must be a multiple of 128.
//
void add_odd_even(uint64_t n, int8_t * v, int8_t * r)
{
    //
    // Make sure that the vector size that is a multiple of 128
    //
    assert(n % 128 == 0);
    const uint64_t blocks = n / 64;
    //
    // Define constants that will be used for masking operations:
    // hi_mask_08 keeps the high nibble of every byte; lo_mask_16 /
    // hi_mask_16 keep the low / high nibble of every 16-bit lane's
    // low byte.
    //
    const __m256i hi_mask_08 = _mm256_set1_epi8(-16);
    const __m256i lo_mask_16 = _mm256_set1_epi16(0x0F);
    const __m256i hi_mask_16 = _mm256_set1_epi16(0xF0);
    for (uint64_t b = 0; b < blocks; b += 2) {
        //
        // Calculate the offsets: two 32-byte input registers are
        // consumed and one 32-byte output register produced per step.
        //
        const uint64_t offset0 = b * 32;
        const uint64_t offset1 = b * 32 + 32;
        const uint64_t offset2 = b * 32 / 2;
        //
        // Load 128 values in two AVX registers. Each register will
        // contain 64 x 4-bit values in the range [-8, 7].
        //
        const __m256i qv_1 = _mm256_loadu_si256((__m256i *) (v + offset0));
        const __m256i qv_2 = _mm256_loadu_si256((__m256i *) (v + offset1));
        //
        // Extract the odd and the even parts. The values will be split in
        // two registers qv_odd_shift and qv_evn_shift, each of them having
        // 32 x 8-bit values, such that each value is multiplied by 2^4
        // and resides in the range [-8 * 2^4, 7 * 2^4]
        //
        const __m256i qv_odd_dirty_1 = _mm256_slli_epi16(qv_1, 4);
        const __m256i qv_odd_shift_1 = _mm256_and_si256(hi_mask_08, qv_odd_dirty_1);
        const __m256i qv_evn_shift_1 = _mm256_and_si256(hi_mask_08, qv_1);
        const __m256i qv_odd_dirty_2 = _mm256_slli_epi16(qv_2, 4);
        const __m256i qv_odd_shift_2 = _mm256_and_si256(hi_mask_08, qv_odd_dirty_2);
        const __m256i qv_evn_shift_2 = _mm256_and_si256(hi_mask_08, qv_2);
        //
        // Perform addition. In case of overflows / underflows, behaviour
        // is undefined. Values are still in the range [-8 * 2^4, 7 * 2^4].
        //
        const __m256i qv_sum_shift_1 = _mm256_add_epi8(qv_odd_shift_1, qv_evn_shift_1);
        const __m256i qv_sum_shift_2 = _mm256_add_epi8(qv_odd_shift_2, qv_evn_shift_2);
        //
        // Divide by 2^4 (arithmetic right shift preserves the sign). At
        // this point in time, each of the two AVX registers holds
        // 32 x 8-bit values that are in the range of [-8, 7]. Summation
        // is complete.
        //
        const __m256i qv_sum_1 = _mm256_srai_epi16(qv_sum_shift_1, 4);
        const __m256i qv_sum_2 = _mm256_srai_epi16(qv_sum_shift_2, 4);
        //
        // Now, we want to take the even numbers of the 32 x 4-bit register, and
        // store them in the high-bits of the odd numbers. We do this with
        // left shifts that extend in zero, and 16-bit masks. This operation
        // results in two registers qv_sum_lo and qv_sum_hi that hold 32
        // values. However, each consecutive 4-bit values reside in the
        // low-bits of a 16-bit chunk.
        //
        const __m256i qv_sum_1_lo = _mm256_and_si256(lo_mask_16, qv_sum_1);
        const __m256i qv_sum_1_hi_dirty = _mm256_srli_epi16(qv_sum_shift_1, 8);
        const __m256i qv_sum_1_hi = _mm256_and_si256(hi_mask_16, qv_sum_1_hi_dirty);
        const __m256i qv_sum_2_lo = _mm256_and_si256(lo_mask_16, qv_sum_2);
        const __m256i qv_sum_2_hi_dirty = _mm256_srli_epi16(qv_sum_shift_2, 8);
        const __m256i qv_sum_2_hi = _mm256_and_si256(hi_mask_16, qv_sum_2_hi_dirty);
        const __m256i qv_sum_16_1 = _mm256_or_si256(qv_sum_1_lo, qv_sum_1_hi);
        const __m256i qv_sum_16_2 = _mm256_or_si256(qv_sum_2_lo, qv_sum_2_hi);
        //
        // Pack the two registers of 32 x 4-bit values, into a single one having
        // 64 x 4-bit values. Use the unsigned version, to avoid saturation.
        //
        const __m256i qv_sum_pack = _mm256_packus_epi16(qv_sum_16_1, qv_sum_16_2);
        //
        // Interleave the 64-bit chunks (packus operates per 128-bit lane,
        // so the lane order must be fixed up with a permute).
        //
        const __m256i qv_sum = _mm256_permute4x64_epi64(qv_sum_pack, 0xD8);
        //
        // Store the result
        //
        _mm256_storeu_si256((__m256i *)(r + offset2), qv_sum);
    }
}
void validate (uint64_t n) | |
{ | |
assert(n % 128 == 0); | |
// | |
// Create memory space | |
// | |
int32_t vector_v_32[n]; | |
int8_t vector_v_04[n / 2]; | |
int8_t vector_r_04[n / 4]; | |
int32_t vector_r_32[n / 2]; | |
// | |
// Populate some random values, but make sure that | |
// overflows / underflows are avoided | |
// | |
for (uint64_t i = 0; i < n; i += 1) { | |
vector_v_32[i] = rnd_int32t(4); | |
} | |
// | |
// Pack the vector in 4-bit format | |
// | |
pack_vector (n , vector_v_32, vector_v_04); | |
add_odd_even (n , vector_v_04, vector_r_04); | |
restore_vector (n/2, vector_r_04, vector_r_32); | |
for (uint64_t i = 0; i < n/2; i += 1) { | |
int32_t a = vector_v_32[2 * i + 0]; | |
int32_t b = vector_v_32[2 * i + 1]; | |
if (a + b != vector_r_32[i]) { | |
std::cout << "add_odd_even fails at position: " << i << std::endl; | |
for (uint64_t j = 0; j < n/2; j += 1) { | |
std::cout << std::setw(4) << j << ": "; | |
std::cout << std::setw(4) << vector_v_32[2 * j + 0] << " + "; | |
std::cout << std::setw(4) << vector_v_32[2 * j + 1] << " = "; | |
std::cout << std::setw(4) << vector_r_32[j] << std::endl; | |
} | |
exit(1); | |
} | |
} | |
} | |
int main() | |
{ | |
for (int i = 0; i < 1000; i += 1) { | |
validate(1280); | |
} | |
std::cout << "Done!" << std::endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment