#include <immintrin.h>
#include <cstdint>
#include <cstdlib>
#include <vector>
#include <iostream>
#include <iomanip>
#include <cassert>
//
// Transpose an 8x8 matrix of 32-bit integers held in 8 AVX registers
//
static inline void _mm256_transpose8_epi32 (
    __m256i &r0, __m256i &r1, __m256i &r2, __m256i &r3,
    __m256i &r4, __m256i &r5, __m256i &r6, __m256i &r7
){
    __m256 u0, u1, u2, u3, u4, u5, u6, u7;
    __m256 s0, s1, s2, s3, s4, s5, s6, s7;
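    //
    // Stage 1: interleave 32-bit elements of adjacent register pairs
    // within each 128-bit lane
    //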
    u0 = _mm256_castsi256_ps(_mm256_unpacklo_epi32(r0, r1));
    u1 = _mm256_castsi256_ps(_mm256_unpackhi_epi32(r0, r1));
    u2 = _mm256_castsi256_ps(_mm256_unpacklo_epi32(r2, r3));
    u3 = _mm256_castsi256_ps(_mm256_unpackhi_epi32(r2, r3));
    u4 = _mm256_castsi256_ps(_mm256_unpacklo_epi32(r4, r5));
    u5 = _mm256_castsi256_ps(_mm256_unpackhi_epi32(r4, r5));
    u6 = _mm256_castsi256_ps(_mm256_unpacklo_epi32(r6, r7));
    u7 = _mm256_castsi256_ps(_mm256_unpackhi_epi32(r6, r7));
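    //
    // Stage 2: gather 64-bit pairs across register pairs, still within
    // 128-bit lanes
    //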
    s0 = _mm256_shuffle_ps(u0, u2, _MM_SHUFFLE(1,0,1,0));
    s1 = _mm256_shuffle_ps(u0, u2, _MM_SHUFFLE(3,2,3,2));
    s2 = _mm256_shuffle_ps(u1, u3, _MM_SHUFFLE(1,0,1,0));
    s3 = _mm256_shuffle_ps(u1, u3, _MM_SHUFFLE(3,2,3,2));
    s4 = _mm256_shuffle_ps(u4, u6, _MM_SHUFFLE(1,0,1,0));
    s5 = _mm256_shuffle_ps(u4, u6, _MM_SHUFFLE(3,2,3,2));
    s6 = _mm256_shuffle_ps(u5, u7, _MM_SHUFFLE(1,0,1,0));
    s7 = _mm256_shuffle_ps(u5, u7, _MM_SHUFFLE(3,2,3,2));
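    //
    // Stage 3: exchange the 128-bit lanes to complete the transpose
    //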
    r0 = _mm256_castps_si256(_mm256_permute2f128_ps(s0, s4, 0x20));
    r1 = _mm256_castps_si256(_mm256_permute2f128_ps(s1, s5, 0x20));
    r2 = _mm256_castps_si256(_mm256_permute2f128_ps(s2, s6, 0x20));
    r3 = _mm256_castps_si256(_mm256_permute2f128_ps(s3, s7, 0x20));
    r4 = _mm256_castps_si256(_mm256_permute2f128_ps(s0, s4, 0x31));
    r5 = _mm256_castps_si256(_mm256_permute2f128_ps(s1, s5, 0x31));
    r6 = _mm256_castps_si256(_mm256_permute2f128_ps(s2, s6, 0x31));
    r7 = _mm256_castps_si256(_mm256_permute2f128_ps(s3, s7, 0x31));
}
void pack_vector (uint64_t n, const int32_t * u, int8_t * r)
{
    //
    // Make sure that the vector size is a multiple of 64
    //
    assert(n % 64 == 0);
    const uint64_t blocks = n / 64;
    for (uint64_t b = 0; b < blocks; b += 1) {
        const uint64_t offset = b * 64;
        const int32_t * u1 = u + offset;
        //
        // Load 64 32-bit integers into 8 AVX registers
        //
        __m256i q_1 = _mm256_loadu_si256((__m256i *)(u1 + 0));
        __m256i q_2 = _mm256_loadu_si256((__m256i *)(u1 + 8));
        __m256i q_3 = _mm256_loadu_si256((__m256i *)(u1 + 16));
        __m256i q_4 = _mm256_loadu_si256((__m256i *)(u1 + 24));
        __m256i q_5 = _mm256_loadu_si256((__m256i *)(u1 + 32));
        __m256i q_6 = _mm256_loadu_si256((__m256i *)(u1 + 40));
        __m256i q_7 = _mm256_loadu_si256((__m256i *)(u1 + 48));
        __m256i q_8 = _mm256_loadu_si256((__m256i *)(u1 + 56));
        //
        // Transpose the 8x8 matrix of values
        //
        _mm256_transpose8_epi32(q_1, q_2, q_3, q_4, q_5, q_6, q_7, q_8);
        //
        // Shift left so that the low 4 bits of each value occupy the
        // topmost nibble
        //
        q_1 = _mm256_slli_epi32(q_1, 28);
        q_2 = _mm256_slli_epi32(q_2, 28);
        q_3 = _mm256_slli_epi32(q_3, 28);
        q_4 = _mm256_slli_epi32(q_4, 28);
        q_5 = _mm256_slli_epi32(q_5, 28);
        q_6 = _mm256_slli_epi32(q_6, 28);
        q_7 = _mm256_slli_epi32(q_7, 28);
        q_8 = _mm256_slli_epi32(q_8, 28);
        //
        // Shift right (zero-extend), so that the nibble from register
        // q_k lands at bit position 4 * (k - 1)
        //
        q_1 = _mm256_srli_epi32(q_1, 7 * 4);
        q_2 = _mm256_srli_epi32(q_2, 6 * 4);
        q_3 = _mm256_srli_epi32(q_3, 5 * 4);
        q_4 = _mm256_srli_epi32(q_4, 4 * 4);
        q_5 = _mm256_srli_epi32(q_5, 3 * 4);
        q_6 = _mm256_srli_epi32(q_6, 2 * 4);
        q_7 = _mm256_srli_epi32(q_7, 1 * 4);
        q_8 = _mm256_srli_epi32(q_8, 0 * 4);
        //
        // OR the nibbles together: each 32-bit lane now packs 8 values
        //
        const __m256i t1 = _mm256_or_si256(q_1, q_2);
        const __m256i t2 = _mm256_or_si256(q_3, q_4);
        const __m256i t3 = _mm256_or_si256(q_5, q_6);
        const __m256i t4 = _mm256_or_si256(q_7, q_8);
        const __m256i t5 = _mm256_or_si256(t1, t2);
        const __m256i t6 = _mm256_or_si256(t3, t4);
        const __m256i t7 = _mm256_or_si256(t5, t6);
        //
        // Store the result
        //
        _mm256_storeu_si256((__m256i *)(r + (offset >> 1)), t7);
    }
}
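//
// The packed layout produced above: byte j of r holds u[2*j] in its low
// nibble and u[2*j + 1] in its high nibble. Below is a scalar sketch of
// the same packing, for illustration only (pack_vector_scalar is a
// hypothetical helper, not called anywhere):
//
void pack_vector_scalar (uint64_t n, const int32_t * u, int8_t * r)
{
    assert(n % 2 == 0);
    for (uint64_t j = 0; j < n / 2; j += 1) {
        const int32_t lo = u[2 * j + 0] & 0xF;   // low nibble: even element
        const int32_t hi = u[2 * j + 1] & 0xF;   // high nibble: odd element
        r[j] = (int8_t)(lo | (hi << 4));
    }
}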
inline void restore_vector (uint64_t n, const int8_t * u, int32_t * r)
{
    //
    // Make sure that the vector size is a multiple of 64
    //
    assert(n % 64 == 0);
    const uint64_t blocks = n / 64;
    for (uint64_t b = 0; b < blocks; b += 1) {
        //
        // Calculate the offsets
        //
        const uint64_t offset0 = b * 64;
        const uint64_t offset1 = b * 32;
        //
        // Load 64 4-bit values
        //
        const __m256i qu_64 = _mm256_loadu_si256((__m256i *) (u + offset1));
        //
        // Shift left so that each of the 8 nibbles of a 32-bit lane, in
        // turn, ends up in the topmost 4 bits
        //
        const __m256i qu_1 = _mm256_slli_epi32(qu_64, 4 * 7);
        const __m256i qu_2 = _mm256_slli_epi32(qu_64, 4 * 6);
        const __m256i qu_3 = _mm256_slli_epi32(qu_64, 4 * 5);
        const __m256i qu_4 = _mm256_slli_epi32(qu_64, 4 * 4);
        const __m256i qu_5 = _mm256_slli_epi32(qu_64, 4 * 3);
        const __m256i qu_6 = _mm256_slli_epi32(qu_64, 4 * 2);
        const __m256i qu_7 = _mm256_slli_epi32(qu_64, 4 * 1);
        const __m256i qu_8 = _mm256_slli_epi32(qu_64, 4 * 0);
        //
        // Shift right (sign-extend) to obtain 8x8 32-bit values
        //
        __m256i q_1 = _mm256_srai_epi32(qu_1, 28);
        __m256i q_2 = _mm256_srai_epi32(qu_2, 28);
        __m256i q_3 = _mm256_srai_epi32(qu_3, 28);
        __m256i q_4 = _mm256_srai_epi32(qu_4, 28);
        __m256i q_5 = _mm256_srai_epi32(qu_5, 28);
        __m256i q_6 = _mm256_srai_epi32(qu_6, 28);
        __m256i q_7 = _mm256_srai_epi32(qu_7, 28);
        __m256i q_8 = _mm256_srai_epi32(qu_8, 28);
        //
        // Transpose the 8x8 values
        //
        _mm256_transpose8_epi32(q_1, q_2, q_3, q_4, q_5, q_6, q_7, q_8);
        //
        // Store the result in the output array
        //
        int32_t * u1 = r + offset0;
        _mm256_storeu_si256((__m256i *)(u1 + 0), q_1);
        _mm256_storeu_si256((__m256i *)(u1 + 8), q_2);
        _mm256_storeu_si256((__m256i *)(u1 + 16), q_3);
        _mm256_storeu_si256((__m256i *)(u1 + 24), q_4);
        _mm256_storeu_si256((__m256i *)(u1 + 32), q_5);
        _mm256_storeu_si256((__m256i *)(u1 + 40), q_6);
        _mm256_storeu_si256((__m256i *)(u1 + 48), q_7);
        _mm256_storeu_si256((__m256i *)(u1 + 56), q_8);
    }
}
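//
// Scalar equivalent of the unpacking above, for illustration only
// (restore_vector_scalar is a hypothetical helper, not called anywhere):
//
void restore_vector_scalar (uint64_t n, const int8_t * u, int32_t * r)
{
    assert(n % 2 == 0);
    for (uint64_t j = 0; j < n / 2; j += 1) {
        r[2 * j + 0] = ((u[j] & 0xF) ^ 8) - 8;  // sign-extend the low nibble
        r[2 * j + 1] = u[j] >> 4;               // arithmetic shift extracts the high nibble
    }
}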
//
// Generate a random value in the range (-max_range, max_range)
//
inline int32_t rnd_int32t(int32_t max_range)
{
    float rnd0 = rand() / static_cast <float> (RAND_MAX);
    float rnd1 = rand() / static_cast <float> (RAND_MAX);
    float rnd2 = rnd1 > 0.5f ? 1 : -1;
    float rnd3 = rnd0 * rnd2 * max_range;
    return (int32_t) rnd3;
}
void add_odd_even (uint64_t n, const int8_t * v, int8_t * r)
{
    //
    // Make sure that the vector size is a multiple of 128
    //
    assert(n % 128 == 0);
    const uint64_t blocks = n / 64;
    //
    // Define constants that will be used for masking operations
    //
    const __m256i hi_mask_08 = _mm256_set1_epi8(-16);   // 0xF0 in every byte
    const __m256i lo_mask_16 = _mm256_set1_epi16(0x0F);
    const __m256i hi_mask_16 = _mm256_set1_epi16(0xF0);
    for (uint64_t b = 0; b < blocks; b += 2) {
        //
        // Calculate the offsets
        //
        const uint64_t offset0 = b * 32;
        const uint64_t offset1 = b * 32 + 32;
        const uint64_t offset2 = b * 32 / 2;
        //
        // Load 128 values into two AVX registers. Each register holds
        // 64 x 4-bit values in the range [-8, 7].
        //
        const __m256i qv_1 = _mm256_loadu_si256((__m256i *) (v + offset0));
        const __m256i qv_2 = _mm256_loadu_si256((__m256i *) (v + offset1));
        //
        // Extract the odd and the even parts. The values are split into
        // registers qv_odd_shift and qv_evn_shift, each holding 32 x 8-bit
        // values, such that each value is multiplied by 2^4 and resides
        // in the range [-8 * 2^4, 7 * 2^4].
        //
        const __m256i qv_odd_dirty_1 = _mm256_slli_epi16(qv_1, 4);
        const __m256i qv_odd_shift_1 = _mm256_and_si256(hi_mask_08, qv_odd_dirty_1);
        const __m256i qv_evn_shift_1 = _mm256_and_si256(hi_mask_08, qv_1);
        const __m256i qv_odd_dirty_2 = _mm256_slli_epi16(qv_2, 4);
        const __m256i qv_odd_shift_2 = _mm256_and_si256(hi_mask_08, qv_odd_dirty_2);
        const __m256i qv_evn_shift_2 = _mm256_and_si256(hi_mask_08, qv_2);
        //
        // Perform addition. In case of overflow / underflow, behaviour
        // is undefined. Values are still in the range [-8 * 2^4, 7 * 2^4].
        //
        const __m256i qv_sum_shift_1 = _mm256_add_epi8(qv_odd_shift_1, qv_evn_shift_1);
        const __m256i qv_sum_shift_2 = _mm256_add_epi8(qv_odd_shift_2, qv_evn_shift_2);
        //
        // Divide by 2^4. At this point each register holds the 32 sums as
        // 4-bit quantities; the surrounding bits are cleaned up with the
        // masks below. Summation is complete.
        //
        const __m256i qv_sum_1 = _mm256_srai_epi16(qv_sum_shift_1, 4);
        const __m256i qv_sum_2 = _mm256_srai_epi16(qv_sum_shift_2, 4);
        //
        // Now we take the sums residing in the odd bytes and place them in
        // the high nibbles next to the sums from the even bytes. We do this
        // with logical right shifts (zero-filling) and 16-bit masks. The
        // result is two registers, qv_sum_16_1 and qv_sum_16_2, where each
        // 16-bit chunk holds two consecutive 4-bit sums in its low byte.
        //
        const __m256i qv_sum_1_lo = _mm256_and_si256(lo_mask_16, qv_sum_1);
        const __m256i qv_sum_1_hi_dirty = _mm256_srli_epi16(qv_sum_shift_1, 8);
        const __m256i qv_sum_1_hi = _mm256_and_si256(hi_mask_16, qv_sum_1_hi_dirty);
        const __m256i qv_sum_2_lo = _mm256_and_si256(lo_mask_16, qv_sum_2);
        const __m256i qv_sum_2_hi_dirty = _mm256_srli_epi16(qv_sum_shift_2, 8);
        const __m256i qv_sum_2_hi = _mm256_and_si256(hi_mask_16, qv_sum_2_hi_dirty);
        const __m256i qv_sum_16_1 = _mm256_or_si256(qv_sum_1_lo, qv_sum_1_hi);
        const __m256i qv_sum_16_2 = _mm256_or_si256(qv_sum_2_lo, qv_sum_2_hi);
        //
        // Pack the two registers of 32 x 4-bit values into a single one
        // holding 64 x 4-bit values. Use the unsigned version to avoid
        // saturation.
        //
        const __m256i qv_sum_pack = _mm256_packus_epi16(qv_sum_16_1, qv_sum_16_2);
        //
        // Interleave the 64-bit chunks (packus operates per 128-bit lane).
        //
        const __m256i qv_sum = _mm256_permute4x64_epi64(qv_sum_pack, 0xD8);
        //
        // Store the result
        //
        _mm256_storeu_si256((__m256i *)(r + offset2), qv_sum);
    }
}
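//
// Scalar equivalent of the kernel above, for illustration only
// (add_odd_even_scalar is a hypothetical helper; like the vector code, it
// assumes each pairwise sum stays within the 4-bit range [-8, 7]):
//
void add_odd_even_scalar (uint64_t n, const int8_t * v, int8_t * r)
{
    assert(n % 4 == 0);
    for (uint64_t j = 0; j < n / 4; j += 1) {
        // Each input byte holds one even / odd pair of 4-bit values.
        const int32_t s0 = (((v[2 * j + 0] & 0xF) ^ 8) - 8) + (v[2 * j + 0] >> 4);
        const int32_t s1 = (((v[2 * j + 1] & 0xF) ^ 8) - 8) + (v[2 * j + 1] >> 4);
        r[j] = (int8_t)((s0 & 0xF) | ((s1 & 0xF) << 4));
    }
}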
void validate (uint64_t n)
{
    assert(n % 128 == 0);
    //
    // Create memory space
    //
    std::vector<int32_t> vector_v_32(n);
    std::vector<int8_t>  vector_v_04(n / 2);
    std::vector<int8_t>  vector_r_04(n / 4);
    std::vector<int32_t> vector_r_32(n / 2);
    //
    // Populate some random values, but make sure that
    // overflows / underflows are avoided
    //
    for (uint64_t i = 0; i < n; i += 1) {
        vector_v_32[i] = rnd_int32t(4);
    }
    //
    // Pack into 4-bit format, add the odd / even pairs, then restore
    //
    pack_vector    (n,     vector_v_32.data(), vector_v_04.data());
    add_odd_even   (n,     vector_v_04.data(), vector_r_04.data());
    restore_vector (n / 2, vector_r_04.data(), vector_r_32.data());
    for (uint64_t i = 0; i < n / 2; i += 1) {
        int32_t a = vector_v_32[2 * i + 0];
        int32_t b = vector_v_32[2 * i + 1];
        if (a + b != vector_r_32[i]) {
            std::cout << "add_odd_even fails at position: " << i << std::endl;
            for (uint64_t j = 0; j < n / 2; j += 1) {
                std::cout << std::setw(4) << j << ": ";
                std::cout << std::setw(4) << vector_v_32[2 * j + 0] << " + ";
                std::cout << std::setw(4) << vector_v_32[2 * j + 1] << " = ";
                std::cout << std::setw(4) << vector_r_32[j] << std::endl;
            }
            exit(1);
        }
    }
}
int main()
{
    for (int i = 0; i < 1000; i += 1) {
        validate(1280);
    }
    std::cout << "Done!" << std::endl;
    return 0;
}
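//
// Build sketch (assuming GCC or Clang on an AVX2-capable machine; the
// file name is hypothetical):
//
//     g++ -O3 -mavx2 -std=c++11 pack_4bit.cpp -o pack_4bit
//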