-
-
Save IJzerbaard/61d145c55143d56409a158350271e127 to your computer and use it in GitHub Desktop.
invert_permutation64
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__m512i weird_bit_permute(__m512i x) | |
{ | |
__m512i x1 = _mm512_permutexvar_epi8(_mm512_setr_epi8( | |
57, 49, 41, 33, 25, 17, 9, 1, | |
63, 55, 47, 39, 31, 23, 15, 7, | |
61, 53, 45, 37, 29, 21, 13, 5, | |
62, 54, 46, 38, 30, 22, 14, 6, | |
60, 52, 44, 36, 28, 20, 12, 4, | |
56, 48, 40, 32, 24, 16, 8, 0, | |
58, 50, 42, 34, 26, 18, 10, 2, | |
59, 51, 43, 35, 27, 19, 11, 3), x); | |
__m512i x2 = _mm512_gf2p8affine_epi64_epi8(_mm512_set1_epi64(0x0204080140201080), x1, 0); | |
__m512i x3 = _mm512_permutexvar_epi8(_mm512_setr_epi8( | |
44, 4, 52, 60, 36, 20, 28, 12, | |
47, 7, 55, 63, 39, 23, 31, 15, | |
46, 6, 54, 62, 38, 22, 30, 14, | |
45, 5, 53, 61, 37, 21, 29, 13, | |
41, 1, 49, 57, 33, 17, 25, 9, | |
42, 2, 50, 58, 34, 18, 26, 10, | |
43, 3, 51, 59, 35, 19, 27, 11, | |
40, 0, 48, 56, 32, 16, 24, 8), x2); | |
return x3; | |
} | |
void invert_permutation64(uint8_t* p, uint8_t* inv) | |
{ | |
__m512i p0 = _mm512_cvtepu8_epi64(_mm_loadl_epi64((__m128i*)(p + 0))); | |
__m512i p1 = _mm512_cvtepu8_epi64(_mm_loadl_epi64((__m128i*)(p + 8))); | |
__m512i p2 = _mm512_cvtepu8_epi64(_mm_loadl_epi64((__m128i*)(p + 16))); | |
__m512i p3 = _mm512_cvtepu8_epi64(_mm_loadl_epi64((__m128i*)(p + 24))); | |
__m512i p4 = _mm512_cvtepu8_epi64(_mm_loadl_epi64((__m128i*)(p + 32))); | |
__m512i p5 = _mm512_cvtepu8_epi64(_mm_loadl_epi64((__m128i*)(p + 40))); | |
__m512i p6 = _mm512_cvtepu8_epi64(_mm_loadl_epi64((__m128i*)(p + 48))); | |
__m512i p7 = _mm512_cvtepu8_epi64(_mm_loadl_epi64((__m128i*)(p + 56))); | |
// indexes to masks | |
__m512i m0 = _mm512_sllv_epi64(_mm512_set1_epi64(1), p0); | |
__m512i m1 = _mm512_sllv_epi64(_mm512_set1_epi64(1), p1); | |
__m512i m2 = _mm512_sllv_epi64(_mm512_set1_epi64(1), p2); | |
__m512i m3 = _mm512_sllv_epi64(_mm512_set1_epi64(1), p3); | |
__m512i m4 = _mm512_sllv_epi64(_mm512_set1_epi64(1), p4); | |
__m512i m5 = _mm512_sllv_epi64(_mm512_set1_epi64(1), p5); | |
__m512i m6 = _mm512_sllv_epi64(_mm512_set1_epi64(1), p6); | |
__m512i m7 = _mm512_sllv_epi64(_mm512_set1_epi64(1), p7); | |
// 64x64 transpose | |
// swap 32x32 quadrants | |
__m512i s0 = _mm512_mask_blend_epi32(0xAAAA, m0, _mm512_slli_epi64(m4, 32)); | |
__m512i s1 = _mm512_mask_blend_epi32(0xAAAA, m1, _mm512_slli_epi64(m5, 32)); | |
__m512i s2 = _mm512_mask_blend_epi32(0xAAAA, m2, _mm512_slli_epi64(m6, 32)); | |
__m512i s3 = _mm512_mask_blend_epi32(0xAAAA, m3, _mm512_slli_epi64(m7, 32)); | |
__m512i s4 = _mm512_mask_blend_epi32(0xAAAA, _mm512_srli_epi64(m0, 32), m4); | |
__m512i s5 = _mm512_mask_blend_epi32(0xAAAA, _mm512_srli_epi64(m1, 32), m5); | |
__m512i s6 = _mm512_mask_blend_epi32(0xAAAA, _mm512_srli_epi64(m2, 32), m6); | |
__m512i s7 = _mm512_mask_blend_epi32(0xAAAA, _mm512_srli_epi64(m3, 32), m7); | |
// swap 16x16 quadrants | |
__m512i t0 = _mm512_mask_blend_epi16(0xAAAAAAAA, s0, _mm512_slli_epi32(s2, 16)); | |
__m512i t1 = _mm512_mask_blend_epi16(0xAAAAAAAA, s1, _mm512_slli_epi32(s3, 16)); | |
__m512i t2 = _mm512_mask_blend_epi16(0xAAAAAAAA, _mm512_srli_epi32(s0, 16), s2); | |
__m512i t3 = _mm512_mask_blend_epi16(0xAAAAAAAA, _mm512_srli_epi32(s1, 16), s3); | |
__m512i t4 = _mm512_mask_blend_epi16(0xAAAAAAAA, s4, _mm512_slli_epi32(s6, 16)); | |
__m512i t5 = _mm512_mask_blend_epi16(0xAAAAAAAA, s5, _mm512_slli_epi32(s7, 16)); | |
__m512i t6 = _mm512_mask_blend_epi16(0xAAAAAAAA, _mm512_srli_epi32(s4, 16), s6); | |
__m512i t7 = _mm512_mask_blend_epi16(0xAAAAAAAA, _mm512_srli_epi32(s5, 16), s7); | |
// swap 8x8 quadrants | |
__m512i u0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, t0, _mm512_slli_epi16(t1, 8)); | |
__m512i u1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, _mm512_srli_epi16(t0, 8), t1); | |
__m512i u2 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, t2, _mm512_slli_epi16(t3, 8)); | |
__m512i u3 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, _mm512_srli_epi16(t2, 8), t3); | |
__m512i u4 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, t4, _mm512_slli_epi16(t5, 8)); | |
__m512i u5 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, _mm512_srli_epi16(t4, 8), t5); | |
__m512i u6 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, t6, _mm512_slli_epi16(t7, 8)); | |
__m512i u7 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, _mm512_srli_epi16(t6, 8), t7); | |
// do the rest of the permute | |
__m512i v0 = weird_bit_permute(u0); | |
__m512i v1 = weird_bit_permute(u1); | |
__m512i v2 = weird_bit_permute(u2); | |
__m512i v3 = weird_bit_permute(u3); | |
__m512i v4 = weird_bit_permute(u4); | |
__m512i v5 = weird_bit_permute(u5); | |
__m512i v6 = weird_bit_permute(u6); | |
__m512i v7 = weird_bit_permute(u7); | |
// masks to indexes | |
__m512i i0 = _mm512_lzcnt_epi64(v0); | |
__m512i i1 = _mm512_slli_epi64(_mm512_lzcnt_epi64(v1), 8); | |
__m512i i2 = _mm512_slli_epi64(_mm512_lzcnt_epi64(v2), 16); | |
__m512i i3 = _mm512_slli_epi64(_mm512_lzcnt_epi64(v3), 24); | |
__m512i i4 = _mm512_slli_epi64(_mm512_lzcnt_epi64(v4), 32); | |
__m512i i5 = _mm512_slli_epi64(_mm512_lzcnt_epi64(v5), 40); | |
__m512i i6 = _mm512_slli_epi64(_mm512_lzcnt_epi64(v6), 48); | |
__m512i i7 = _mm512_slli_epi64(_mm512_lzcnt_epi64(v7), 56); | |
// pack | |
__m512i pck012 = _mm512_ternarylogic_epi64(i0, i1, i2, 0xFE); | |
__m512i pck345 = _mm512_ternarylogic_epi64(i3, i4, i5, 0xFE); | |
__m512i pck67 = _mm512_or_epi64(i6, i7); | |
__m512i pck = _mm512_ternarylogic_epi64(pck012, pck345, pck67, 0xFE); | |
pck = _mm512_permutexvar_epi8(_mm512_setr_epi8( | |
0, 8, 16, 24, 32, 40, 48, 56, | |
1, 9, 17, 25, 33, 41, 49, 57, | |
2, 10, 18, 26, 34, 42, 50, 58, | |
3, 11, 19, 27, 35, 43, 51, 59, | |
4, 12, 20, 28, 36, 44, 52, 60, | |
5, 13, 21, 29, 37, 45, 53, 61, | |
6, 14, 22, 30, 38, 46, 54, 62, | |
7, 15, 23, 31, 39, 47, 55, 63), pck); | |
_mm512_storeu_epi8(inv, _mm512_xor_epi64(pck, _mm512_set1_epi8(63))); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment