invert_permutation64
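// The idea: each index p[j] becomes the one-hot mask 1 << p[j], i.e. row j of
// the 64x64 permutation matrix; that bit matrix is transposed, and leading-zero
// counts turn the transposed rows back into indices. The transpose of a
// permutation matrix is its inverse, hence the result. For reference, a minimal
// scalar sketch of the same result follows (this helper and the compile-flag
// note are additions for illustration, not part of the original gist).
//
// The AVX-512 code needs AVX-512F/BW/CD/VBMI and GFNI, e.g. gcc/clang with
// -mavx512f -mavx512bw -mavx512cd -mavx512vbmi -mgfni, or -march=icelake-client.
#include <stdint.h>
#include <immintrin.h>

// Scalar reference: the inverse satisfies inv[p[i]] == i for every i,
// assuming p holds each of 0..63 exactly once.
static void invert_permutation64_scalar(const uint8_t* p, uint8_t* inv)
{
    for (int i = 0; i < 64; i++)
        inv[p[i]] = (uint8_t)i;
}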
// weird_bit_permute: applies a fixed bit permutation to the 512-bit input,
// supplying the bit-granular part of the 64x64 transpose. The GF2P8AFFINE step
// is fed one-hot bytes as its vector operand, so each output byte gathers one
// bit column of the 8x8 bit matrix held in the corresponding data qword; the
// byte shuffles before and after move whole bytes into place.
__m512i weird_bit_permute(__m512i x)
{
    __m512i x1 = _mm512_permutexvar_epi8(_mm512_setr_epi8(
        57, 49, 41, 33, 25, 17, 9, 1,
        63, 55, 47, 39, 31, 23, 15, 7,
        61, 53, 45, 37, 29, 21, 13, 5,
        62, 54, 46, 38, 30, 22, 14, 6,
        60, 52, 44, 36, 28, 20, 12, 4,
        56, 48, 40, 32, 24, 16, 8, 0,
        58, 50, 42, 34, 26, 18, 10, 2,
        59, 51, 43, 35, 27, 19, 11, 3), x);
    // each byte of the constant has exactly one bit set, so the affine
    // transform selects bit columns of the data qword rather than mixing bits
    __m512i x2 = _mm512_gf2p8affine_epi64_epi8(_mm512_set1_epi64(0x0204080140201080), x1, 0);
    __m512i x3 = _mm512_permutexvar_epi8(_mm512_setr_epi8(
        44, 4, 52, 60, 36, 20, 28, 12,
        47, 7, 55, 63, 39, 23, 31, 15,
        46, 6, 54, 62, 38, 22, 30, 14,
        45, 5, 53, 61, 37, 21, 29, 13,
        41, 1, 49, 57, 33, 17, 25, 9,
        42, 2, 50, 58, 34, 18, 26, 10,
        43, 3, 51, 59, 35, 19, 27, 11,
        40, 0, 48, 56, 32, 16, 24, 8), x2);
    return x3;
}
// Given a permutation p of 0..63 (one index per byte), write its inverse to
// inv: inv[p[i]] == i for every i.
void invert_permutation64(uint8_t* p, uint8_t* inv)
{
    // load the 64 indices, widened to one index per 64-bit lane
    __m512i p0 = _mm512_cvtepu8_epi64(_mm_loadl_epi64((__m128i*)(p + 0)));
    __m512i p1 = _mm512_cvtepu8_epi64(_mm_loadl_epi64((__m128i*)(p + 8)));
    __m512i p2 = _mm512_cvtepu8_epi64(_mm_loadl_epi64((__m128i*)(p + 16)));
    __m512i p3 = _mm512_cvtepu8_epi64(_mm_loadl_epi64((__m128i*)(p + 24)));
    __m512i p4 = _mm512_cvtepu8_epi64(_mm_loadl_epi64((__m128i*)(p + 32)));
    __m512i p5 = _mm512_cvtepu8_epi64(_mm_loadl_epi64((__m128i*)(p + 40)));
    __m512i p6 = _mm512_cvtepu8_epi64(_mm_loadl_epi64((__m128i*)(p + 48)));
    __m512i p7 = _mm512_cvtepu8_epi64(_mm_loadl_epi64((__m128i*)(p + 56)));
    // indexes to masks: lane j becomes 1 << p[j], a row of the 64x64
    // permutation matrix
    __m512i m0 = _mm512_sllv_epi64(_mm512_set1_epi64(1), p0);
    __m512i m1 = _mm512_sllv_epi64(_mm512_set1_epi64(1), p1);
    __m512i m2 = _mm512_sllv_epi64(_mm512_set1_epi64(1), p2);
    __m512i m3 = _mm512_sllv_epi64(_mm512_set1_epi64(1), p3);
    __m512i m4 = _mm512_sllv_epi64(_mm512_set1_epi64(1), p4);
    __m512i m5 = _mm512_sllv_epi64(_mm512_set1_epi64(1), p5);
    __m512i m6 = _mm512_sllv_epi64(_mm512_set1_epi64(1), p6);
    __m512i m7 = _mm512_sllv_epi64(_mm512_set1_epi64(1), p7);
    // 64x64 transpose
    // swap 32x32 quadrants
    __m512i s0 = _mm512_mask_blend_epi32(0xAAAA, m0, _mm512_slli_epi64(m4, 32));
    __m512i s1 = _mm512_mask_blend_epi32(0xAAAA, m1, _mm512_slli_epi64(m5, 32));
    __m512i s2 = _mm512_mask_blend_epi32(0xAAAA, m2, _mm512_slli_epi64(m6, 32));
    __m512i s3 = _mm512_mask_blend_epi32(0xAAAA, m3, _mm512_slli_epi64(m7, 32));
    __m512i s4 = _mm512_mask_blend_epi32(0xAAAA, _mm512_srli_epi64(m0, 32), m4);
    __m512i s5 = _mm512_mask_blend_epi32(0xAAAA, _mm512_srli_epi64(m1, 32), m5);
    __m512i s6 = _mm512_mask_blend_epi32(0xAAAA, _mm512_srli_epi64(m2, 32), m6);
    __m512i s7 = _mm512_mask_blend_epi32(0xAAAA, _mm512_srli_epi64(m3, 32), m7);
    // swap 16x16 quadrants
    __m512i t0 = _mm512_mask_blend_epi16(0xAAAAAAAA, s0, _mm512_slli_epi32(s2, 16));
    __m512i t1 = _mm512_mask_blend_epi16(0xAAAAAAAA, s1, _mm512_slli_epi32(s3, 16));
    __m512i t2 = _mm512_mask_blend_epi16(0xAAAAAAAA, _mm512_srli_epi32(s0, 16), s2);
    __m512i t3 = _mm512_mask_blend_epi16(0xAAAAAAAA, _mm512_srli_epi32(s1, 16), s3);
    __m512i t4 = _mm512_mask_blend_epi16(0xAAAAAAAA, s4, _mm512_slli_epi32(s6, 16));
    __m512i t5 = _mm512_mask_blend_epi16(0xAAAAAAAA, s5, _mm512_slli_epi32(s7, 16));
    __m512i t6 = _mm512_mask_blend_epi16(0xAAAAAAAA, _mm512_srli_epi32(s4, 16), s6);
    __m512i t7 = _mm512_mask_blend_epi16(0xAAAAAAAA, _mm512_srli_epi32(s5, 16), s7);
    // swap 8x8 quadrants
    __m512i u0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, t0, _mm512_slli_epi16(t1, 8));
    __m512i u1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, _mm512_srli_epi16(t0, 8), t1);
    __m512i u2 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, t2, _mm512_slli_epi16(t3, 8));
    __m512i u3 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, _mm512_srli_epi16(t2, 8), t3);
    __m512i u4 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, t4, _mm512_slli_epi16(t5, 8));
    __m512i u5 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, _mm512_srli_epi16(t4, 8), t5);
    __m512i u6 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, t6, _mm512_slli_epi16(t7, 8));
    __m512i u7 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, _mm512_srli_epi16(t6, 8), t7);
    // do the rest of the permute
    __m512i v0 = weird_bit_permute(u0);
    __m512i v1 = weird_bit_permute(u1);
    __m512i v2 = weird_bit_permute(u2);
    __m512i v3 = weird_bit_permute(u3);
    __m512i v4 = weird_bit_permute(u4);
    __m512i v5 = weird_bit_permute(u5);
    __m512i v6 = weird_bit_permute(u6);
    __m512i v7 = weird_bit_permute(u7);
    // masks to indexes: for a valid permutation each lane now holds exactly one
    // set bit, and lzcnt recovers its position as 63 - index (fixed up by the
    // final XOR with 63); each result is pre-shifted into its own byte position
    // for the OR-pack below
    __m512i i0 = _mm512_lzcnt_epi64(v0);
    __m512i i1 = _mm512_slli_epi64(_mm512_lzcnt_epi64(v1), 8);
    __m512i i2 = _mm512_slli_epi64(_mm512_lzcnt_epi64(v2), 16);
    __m512i i3 = _mm512_slli_epi64(_mm512_lzcnt_epi64(v3), 24);
    __m512i i4 = _mm512_slli_epi64(_mm512_lzcnt_epi64(v4), 32);
    __m512i i5 = _mm512_slli_epi64(_mm512_lzcnt_epi64(v5), 40);
    __m512i i6 = _mm512_slli_epi64(_mm512_lzcnt_epi64(v6), 48);
    __m512i i7 = _mm512_slli_epi64(_mm512_lzcnt_epi64(v7), 56);
    // pack: ternarylogic 0xFE is a three-input OR, merging the byte-aligned
    // lzcnt results into a single vector
    __m512i pck012 = _mm512_ternarylogic_epi64(i0, i1, i2, 0xFE);
    __m512i pck345 = _mm512_ternarylogic_epi64(i3, i4, i5, 0xFE);
    __m512i pck67 = _mm512_or_epi64(i6, i7);
    __m512i pck = _mm512_ternarylogic_epi64(pck012, pck345, pck67, 0xFE);
    // gather the result bytes into their final order
    pck = _mm512_permutexvar_epi8(_mm512_setr_epi8(
        0, 8, 16, 24, 32, 40, 48, 56,
        1, 9, 17, 25, 33, 41, 49, 57,
        2, 10, 18, 26, 34, 42, 50, 58,
        3, 11, 19, 27, 35, 43, 51, 59,
        4, 12, 20, 28, 36, 44, 52, 60,
        5, 13, 21, 29, 37, 45, 53, 61,
        6, 14, 22, 30, 38, 46, 54, 62,
        7, 15, 23, 31, 39, 47, 55, 63), pck);
    // XOR with 63 converts each leading-zero count into the bit index
    // (63 - n == 63 ^ n for 0 <= n <= 63)
    _mm512_storeu_epi8(inv, _mm512_xor_epi64(pck, _mm512_set1_epi8(63)));
}
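// A quick way to exercise this (the test harness below is an addition, not part
// of the original gist): fill p with some permutation of 0..63, invert it, and
// check that inv[p[i]] == i for every i.
#include <stdio.h>

int main(void)
{
    uint8_t p[64], inv[64];
    for (int i = 0; i < 64; i++)
        p[i] = (uint8_t)((i * 13 + 7) & 63);   // 13 is odd, so this is a permutation of 0..63
    invert_permutation64(p, inv);
    for (int i = 0; i < 64; i++)
    {
        if (inv[p[i]] != i)
        {
            printf("mismatch at %d\n", i);
            return 1;
        }
    }
    printf("ok\n");
    return 0;
}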