Skip to content

Instantly share code, notes, and snippets.

@degski
Last active October 14, 2023 22:57
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save degski/b5fbac1ec6c8200d1d8ad102f89df89f to your computer and use it in GitHub Desktop.
Save degski/b5fbac1ec6c8200d1d8ad102f89df89f to your computer and use it in GitHub Desktop.
lane-crossing shift and rotate instructions in AVX2
// MIT License
//
// Copyright (c) 2018 degski
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "shift_rotate_avx2.hpp"
#ifdef __AVX2__
// Shifts the 256-bit value in `a` left by n bits; the dispatcher in this file
// calls it with n in [0, 63]. Lane 0 is the least significant 64-bit lane.
// Every lane is shifted left by n, and the bits pushed out of the top of
// lane k are OR-ed into the bottom of lane k + 1. For n == 0 the carry shift
// count is 64, which the intrinsic turns into an all-zero carry.
// The trailing number on the signature is the original author's annotation.
__m256i left_shift_000_063 ( __m256i a, int n ) { // 6
    const __m256i zero = _mm256_setzero_si256 ( );
    // High bits leaving each lane, moved down to the low end of that lane.
    const __m256i spill = _mm256_srli_epi64 ( a, 64 - n );
    // Move each spill one lane up (lane 0 temporarily keeps a stale copy).
    const __m256i moved = _mm256_permute4x64_epi64 ( spill, _MM_SHUFFLE ( 2, 1, 0, 0 ) );
    // Lane 0 has no lower neighbour: take it from the zero vector instead.
    const __m256i carry = _mm256_blend_epi32 ( zero, moved, _MM_SHUFFLE ( 3, 3, 3, 0 ) );
    return _mm256_or_si256 ( _mm256_slli_epi64 ( a, n ), carry );
}
// Shifts the 256-bit value in `a` left by 64 + n bits, where n is the count
// modulo 64 handed over by _mm256_sli_si256. Lane 0 is the least significant
// 64-bit lane. Result lanes, low to high:
//   0, a0 << n, ( a1 << n ) | ( a0 >> ( 64 - n ) ), ( a2 << n ) | ( a1 >> ( 64 - n ) )
__m256i left_shift_064_127 ( __m256i a, int n ) { // 7
    const __m256i zero = _mm256_setzero_si256 ( );

    // In-lane shifted values, moved one lane up; lane 0 is cleared.
    const __m256i shifted = _mm256_slli_epi64 ( a, n );
    const __m256i shifted_up = _mm256_permute4x64_epi64 ( shifted, _MM_SHUFFLE ( 2, 1, 0, 0 ) );
    const __m256i body = _mm256_blend_epi32 ( zero, shifted_up, _MM_SHUFFLE ( 3, 3, 3, 0 ) );

    // Bits spilled out of each lane, moved two lanes up; lanes 0 and 1 are cleared.
    const __m256i spill = _mm256_srli_epi64 ( a, 64 - n );
    const __m256i spill_up = _mm256_permute4x64_epi64 ( spill, _MM_SHUFFLE ( 1, 0, 0, 0 ) );
    const __m256i carry = _mm256_blend_epi32 ( zero, spill_up, _MM_SHUFFLE ( 3, 3, 0, 0 ) );

    return _mm256_or_si256 ( body, carry );
}
// Shifts the 256-bit value in `a` left by 128 + n bits, where n is the count
// modulo 64 handed over by _mm256_sli_si256. Lane 0 is the least significant
// 64-bit lane. Result lanes, low to high:
//   0, 0, a0 << n, ( a1 << n ) | ( a0 >> ( 64 - n ) )
// For n == 0 the carry shift count is 64, which the intrinsic turns into zero.
__m256i left_shift_128_191 ( __m256i a, int n ) { // 7
    const __m256i zero = _mm256_setzero_si256 ( );

    // In-lane shifted values, moved two lanes up; lanes 0 and 1 are cleared.
    __m256i b = _mm256_slli_epi64 ( a, n );
    __m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 1, 0, 0, 0 ) );
    __m256i f = _mm256_blend_epi32 ( zero, d, _MM_SHUFFLE ( 3, 3, 0, 0 ) );

    // Bits spilled out of the top of each lane. Only the spill of lane 0 is
    // still inside the result and it belongs in lane 3, so lane 0 is
    // broadcast. (The previous selector put the spill of lane 1 there, which
    // corrupted the low n bits of lane 3.)
    __m256i c = _mm256_srli_epi64 ( a, 64 - n );
    __m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 0, 0, 0, 0 ) );
    __m256i g = _mm256_blend_epi32 ( zero, e, _MM_SHUFFLE ( 3, 0, 0, 0 ) );

    return _mm256_or_si256 ( f, g );
}
// Shifts the 256-bit value in `a` left by 192 + n bits, where n is the count
// modulo 64 handed over by _mm256_sli_si256. Only lane 0 (the least
// significant 64-bit lane) contributes to the result: it is shifted left by n
// and stored in lane 3, while lanes 0..2 are zero.
__m256i left_shift_192_255 ( __m256i a, int n ) { // 5
    // Copy lane 0 into every lane so that it is present in lane 3.
    const __m256i lowest = _mm256_permute4x64_epi64 ( a, _MM_SHUFFLE ( 0, 0, 0, 0 ) );
    const __m256i shifted = _mm256_slli_epi64 ( lowest, n );
    // Keep lane 3 only; everything below comes from the zero vector.
    return _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), shifted, _MM_SHUFFLE ( 3, 0, 0, 0 ) );
}
// Logical left shift of the whole 256-bit value `a` by n bits, shifting in
// zeros. Counts 0..255 are dispatched to the helper that handles the matching
// 64-bit sub-range; that helper receives the count modulo 64.
// Counts outside 0..255 move every bit out of the register, so the result is
// all zeros (previously n >= 256 fell through to the 192..255 helper and
// returned a non-zero partial shift).
__m256i _mm256_sli_si256 ( __m256i a, int n ) {
    if ( n < 0 || n > 255 ) return _mm256_setzero_si256 ( );
    switch ( n >> 6 ) { // which 64-bit sub-range the count falls into
        case 0: return left_shift_000_063 ( a, n );
        case 1: return left_shift_064_127 ( a, n & 63 );
        case 2: return left_shift_128_191 ( a, n & 63 );
        default: return left_shift_192_255 ( a, n & 63 );
    }
}
// Logical right shift of the 256-bit value in `a` by n bits; the dispatcher
// in this file calls it with n in [0, 63]. Lane 0 is the least significant
// 64-bit lane. Every lane is shifted right by n, and the bits pushed out of
// the bottom of lane k + 1 are OR-ed into the top of lane k. For n == 0 the
// carry shift count is 64, which the intrinsic turns into an all-zero carry.
__m256i right_shift_000_063 ( __m256i a, int n ) { // 6
    const __m256i zero = _mm256_setzero_si256 ( );
    // Low bits leaving each lane, moved up to the high end of that lane.
    const __m256i spill = _mm256_slli_epi64 ( a, 64 - n );
    // Move each spill one lane down (lane 3 temporarily receives lane 0's spill).
    const __m256i moved = _mm256_permute4x64_epi64 ( spill, _MM_SHUFFLE ( 0, 3, 2, 1 ) );
    // Lane 3 has no upper neighbour: take it from the zero vector instead.
    const __m256i carry = _mm256_blend_epi32 ( zero, moved, _MM_SHUFFLE ( 0, 3, 3, 3 ) );
    return _mm256_or_si256 ( _mm256_srli_epi64 ( a, n ), carry );
}
// Logical right shift of the 256-bit value in `a` by 64 + n bits, where n is
// the count modulo 64 handed over by _mm256_sri_si256. Lane 0 is the least
// significant 64-bit lane. Result lanes, low to high:
//   ( a1 >> n ) | ( a2 << ( 64 - n ) ), ( a2 >> n ) | ( a3 << ( 64 - n ) ), a3 >> n, 0
__m256i right_shift_064_127 ( __m256i a, int n ) { // 7
    const __m256i zero = _mm256_setzero_si256 ( );

    // In-lane shifted values, moved one lane down; lane 3 is cleared.
    const __m256i shifted = _mm256_srli_epi64 ( a, n );
    const __m256i shifted_down = _mm256_permute4x64_epi64 ( shifted, _MM_SHUFFLE ( 3, 3, 2, 1 ) );
    const __m256i body = _mm256_blend_epi32 ( zero, shifted_down, _MM_SHUFFLE ( 0, 3, 3, 3 ) );

    // Bits spilled out of each lane, moved two lanes down; lanes 2 and 3 are cleared.
    const __m256i spill = _mm256_slli_epi64 ( a, 64 - n );
    const __m256i spill_down = _mm256_permute4x64_epi64 ( spill, _MM_SHUFFLE ( 3, 3, 3, 2 ) );
    const __m256i carry = _mm256_blend_epi32 ( zero, spill_down, _MM_SHUFFLE ( 0, 0, 3, 3 ) );

    return _mm256_or_si256 ( body, carry );
}
// Logical right shift of the 256-bit value in `a` by 128 + n bits, where n is
// the count modulo 64 handed over by _mm256_sri_si256. Lane 0 is the least
// significant 64-bit lane. Result lanes, low to high:
//   ( a2 >> n ) | ( a3 << ( 64 - n ) ), a3 >> n, 0, 0
__m256i right_shift_128_191 ( __m256i a, int n ) { // 7
    const __m256i zero = _mm256_setzero_si256 ( );

    // In-lane shifted values, moved two lanes down; lanes 2 and 3 are cleared.
    const __m256i shifted = _mm256_srli_epi64 ( a, n );
    const __m256i shifted_down = _mm256_permute4x64_epi64 ( shifted, _MM_SHUFFLE ( 3, 2, 3, 2 ) );
    const __m256i body = _mm256_blend_epi32 ( zero, shifted_down, _MM_SHUFFLE ( 0, 0, 3, 3 ) );

    // Only the spill of lane 3 is still inside the result; it goes to lane 0.
    const __m256i spill = _mm256_slli_epi64 ( a, 64 - n );
    const __m256i spill_down = _mm256_permute4x64_epi64 ( spill, _MM_SHUFFLE ( 3, 2, 1, 3 ) );
    const __m256i carry = _mm256_blend_epi32 ( zero, spill_down, _MM_SHUFFLE ( 0, 0, 0, 3 ) );

    return _mm256_or_si256 ( body, carry );
}
// Logical right shift of the 256-bit value in `a` by 192 + n bits, where n is
// the count modulo 64 handed over by _mm256_sri_si256. Only lane 3 (the most
// significant 64-bit lane) contributes to the result: it is shifted right by
// n and stored in lane 0, while lanes 1..3 are zero.
__m256i right_shift_192_255 ( __m256i a, int n ) { // 5
    // Bring lane 3 down into lane 0 (the other lanes are discarded below).
    const __m256i highest = _mm256_permute4x64_epi64 ( a, _MM_SHUFFLE ( 0, 0, 0, 3 ) );
    const __m256i shifted = _mm256_srli_epi64 ( highest, n );
    // Keep lane 0 only; everything above comes from the zero vector.
    return _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), shifted, _MM_SHUFFLE ( 0, 0, 0, 3 ) );
}
// Logical right shift of the whole 256-bit value `a` by n bits, shifting in
// zeros. Counts 0..255 are dispatched to the helper that handles the matching
// 64-bit sub-range; that helper receives the count modulo 64.
// Counts outside 0..255 move every bit out of the register, so the result is
// all zeros (previously n >= 256 fell through to the 192..255 helper and
// returned a non-zero partial shift).
__m256i _mm256_sri_si256 ( __m256i a, int n ) {
    if ( n < 0 || n > 255 ) return _mm256_setzero_si256 ( );
    switch ( n >> 6 ) { // which 64-bit sub-range the count falls into
        case 0: return right_shift_000_063 ( a, n );
        case 1: return right_shift_064_127 ( a, n & 63 );
        case 2: return right_shift_128_191 ( a, n & 63 );
        default: return right_shift_192_255 ( a, n & 63 );
    }
}
// Rotates the 256-bit value in `a` left by n bits; the dispatcher in this
// file calls it with n in [0, 63]. Lane 0 is the least significant 64-bit
// lane. Every lane is shifted left by n and receives, in its low bits, the
// bits pushed out of the top of the lane below it (lane 0 takes lane 3's).
__m256i left_rotate_000_063 ( __m256i a, int n ) { // 5
    const __m256i shifted = _mm256_slli_epi64 ( a, n );
    // High bits leaving each lane, moved down to the low end of that lane.
    const __m256i spill = _mm256_srli_epi64 ( a, 64 - n );
    // Rotate the spills one lane up, wrapping lane 3 around to lane 0.
    const __m256i wrapped = _mm256_permute4x64_epi64 ( spill, _MM_SHUFFLE ( 2, 1, 0, 3 ) );
    return _mm256_or_si256 ( shifted, wrapped );
}
// Rotates the 256-bit value in `a` left by 64 + n bits, where n is the count
// modulo 64 handed over by _mm256_rli_si256. Lane 0 is the least significant
// 64-bit lane. Result lane k combines the in-lane shifted value of the lane
// one position below with the spilled high bits of the lane two positions
// below (positions wrap around modulo 4).
__m256i left_rotate_064_127 ( __m256i a, int n ) { // 6
    // In-lane shifted values, rotated one lane up.
    const __m256i body = _mm256_permute4x64_epi64 ( _mm256_slli_epi64 ( a, n ), _MM_SHUFFLE ( 2, 1, 0, 3 ) );
    // Spilled high bits, rotated two lanes up.
    const __m256i carry = _mm256_permute4x64_epi64 ( _mm256_srli_epi64 ( a, 64 - n ), _MM_SHUFFLE ( 1, 0, 3, 2 ) );
    return _mm256_or_si256 ( body, carry );
}
// Rotates the 256-bit value in `a` left by 128 + n bits, where n is the count
// modulo 64 handed over by _mm256_rli_si256. Lane 0 is the least significant
// 64-bit lane. Result lane k combines the in-lane shifted value of the lane
// two positions below with the spilled high bits of the lane three positions
// below (positions wrap around modulo 4).
__m256i left_rotate_128_191 ( __m256i a, int n ) { // 6
    // In-lane shifted values, rotated two lanes up.
    const __m256i body = _mm256_permute4x64_epi64 ( _mm256_slli_epi64 ( a, n ), _MM_SHUFFLE ( 1, 0, 3, 2 ) );
    // Spilled high bits, rotated three lanes up.
    const __m256i carry = _mm256_permute4x64_epi64 ( _mm256_srli_epi64 ( a, 64 - n ), _MM_SHUFFLE ( 0, 3, 2, 1 ) );
    return _mm256_or_si256 ( body, carry );
}
// Rotates the 256-bit value in `a` left by 192 + n bits, where n is the count
// modulo 64 handed over by _mm256_rli_si256. Lane 0 is the least significant
// 64-bit lane. The spilled high bits of each lane have travelled a full four
// lanes and stay in place; the in-lane shifted values move three lanes up,
// which is the same as one lane down.
__m256i left_rotate_192_255 ( __m256i a, int n ) { // 5
    const __m256i spill = _mm256_srli_epi64 ( a, 64 - n );
    const __m256i shifted = _mm256_slli_epi64 ( a, n );
    // Result lane k takes the shifted value of lane k + 1 (lane 3 takes lane 0's).
    const __m256i moved = _mm256_permute4x64_epi64 ( shifted, _MM_SHUFFLE ( 0, 3, 2, 1 ) );
    return _mm256_or_si256 ( spill, moved );
}
// Rotates the whole 256-bit value `a` left by n bits. The count is reduced
// modulo 256 first, because rotating by a multiple of 256 is the identity;
// with two's-complement ints this also maps a negative count onto the
// equivalent non-negative rotation. (Previously n >= 256 was sent to the
// 192..255 helper and negative n reached the helpers unreduced.)
// The reduced count selects the helper for its 64-bit sub-range, which
// receives the count modulo 64.
__m256i _mm256_rli_si256 ( __m256i a, int n ) {
    const int count = n & 255;
    switch ( count >> 6 ) {
        case 0: return left_rotate_000_063 ( a, count );
        case 1: return left_rotate_064_127 ( a, count & 63 );
        case 2: return left_rotate_128_191 ( a, count & 63 );
        default: return left_rotate_192_255 ( a, count & 63 );
    }
}
// Rotates the 256-bit value in `a` right by n bits; the dispatcher in this
// file calls it with n in [0, 63]. Lane 0 is the least significant 64-bit
// lane. Every lane is shifted right by n and receives, in its high bits, the
// bits pushed out of the bottom of the lane above it (lane 3 takes lane 0's).
__m256i right_rotate_000_063 ( __m256i a, int n ) { // 5
    const __m256i shifted = _mm256_srli_epi64 ( a, n );
    // Low bits leaving each lane, moved up to the high end of that lane.
    const __m256i spill = _mm256_slli_epi64 ( a, 64 - n );
    // Rotate the spills one lane down, wrapping lane 0 around to lane 3.
    const __m256i wrapped = _mm256_permute4x64_epi64 ( spill, _MM_SHUFFLE ( 0, 3, 2, 1 ) );
    return _mm256_or_si256 ( shifted, wrapped );
}
// Rotates the 256-bit value in `a` right by 64 + n bits, where n is the count
// modulo 64 handed over by _mm256_rri_si256. Lane 0 is the least significant
// 64-bit lane. Result lane k combines the in-lane shifted value of the lane
// one position above with the spilled low bits of the lane two positions
// above (positions wrap around modulo 4).
__m256i right_rotate_064_127 ( __m256i a, int n ) { // 6
    // In-lane shifted values, rotated one lane down.
    const __m256i body = _mm256_permute4x64_epi64 ( _mm256_srli_epi64 ( a, n ), _MM_SHUFFLE ( 0, 3, 2, 1 ) );
    // Spilled low bits, rotated two lanes down.
    const __m256i carry = _mm256_permute4x64_epi64 ( _mm256_slli_epi64 ( a, 64 - n ), _MM_SHUFFLE ( 1, 0, 3, 2 ) );
    return _mm256_or_si256 ( body, carry );
}
// Rotates the 256-bit value in `a` right by 128 + n bits, where n is the
// count modulo 64 handed over by _mm256_rri_si256. Lane 0 is the least
// significant 64-bit lane. Result lane k combines the in-lane shifted value
// of the lane two positions above with the spilled low bits of the lane three
// positions above (positions wrap around modulo 4).
__m256i right_rotate_128_191 ( __m256i a, int n ) { // 6
    // In-lane shifted values, rotated two lanes down.
    const __m256i body = _mm256_permute4x64_epi64 ( _mm256_srli_epi64 ( a, n ), _MM_SHUFFLE ( 1, 0, 3, 2 ) );
    // Spilled low bits, rotated three lanes down.
    const __m256i carry = _mm256_permute4x64_epi64 ( _mm256_slli_epi64 ( a, 64 - n ), _MM_SHUFFLE ( 2, 1, 0, 3 ) );
    return _mm256_or_si256 ( body, carry );
}
// Rotates the 256-bit value in `a` right by 192 + n bits, where n is the
// count modulo 64 handed over by _mm256_rri_si256. Lane 0 is the least
// significant 64-bit lane. The spilled low bits of each lane have travelled a
// full four lanes and stay in place; the in-lane shifted values move three
// lanes down, which is the same as one lane up.
__m256i right_rotate_192_255 ( __m256i a, int n ) { // 5
    const __m256i spill = _mm256_slli_epi64 ( a, 64 - n );
    const __m256i shifted = _mm256_srli_epi64 ( a, n );
    // Result lane k takes the shifted value of lane k - 1 (lane 0 takes lane 3's).
    const __m256i moved = _mm256_permute4x64_epi64 ( shifted, _MM_SHUFFLE ( 2, 1, 0, 3 ) );
    return _mm256_or_si256 ( spill, moved );
}
// Rotates the whole 256-bit value `a` right by n bits. The count is reduced
// modulo 256 first, because rotating by a multiple of 256 is the identity;
// with two's-complement ints this also maps a negative count onto the
// equivalent non-negative rotation. (Previously n >= 256 was sent to the
// 192..255 helper and negative n reached the helpers unreduced.)
// The reduced count selects the helper for its 64-bit sub-range, which
// receives the count modulo 64.
__m256i _mm256_rri_si256 ( __m256i a, int n ) {
    const int count = n & 255;
    switch ( count >> 6 ) {
        case 0: return right_rotate_000_063 ( a, count );
        case 1: return right_rotate_064_127 ( a, count & 63 );
        case 2: return right_rotate_128_191 ( a, count & 63 );
        default: return right_rotate_192_255 ( a, count & 63 );
    }
}
#endif // __AVX2__
// MIT License
//
// Copyright (c) 2018 degski
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <immintrin.h>
#ifdef __AVX2__
// Whole-register (lane-crossing) shifts and rotates of a 256-bit value.
// In each function the first argument is the value and the second is the bit
// count; the definitions pick a helper per 64-bit sub-range of the count
// (0..63, 64..127, 128..191, 192..255).

// Logical left shift of the 256-bit value by the given number of bits;
// vacated low bits are zero.
__m256i _mm256_sli_si256 ( __m256i, int );
// Logical right shift of the 256-bit value by the given number of bits;
// vacated high bits are zero.
__m256i _mm256_sri_si256 ( __m256i, int );
// Rotate the 256-bit value left by the given number of bits.
__m256i _mm256_rli_si256 ( __m256i, int );
// Rotate the 256-bit value right by the given number of bits.
__m256i _mm256_rri_si256 ( __m256i, int );
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment