Last active
August 21, 2022 21:14
-
-
Save pankkor/005d5135e42d0d1549b8b496e026e8d5 to your computer and use it in GitHub Desktop.
Reverse char string in place using simd
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Reverse char string in place | |
// | |
// Build: | |
// clang -O2 -DTEST -mavx2 -Wall -Wpedantic -Wextra str_reverse.c -o str_reverse | |
// or | |
// clang -O2 -DTEST -mssse3 -Wall -Wpedantic -Wextra str_reverse.c -o str_reverse | |
// | |
#include <stdint.h> // uint64_t uint32_t | |
#include <stdlib.h> // abort() | |
#if defined(__i386__) || defined(__x86_64__) | |
#include <immintrin.h> | |
#elif defined(__ARM_NEON) | |
#include <arm_neon.h> | |
#endif | |
#define UNLIKELY(x) __builtin_expect(x, 0) | |
void str_reverse(char *str, int len) { | |
if (UNLIKELY(!str)) { | |
return; | |
} | |
char *l = str; | |
char *r = str + len; | |
// Reverse leftfmost and rightmost chunks and swap them | |
// moving towards center, trying bigger chunks first | |
// | |
// Given chunks: | |
// Iteration 1: [ c0 ][c1][c2][c3][ c4 ] | |
// ^l ^r (end of string) | |
// r - l > 32 * 2 | |
// reverse 32-byte chunks at c0 (r0) and c4 (r4) and swap them | |
// | |
// Iteration 2: [ r4 ][c1][c2][c3][ r0 ] | |
// ^l ^r | |
// r - l < 32 * 2, but r - l > 16 * 2 | |
// reverse 16-byte chunks at c1 (r1) and c3 (r3) and swap them | |
// | |
// Iteration 3: [ r4 ][r3][c2][r1][ r0 ] | |
// ^l ^r | |
// reverse leftovers at chunk c2 (r2) | |
// | |
// Result: [ r4][ r3 ][r2][ r1 ][ r0] | |
#ifdef __AVX2__ | |
// 2x32-byte chunks | |
const __m256i m32 = _mm256_set_epi8( | |
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, | |
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
); | |
while (r - l >= 32 * 2) { | |
__m256i sl = _mm256_loadu_si256((__m256i *)l); | |
__m256i sr = _mm256_loadu_si256((__m256i *)(r - 32)); | |
sl = _mm256_shuffle_epi8(sl, m32); | |
sr = _mm256_shuffle_epi8(sr, m32); | |
sl = _mm256_permute2x128_si256(sl, sl, 1); | |
sr = _mm256_permute2x128_si256(sr, sr, 1); | |
_mm256_storeu_si256((__m256i *)l, sr); | |
_mm256_storeu_si256((__m256i *)(r - 32), sl); | |
l += 32; | |
r -= 32; | |
} | |
#endif // #ifdef __AVX2__ | |
#ifdef __SSSE3__ | |
// 2x16-byte chunks | |
const __m128i m16 = _mm_set_epi8( | |
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
); | |
while (r - l >= 16 * 2) { | |
__m128i sl = _mm_loadu_si128((__m128i *)l); | |
__m128i sr = _mm_loadu_si128((__m128i *)(r - 16)); | |
sl = _mm_shuffle_epi8(sl, m16); | |
sr = _mm_shuffle_epi8(sr, m16); | |
_mm_storeu_si128((__m128i *)l, sr); | |
_mm_storeu_si128((__m128i *)(r - 16), sl); | |
l += 16; | |
r -= 16; | |
} | |
#endif // #ifdef __SSSE3__ | |
#ifdef __ARM_NEON | |
while (r - l >= 16 * 2) { | |
uint8x16_t sl = vld1q_u8((uint8_t *)l); | |
uint8x16_t sr = vld1q_u8((uint8_t *)(r - 16)); | |
sl = vrev64q_u8(sl); // reverse 2x64-bit lanes | |
sr = vrev64q_u8(sr); | |
sl = vextq_u8(sl, sl, 8); // swap 64-bit lanes | |
sr = vextq_u8(sr, sr, 8); | |
vst1q_u8((uint8_t *)l, sr); | |
vst1q_u8((uint8_t *)(r - 16), sl); | |
l += 16; | |
r -= 16; | |
} | |
#endif // ifdef __ARM_NEON | |
// 2x8-byte chunks | |
while (r - l >= 8 * 2) { | |
uint64_t *pvl = (uint64_t *)l; | |
uint64_t *pvr = (uint64_t *)(r - 8); | |
uint64_t vl = __builtin_bswap64(*pvl); | |
uint64_t vr = __builtin_bswap64(*pvr); | |
*pvl = vr; | |
*pvr = vl; | |
l += 8; | |
r -= 8; | |
} | |
// 2x4-byte chunks | |
if (r - l >= 4 * 2) { | |
uint32_t *pvl = (uint32_t *)l; | |
uint32_t *pvr = (uint32_t *)(r - 4); | |
uint32_t vl = __builtin_bswap32(*pvl); | |
uint32_t vr = __builtin_bswap32(*pvr); | |
*pvl = vr; | |
*pvr = vl; | |
l += 4; | |
r -= 4; | |
} | |
// per byte swap | |
--r; | |
while (l < r) { | |
char tmp = *l; | |
*l = *r; | |
*r = tmp; | |
++l; | |
--r; | |
} | |
} | |
#ifdef TEST | |
#include <string.h> // strncmp() | |
#define ALIGNED(x) __attribute__((aligned(x))) | |
int main(void) { | |
enum { s_size = 1024 }; | |
// ALIGNED is not strictly necessary as str_reverse() | |
// makes unaligned loads/stores | |
ALIGNED(128) char s[s_size]; | |
ALIGNED(128) char expected[s_size]; | |
for (int len = 0; len < s_size; ++len) { | |
// init | |
for (int i = 0; i < len; ++i) { | |
s[i] = 32 + i % (128 - 32); | |
expected[len - i - 1] = s[i]; | |
} | |
s[len] = '\0'; | |
expected[len] = '\0'; | |
str_reverse(s, len); | |
// cmp including null terminator | |
if (strncmp(s, expected, len) != 0 || s[len] != '\0') { | |
abort(); | |
} | |
} | |
return 0; | |
} | |
#endif // #ifdef TEST |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment