Skip to content

Instantly share code, notes, and snippets.

@pankkor
Last active August 21, 2022 21:14
Show Gist options
  • Save pankkor/005d5135e42d0d1549b8b496e026e8d5 to your computer and use it in GitHub Desktop.
Save pankkor/005d5135e42d0d1549b8b496e026e8d5 to your computer and use it in GitHub Desktop.
Reverse char string in place using simd
// Reverse char string in place
//
// Build:
// clang -O2 -DTEST -mavx2 -Wall -Wpedantic -Wextra str_reverse.c -o str_reverse
// or
// clang -O2 -DTEST -mssse3 -Wall -Wpedantic -Wextra str_reverse.c -o str_reverse
//
#include <stdint.h> // uint64_t uint32_t
#include <stdlib.h> // abort()
#if defined(__i386__) || defined(__x86_64__)
#include <immintrin.h>
#elif defined(__ARM_NEON)
#include <arm_neon.h>
#endif
#define UNLIKELY(x) __builtin_expect(x, 0)
void str_reverse(char *str, int len) {
if (UNLIKELY(!str)) {
return;
}
char *l = str;
char *r = str + len;
// Reverse leftfmost and rightmost chunks and swap them
// moving towards center, trying bigger chunks first
//
// Given chunks:
// Iteration 1: [ c0 ][c1][c2][c3][ c4 ]
// ^l ^r (end of string)
// r - l > 32 * 2
// reverse 32-byte chunks at c0 (r0) and c4 (r4) and swap them
//
// Iteration 2: [ r4 ][c1][c2][c3][ r0 ]
// ^l ^r
// r - l < 32 * 2, but r - l > 16 * 2
// reverse 16-byte chunks at c1 (r1) and c3 (r3) and swap them
//
// Iteration 3: [ r4 ][r3][c2][r1][ r0 ]
// ^l ^r
// reverse leftovers at chunk c2 (r2)
//
// Result: [ r4][ r3 ][r2][ r1 ][ r0]
#ifdef __AVX2__
// 2x32-byte chunks
const __m256i m32 = _mm256_set_epi8(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
);
while (r - l >= 32 * 2) {
__m256i sl = _mm256_loadu_si256((__m256i *)l);
__m256i sr = _mm256_loadu_si256((__m256i *)(r - 32));
sl = _mm256_shuffle_epi8(sl, m32);
sr = _mm256_shuffle_epi8(sr, m32);
sl = _mm256_permute2x128_si256(sl, sl, 1);
sr = _mm256_permute2x128_si256(sr, sr, 1);
_mm256_storeu_si256((__m256i *)l, sr);
_mm256_storeu_si256((__m256i *)(r - 32), sl);
l += 32;
r -= 32;
}
#endif // #ifdef __AVX2__
#ifdef __SSSE3__
// 2x16-byte chunks
const __m128i m16 = _mm_set_epi8(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
);
while (r - l >= 16 * 2) {
__m128i sl = _mm_loadu_si128((__m128i *)l);
__m128i sr = _mm_loadu_si128((__m128i *)(r - 16));
sl = _mm_shuffle_epi8(sl, m16);
sr = _mm_shuffle_epi8(sr, m16);
_mm_storeu_si128((__m128i *)l, sr);
_mm_storeu_si128((__m128i *)(r - 16), sl);
l += 16;
r -= 16;
}
#endif // #ifdef __SSSE3__
#ifdef __ARM_NEON
while (r - l >= 16 * 2) {
uint8x16_t sl = vld1q_u8((uint8_t *)l);
uint8x16_t sr = vld1q_u8((uint8_t *)(r - 16));
sl = vrev64q_u8(sl); // reverse 2x64-bit lanes
sr = vrev64q_u8(sr);
sl = vextq_u8(sl, sl, 8); // swap 64-bit lanes
sr = vextq_u8(sr, sr, 8);
vst1q_u8((uint8_t *)l, sr);
vst1q_u8((uint8_t *)(r - 16), sl);
l += 16;
r -= 16;
}
#endif // ifdef __ARM_NEON
// 2x8-byte chunks
while (r - l >= 8 * 2) {
uint64_t *pvl = (uint64_t *)l;
uint64_t *pvr = (uint64_t *)(r - 8);
uint64_t vl = __builtin_bswap64(*pvl);
uint64_t vr = __builtin_bswap64(*pvr);
*pvl = vr;
*pvr = vl;
l += 8;
r -= 8;
}
// 2x4-byte chunks
if (r - l >= 4 * 2) {
uint32_t *pvl = (uint32_t *)l;
uint32_t *pvr = (uint32_t *)(r - 4);
uint32_t vl = __builtin_bswap32(*pvl);
uint32_t vr = __builtin_bswap32(*pvr);
*pvl = vr;
*pvr = vl;
l += 4;
r -= 4;
}
// per byte swap
--r;
while (l < r) {
char tmp = *l;
*l = *r;
*r = tmp;
++l;
--r;
}
}
#ifdef TEST
#include <string.h> // strncmp()
#define ALIGNED(x) __attribute__((aligned(x)))
int main(void) {
enum { s_size = 1024 };
// ALIGNED is not strictly necessary as str_reverse()
// makes unaligned loads/stores
ALIGNED(128) char s[s_size];
ALIGNED(128) char expected[s_size];
for (int len = 0; len < s_size; ++len) {
// init
for (int i = 0; i < len; ++i) {
s[i] = 32 + i % (128 - 32);
expected[len - i - 1] = s[i];
}
s[len] = '\0';
expected[len] = '\0';
str_reverse(s, len);
// cmp including null terminator
if (strncmp(s, expected, len) != 0 || s[len] != '\0') {
abort();
}
}
return 0;
}
#endif // #ifdef TEST
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment