Skip to content

Instantly share code, notes, and snippets.

@addaleax
Last active December 1, 2022 00:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save addaleax/5e26f220df28073254e44ad78ab4520a to your computer and use it in GitHub Desktop.
Save addaleax/5e26f220df28073254e44ad78ab4520a to your computer and use it in GitHub Desktop.
UTF-8 to ISO-8859-1 [aka Latin1] converters with gcc-style SIMD C++
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <emmintrin.h>
#include <immintrin.h>
// header
// Public C API. Both symbols are only declared here; further down in this file
// they are bound to concrete implementations through GNU ifunc resolvers.
extern "C" {
// Returns true if str can be converted to latin1.
//   str: pointer to the first of `len` bytes of UTF-8 text (read-only).
//   len: number of bytes to inspect; no NUL terminator is required.
// NOTE(review): the implementations in this file only inspect byte values and
// do not validate the encoding — presumably callers pass well-formed UTF-8;
// confirm against the call sites.
bool utf8_can_be_converted_to_latin1(const uint8_t* str, size_t len);
// Modifies str in place! Only safe if utf8_can_be_converted_to_latin1() has returned true before.
//   str: pointer to `len` bytes of UTF-8 text; overwritten with the latin1 form.
//   len: number of input bytes.
// Returns the number of bytes of the converted text, which now occupies the
// start of `str`; the result is never longer than `len`.
size_t utf8_convert_to_latin1(uint8_t* str, size_t len);
}
// source
// Scalar check: can the `len` bytes of UTF-8 text at `str` be represented in
// Latin-1?
//
// Code points U+0000..U+00FF are encoded either as a single ASCII byte or as a
// two-byte sequence whose lead byte is 0xC2 or 0xC3. Every lead byte above
// 0xC3 starts a code point >= U+0100, so the string is convertible exactly
// when no byte exceeds 0xC3. The input is assumed to be well-formed UTF-8; no
// validation is performed.
//
// Returns true when the string is convertible (including when len == 0).
static bool utf8_can_be_converted_to_latin1_nosimd(const uint8_t* str, size_t len) {
  for (size_t i = 0; i < len; i++) {
    if (str[i] > 0xC3) return false;
  }
  return true;
}

// SSE2 variant of the check above: inspects 16 bytes per iteration and hands
// the remaining (< 16 byte) tail to the scalar implementation.
//
// The per-byte test has to be an *unsigned* "byte > 0xC3". SSE2 only offers a
// signed byte compare, which would misclassify every byte >= 0x80, so the
// test is expressed with an unsigned saturating subtraction instead:
// (byte -sat 0xC3) is non-zero exactly for bytes above 0xC3.
__attribute__((target("sse2")))
static bool utf8_can_be_converted_to_latin1_sse2(const uint8_t* str, size_t len) {
  const __m128i limit = _mm_set1_epi8(static_cast<char>(0xC3));
  const __m128i zero = _mm_setzero_si128();
  size_t i = 0;
  for (; len - i >= 16; i += 16) {
    const __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(str + i));
    const __m128i excess = _mm_subs_epu8(chunk, limit);
    // All 16 lanes must have saturated to zero, i.e. the equality mask is full.
    if (_mm_movemask_epi8(_mm_cmpeq_epi8(excess, zero)) != 0xFFFF) return false;
  }
  return utf8_can_be_converted_to_latin1_nosimd(str + i, len - i);
}

// AVX2 variant: inspects 32 bytes per iteration using the same unsigned
// saturating-subtract test as the SSE2 variant, then delegates the remaining
// (< 32 byte) tail to the SSE2 implementation.
__attribute__((target("avx2")))
static bool utf8_can_be_converted_to_latin1_avx2(const uint8_t* str, size_t len) {
  const __m256i limit = _mm256_set1_epi8(static_cast<char>(0xC3));
  const __m256i zero = _mm256_setzero_si256();
  size_t i = 0;
  for (; len - i >= 32; i += 32) {
    const __m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(str + i));
    const __m256i excess = _mm256_subs_epu8(chunk, limit);
    // movemask yields 32 bits; all-ones (-1 as int) means no byte exceeded 0xC3.
    if (_mm256_movemask_epi8(_mm256_cmpeq_epi8(excess, zero)) != -1) return false;
  }
  return utf8_can_be_converted_to_latin1_sse2(str + i, len - i);
}
// Converts the single character starting at `read` and stores the resulting
// Latin-1 byte at `write`, advancing both cursors.
//
// A lead byte (top two bits set) that is followed by at least one more byte
// before `end` is merged with its continuation byte into one output byte.
// Anything else (ASCII, or a lead byte that is the very last input byte) is
// copied through unchanged. The copy is required because `write` lags behind
// `read` as soon as one two-byte sequence has been collapsed.
//
// Precondition: read < end and write <= read.
static inline void utf8_convert_to_latin1_scalar_step(uint8_t*& read, uint8_t*& write, const uint8_t* end) {
  if ((*read & 0xC0) == 0xC0 && read + 1 < end) {
    *write = static_cast<uint8_t>(((read[0] & 0x1F) << 6) | (read[1] & 0x3F));
    read += 2;
  } else {
    *write = *read;
    read++;
  }
  write++;
}

// Scalar in-place UTF-8 -> Latin-1 conversion.
//   str: buffer holding `len` bytes of UTF-8 text that only contains code
//        points up to U+00FF; overwritten with the Latin-1 form.
//   len: number of input bytes.
// Returns the number of Latin-1 bytes written to the start of `str`.
static size_t utf8_convert_to_latin1_nosimd(uint8_t* str, size_t len) {
  uint8_t* const end = str + len;
  uint8_t* read = str;
  uint8_t* write = str;
  while (read < end) {
    utf8_convert_to_latin1_scalar_step(read, write, end);
  }
  return static_cast<size_t>(write - str);
}

// SSE2 in-place conversion; same contract as the scalar version.
//
// Whenever the next 16 input bytes are all ASCII (no byte has its top bit
// set) they are moved as one block; otherwise a single character is handled
// by the scalar step and the block test is retried at the new position.
__attribute__((target("sse2")))
static size_t utf8_convert_to_latin1_sse2(uint8_t* str, size_t len) {
  uint8_t* const end = str + len;
  uint8_t* read = str;
  uint8_t* write = str;
  while (read < end) {
    if (end - read >= 16) {
      const __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(read));
      if (_mm_movemask_epi8(chunk) == 0) {
        // The chunk is already in a register, so the overlapping store to the
        // lower address is safe. Skipped while nothing has been compacted yet.
        if (write != read) {
          _mm_storeu_si128(reinterpret_cast<__m128i*>(write), chunk);
        }
        read += 16;
        write += 16;
        continue;
      }
    }
    utf8_convert_to_latin1_scalar_step(read, write, end);
  }
  return static_cast<size_t>(write - str);
}

// AVX2 in-place conversion; same contract as the scalar version.
//
// Tries a 32-byte all-ASCII block first, then a 16-byte one, and falls back
// to the scalar step for a single character when neither applies.
__attribute__((target("avx2")))
static size_t utf8_convert_to_latin1_avx2(uint8_t* str, size_t len) {
  uint8_t* const end = str + len;
  uint8_t* read = str;
  uint8_t* write = str;
  while (read < end) {
    if (end - read >= 32) {
      const __m256i wide = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(read));
      if (_mm256_movemask_epi8(wide) == 0) {
        if (write != read) {
          _mm256_storeu_si256(reinterpret_cast<__m256i*>(write), wide);
        }
        read += 32;
        write += 32;
        continue;
      }
    }
    if (end - read >= 16) {
      const __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(read));
      if (_mm_movemask_epi8(chunk) == 0) {
        if (write != read) {
          _mm_storeu_si128(reinterpret_cast<__m128i*>(write), chunk);
        }
        read += 16;
        write += 16;
        continue;
      }
    }
    utf8_convert_to_latin1_scalar_step(read, write, end);
  }
  return static_cast<size_t>(write - str);
}
extern "C" {
// ifunc resolver for utf8_can_be_converted_to_latin1(): picks the widest
// implementation the running CPU reports support for (AVX2, then SSE2, then
// the plain scalar loop). __builtin_cpu_init() is called first so that the
// __builtin_cpu_supports() queries have their feature data available.
auto resolve_can_convert() -> bool (*)(const uint8_t*, size_t) {
  __builtin_cpu_init();
  if (__builtin_cpu_supports("avx2")) {
    return utf8_can_be_converted_to_latin1_avx2;
  }
  if (__builtin_cpu_supports("sse2")) {
    return utf8_can_be_converted_to_latin1_sse2;
  }
  return utf8_can_be_converted_to_latin1_nosimd;
}

// ifunc resolver for utf8_convert_to_latin1(); same selection order as above.
auto resolve_convert() -> size_t (*)(uint8_t*, size_t) {
  __builtin_cpu_init();
  if (__builtin_cpu_supports("avx2")) {
    return utf8_convert_to_latin1_avx2;
  }
  if (__builtin_cpu_supports("sse2")) {
    return utf8_convert_to_latin1_sse2;
  }
  return utf8_convert_to_latin1_nosimd;
}

// Bind the public symbols to whatever their resolver returns. The resolvers
// live inside extern "C" so the names given to ifunc match the unmangled
// symbol names.
__attribute__((ifunc("resolve_can_convert")))
bool utf8_can_be_converted_to_latin1(const uint8_t* str, size_t len);
__attribute__((ifunc("resolve_convert")))
size_t utf8_convert_to_latin1(uint8_t* str, size_t len);
}
// test
#if 1
#include <assert.h>
#include <string>
// Self-test: drives both public entry points (through their ifunc-bound
// symbols) with short and long inputs and aborts via assert() on a mismatch.
int main() {
  // Adapter so the checks can pass NUL-terminated literals to the C API.
  const auto fits_latin1 = [](const char* text) -> bool {
    const uint8_t* bytes = reinterpret_cast<const uint8_t*>(text);
    return utf8_can_be_converted_to_latin1(bytes, strlen(text));
  };

  // Convertible inputs: empty, ASCII of various lengths, and U+00F6.
  assert(fits_latin1(""));
  assert(fits_latin1("hello"));
  assert(fits_latin1("hellooooooooooooooooooooooooooooooooo"));
  assert(fits_latin1("hellö"));

  // Inputs containing U+0101, which lies outside Latin-1.
  assert(!fits_latin1("hellā"));
  assert(!fits_latin1("hellāaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"));
  assert(!fits_latin1("hellaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaā"));

  // Runs the in-place conversion on a private copy and trims it to the
  // length the conversion reports.
  const auto to_latin1 = [](const char* text) -> std::string {
    std::string buffer = text;
    uint8_t* const bytes = reinterpret_cast<uint8_t*>(&buffer[0]);
    buffer.resize(utf8_convert_to_latin1(bytes, buffer.size()));
    return buffer;
  };

  assert(to_latin1("hello") == "hello");
  assert(to_latin1("hellö") == "hell\xf6");
  assert(to_latin1("hellllllllllllllllllllllllllllllllllllllö") == "hellllllllllllllllllllllllllllllllllllll\xf6");
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment