Skip to content

Instantly share code, notes, and snippets.

@addaleax
Created December 5, 2022 00:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save addaleax/601f5380a53257b80f373ba53376181f to your computer and use it in GitHub Desktop.
Save addaleax/601f5380a53257b80f373ba53376181f to your computer and use it in GitHub Desktop.
ISO-8859-1 to UTF-8 byte length counter
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <emmintrin.h>
#include <immintrin.h>
// header
extern "C" {
// Public entry point, exported with C linkage.
//
// Returns the number of bytes required to represent the ISO-8859-1 (Latin-1)
// buffer `latin1_str`, which is `latin1_len` bytes long, when re-encoded as
// UTF-8. The implementations in this file compute this as `latin1_len` plus
// the number of bytes whose most significant bit is set.
//
// The buffer is addressed by explicit length; no NUL terminator is required.
size_t utf8_length_for_latin1_str(const uint8_t* latin1_str, size_t latin1_len);
}
// source
// Portable scalar implementation.
//
// Each Latin-1 byte below 0x80 maps to one UTF-8 byte and each byte with the
// top bit set maps to two, so the result is `len` plus the count of bytes
// whose most significant bit is set.
//
// str: pointer to `len` readable bytes (not dereferenced when len == 0).
// len: number of bytes to examine.
// Returns the UTF-8 encoded length in bytes.
static size_t utf8_length_for_latin1_str_nosimd(const uint8_t* str, size_t len) {
  size_t high_bit_bytes = 0;
  size_t pos = 0;
  while (pos < len) {
    // A set top bit means the byte needs one extra byte in UTF-8.
    high_bit_bytes += (str[pos] & 0x80u) ? 1u : 0u;
    ++pos;
  }
  return len + high_bit_bytes;
}
// SSE2 implementation: processes the input 16 bytes at a time and hands the
// remaining (len % 16) bytes to the scalar routine.
//
// str: pointer to `len` readable bytes; no alignment is required because an
//      unaligned load is used.
// len: number of bytes to examine.
// Returns the UTF-8 encoded length in bytes.
__attribute__((target("sse2")))
static size_t utf8_length_for_latin1_str_sse2(const uint8_t* str, size_t len) {
  const size_t tail_len = len % 16;
  const size_t vector_len = len - tail_len;

  // Every byte in the vectorised prefix contributes at least one output byte.
  size_t total = vector_len;
  for (size_t offset = 0; offset < vector_len; offset += 16) {
    const __m128i chunk = _mm_loadu_si128((const __m128i*)(str + offset));
    // movemask gathers the top bit of each of the 16 bytes into an int, so
    // its population count is the number of bytes needing a second byte.
    const int high_bits = _mm_movemask_epi8(chunk);
    total += __builtin_popcount(high_bits);
  }

  return total + utf8_length_for_latin1_str_nosimd(str + vector_len, tail_len);
}
// AVX2 implementation: processes the input 32 bytes at a time and hands the
// remaining (len % 32) bytes to the SSE2 routine.
//
// str: pointer to `len` readable bytes; no alignment is required because an
//      unaligned load is used.
// len: number of bytes to examine.
// Returns the UTF-8 encoded length in bytes.
__attribute__((target("avx2")))
static size_t utf8_length_for_latin1_str_avx2(const uint8_t* str, size_t len) {
  const size_t tail_len = len % 32;
  const size_t vector_len = len - tail_len;

  // Every byte in the vectorised prefix contributes at least one output byte.
  size_t total = vector_len;
  for (size_t offset = 0; offset < vector_len; offset += 32) {
    const __m256i chunk = _mm256_loadu_si256((const __m256i*)(str + offset));
    // movemask gathers the top bit of each of the 32 bytes into an int, so
    // its population count is the number of bytes needing a second byte.
    const int high_bits = _mm256_movemask_epi8(chunk);
    total += __builtin_popcount(high_bits);
  }

  return total + utf8_length_for_latin1_str_sse2(str + vector_len, tail_len);
}
extern "C" {
// ifunc resolver: returns the implementation that
// utf8_length_for_latin1_str should be bound to, preferring AVX2, then SSE2,
// then the portable scalar version, based on what the running CPU reports.
size_t(* resolve_utf8_length_for_latin1_str())(const uint8_t*, size_t) {
  // Initialise the CPU feature data before querying it with
  // __builtin_cpu_supports.
  __builtin_cpu_init();

  if (__builtin_cpu_supports("avx2")) {
    return utf8_length_for_latin1_str_avx2;
  }
  if (__builtin_cpu_supports("sse2")) {
    return utf8_length_for_latin1_str_sse2;
  }
  return utf8_length_for_latin1_str_nosimd;
}

// Public symbol; its implementation is chosen by the resolver above via the
// GNU ifunc attribute.
size_t utf8_length_for_latin1_str(const uint8_t* str, size_t len)
    __attribute__((ifunc("resolve_utf8_length_for_latin1_str")));
}
// test
#if 1
#include <assert.h>
// Self-test: checks the public entry point against inputs that are empty,
// shorter than one SIMD chunk, longer than 32 bytes, and that place a
// non-ASCII byte at the start or at the very end of a long string.
int main() {
  // Measures a NUL-terminated Latin-1 string through the public entry point.
  const auto utf8_len = [](const char* latin1) -> size_t {
    const size_t byte_count = strlen(latin1);
    return utf8_length_for_latin1_str(
        reinterpret_cast<const uint8_t*>(latin1), byte_count);
  };

  assert(utf8_len("") == 0);
  assert(utf8_len("hello") == 5);
  assert(utf8_len("hellooooooooooooooooooooooooooooooooo") == 37);
  assert(utf8_len("hell\xf6") == 6);
  assert(utf8_len("hell\xf6" "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa") == 36);
  assert(utf8_len("hellaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\xf6") == 37);
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment