Skip to content

Instantly share code, notes, and snippets.

@kccqzy
Created February 1, 2019 08:15
Show Gist options
  • Save kccqzy/0f0aedc3f6789229170601f804ae13a0 to your computer and use it in GitHub Desktop.
Save kccqzy/0f0aedc3f6789229170601f804ae13a0 to your computer and use it in GitHub Desktop.
Test UTF-8 decoder using PEXT instruction
#include <stdlib.h>
// Decode one UTF-8 encoded code point starting at `buf` and advance `buf`
// past it.
//
// The next four bytes are loaded as a big-endian 32-bit word so that the lead
// byte sits in the top bits; the payload bits of a multi-byte sequence are
// then gathered with a single PEXT (x86 BMI2) instruction.
//
// Parameters:
//   buf  in/out cursor into the input. The function ALWAYS reads 4 bytes from
//        `buf`, even for shorter sequences, so the caller must guarantee that
//        at least 4 bytes are readable (e.g. by padding the input).
//
// Returns the decoded code point. On return `buf` has been advanced by the
// length of the sequence (1-4 bytes).
//
// Calls abort() when the input is not a well-formed UTF-8 sequence: a bad
// lead/continuation byte pattern, an overlong encoding, a UTF-16 surrogate
// (U+D800..U+DFFF), or a value above U+10FFFF.
unsigned pext_utf8_decode(unsigned char*& buf) {
  unsigned next4;
  __builtin_memcpy(&next4, buf, 4);
  // Swap to big-endian order so byte 0 of the input becomes the top byte.
  next4 = __builtin_bswap32(next4);

  // Fast path: a clear top bit means a single-byte (ASCII) code point.
  if (__builtin_expect(!(next4 >> 31), 1)) {
    buf++;
    return next4 >> 24;
  }

  unsigned payload_mask;  // bits of next4 that carry code point payload
  unsigned length;        // number of bytes in the sequence
  unsigned min_cp;        // smallest code point that needs `length` bytes
  if (((next4 >> 16) & 0b11100000'11000000) == 0b11000000'10000000) {
    // 110xxxxx 10xxxxxx
    payload_mask = 0b00011111'00111111'00000000'00000000u;
    length = 2;
    min_cp = 0x80;
  } else if (((next4 >> 8) & 0b11110000'11000000'11000000) ==
             0b11100000'10000000'10000000) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    payload_mask = 0b00001111'00111111'00111111'00000000u;
    length = 3;
    min_cp = 0x800;
  } else if ((next4 & 0b11111000'11000000'11000000'11000000u) ==
             0b11110000'10000000'10000000'10000000u) {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    payload_mask = 0b00000111'00111111'00111111'00111111u;
    length = 4;
    min_cp = 0x10000;
  } else {
    // Stray continuation byte, bad continuation, or invalid lead byte.
    abort();
  }

  // AT&T operand order: pext <mask>, <source>, <destination>.
  unsigned cp;
  __asm__("pext %1, %2, %0" : "=r"(cp) : "r"(payload_mask), "r"(next4));

  // The bit patterns alone also match overlong forms, encoded surrogates and
  // values past the end of Unicode; reject those explicitly.
  if (cp < min_cp || cp > 0x10FFFFu || (cp >= 0xD800u && cp <= 0xDFFFu)) {
    abort();
  }

  buf += length;
  return cp;
}
#include <stdio.h>
#include <assert.h>
// Self-test for pext_utf8_decode: decodes a fixed set of UTF-8 sequences and
// asserts both the resulting code point and how far the cursor advanced.
int main() {
  struct utf8_case {
    // Input bytes; unspecified trailing elements are zero, which keeps the
    // decoder's 4-byte load inside the array.
    unsigned char bytes[5];
    unsigned expected;  // code point the decoder must return
    int consumed;       // number of bytes the cursor must advance
  };

  utf8_case cases[] = {
      {{0xe2, 0x82, 0xac}, 0x20ac, 3},
      {{'g'}, (unsigned) 'g', 1},
      {{0xc2, 0xa2}, 0xa2, 2},
      {{0xe0, 0xa4, 0xb9}, 0x939, 3},
      {{0xe2, 0x82, 0xac}, 0x20ac, 3},
      {{0xf0, 0x90, 0x8d, 0x88}, 0x10348, 4},
  };

  for (utf8_case& c : cases) {
    unsigned char* cursor = c.bytes;
    assert(pext_utf8_decode(cursor) == c.expected);
    assert(cursor == c.bytes + c.consumed);
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment