Last active
January 16, 2024 12:40
-
-
Save DavidBuchanan314/19941d1c9f7182cf2f5189bf8edbd00c to your computer and use it in GitHub Desktop.
simdutf incremental UTF-8 validation (proof of concept, not rigorously tested; see https://github.com/simdutf/simdutf/issues/361)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include "utf8_incremental.h" | |
// Twice the chunk size and aligned to the chunk size: the upper half is the
// (very page-aligned) read target, which maybe helps io performance.
// The lower half is completely wasted, except for its last 3 bytes, which sit
// directly in front of the chunk and may be written by
// validate_utf8_incremental() to carry a partial character between calls.
static char aligned_buf[0x20000] __attribute__ ((aligned (0x10000)));

// Reads stdin in fixed-size chunks and validates it as UTF-8 incrementally.
// Prints "Success!" and returns 0 when the whole input was read and is valid;
// otherwise prints the final validator state and returns -1.
int main()
{
    const size_t chunk_len = 0x10000;
    char *chunk = &aligned_buf[0x10000];
    int utf8_state = 0;
    size_t got;

    // Keep going while the validator has not reported an error (the early
    // exit is optional; it would still work without it) and the last read
    // filled the whole chunk. A short read means eof or io error.
    do {
        got = fread(chunk, 1, chunk_len, stdin);
        utf8_state = validate_utf8_incremental(utf8_state, chunk, got);
    } while ((utf8_state >= 0) && (got == chunk_len));

    // Anything other than "clean eof with no pending partial character" fails.
    if ((utf8_state != 0) || !feof(stdin)) {
        printf("failed :(\n%d\n", utf8_state);
        return -1;
    }

    printf("Success!\n");
    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CFLAGS := -Wall -Wextra -Wpedantic -O3
CXXFLAGS := ${CFLAGS}

# Libraries go in LDLIBS, not LDFLAGS: make's built-in link rule expands
# LDFLAGS before the object files and LDLIBS after them, and -l options must
# follow the objects that reference them for linkers that resolve symbols
# left to right (or that default to --as-needed).
LDLIBS := -lsimdutf

main: main.o utf8_incremental.o
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <simdutf.h> | |
// XXX: The version of simdutf I have installed is outdated, so this is a local
// stand-in for simdutf::trim_partial_utf8 (same results, restructured as a loop).
//
// Returns the length of the longest prefix of input[0..length) that does not
// end in the middle of a multi-byte character. Only the last (up to) three
// bytes are inspected, and only for lead bytes whose character would extend
// past the end of the buffer:
//   1 byte from the end:  >= 0xc0 starts a 2-, 3- or 4-byte character
//   2 bytes from the end: >= 0xe0 starts a 3- or 4-byte character
//   3 bytes from the end: >= 0xf0 starts a 4-byte character
static inline size_t trim_partial_utf8(const char *input, size_t length) {
    static const uint8_t min_lead[3] = { 0xc0, 0xe0, 0xf0 };

    // Never look further back than the buffer actually reaches.
    const size_t lookback = (length < 3) ? length : 3;

    for (size_t back = 1; back <= lookback; back++) {
        if ((uint8_t)input[length - back] >= min_lead[back - 1]) {
            return length - back; // cut right before the incomplete character
        }
    }
    return length; // nothing dangling at the end
}
extern "C" { | |
#include "utf8_incremental.h" | |
/* | |
XXX: this API is janky! "buf" must have at least 3 spare bytes infront of it, which may be written to, | |
ready for the next iteration. This seems like a necessary sacrifice, to avoid large buffer copies. | |
Return value: | |
<0 Definitely invalid UTF-8 | |
==0 Definitely valid and complete string, up to this point. | |
>0 Incomplete string, but maybe-valid thus far (represents number of bytes we think overflowed) | |
*/ | |
int validate_utf8_incremental(int state, char *buf, size_t len) | |
{ | |
if ((state < 0) || (state > 3)) return -1; // propagate existing error, or raise a new one due to invalid state | |
// calculate "adjusted" buffer info (the adjusted buffer will include previous overflow bytes, if present) | |
char *buf_adj = buf - state; | |
size_t len_adj = len + state; | |
size_t partial_len = trim_partial_utf8(buf_adj, len_adj); | |
if (!simdutf::validate_utf8(buf_adj, partial_len)) return -1; | |
// copy overflow bytes ready for next iteration | |
size_t overflow_count = len_adj - partial_len; | |
//assert(overflow_count <= 3); // this will always be true | |
for (size_t i=0; i<overflow_count; i++) { | |
buf[-i-1] = buf[len-i-1]; // XXX: this indexes negatively into buf! | |
} | |
return overflow_count; | |
} | |
} // extern "C" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef UTF8_INCREMENTAL_H
#define UTF8_INCREMENTAL_H

#include <stddef.h> /* size_t */

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Incrementally validate a stream of UTF-8 bytes, one chunk per call.
 *
 * state: value returned by the previous call; 0 for the first chunk.
 * buf:   start of the new chunk. At least 3 spare, writable bytes must exist
 *        directly in front of buf; they carry a partial character from one
 *        call to the next.
 * len:   number of new bytes at buf.
 *
 * Returns <0 for definitely invalid UTF-8, 0 for valid and complete so far,
 * and >0 when the input so far ends in an incomplete (but maybe valid)
 * character. Pass the return value as "state" to the next call.
 */
int validate_utf8_incremental(int state, char *buf, size_t len);

#ifdef __cplusplus
}
#endif

#endif /* UTF8_INCREMENTAL_H */
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment