Skip to content

Instantly share code, notes, and snippets.

@DavidBuchanan314
Last active January 16, 2024 12:40
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DavidBuchanan314/19941d1c9f7182cf2f5189bf8edbd00c to your computer and use it in GitHub Desktop.
Save DavidBuchanan314/19941d1c9f7182cf2f5189bf8edbd00c to your computer and use it in GitHub Desktop.
simdutf incremental utf8 validation (proof-of-concept, not rigorously tested, see https://github.com/simdutf/simdutf/issues/361 )
#include <stdio.h>
#include "utf8_incremental.h"
// I/O buffer aligned to a 64 KiB boundary, which may help I/O performance.
// Data is only ever read into the upper half; of the lower half, just the
// final 3 bytes are used (carry-over space for validate_utf8_incremental).
static char aligned_buf[0x20000] __attribute__ ((aligned (0x10000)));

/*
 * Validate stdin as UTF-8, one 64 KiB chunk at a time.
 *
 * Prints "Success!" and returns 0 when end-of-file was reached and the
 * validator reports a complete, valid string (state == 0). Otherwise prints
 * a failure message followed by the final validator state and returns -1.
 */
int main()
{
	const size_t chunk_size = 0x10000;
	char *chunk = aligned_buf + 0x10000;
	int state = 0;
	size_t got;

	// Keep going while the data is still maybe-valid and the last read
	// filled the whole chunk; a short read means EOF or an I/O error.
	do {
		got = fread(chunk, 1, chunk_size, stdin);
		state = validate_utf8_incremental(state, chunk, got);
	} while (state >= 0 && got == chunk_size);

	// Anything other than "clean EOF with no pending partial character"
	// counts as a failure (invalid data, truncated character, or I/O error).
	if (!feof(stdin) || state != 0) {
		printf("failed :(\n%d\n", state);
		return -1;
	}

	printf("Success!\n");
	return 0;
}
# Warning and optimisation flags, shared by the C and C++ compilers.
CFLAGS := -Wall -Wextra -Wpedantic -O3
CXXFLAGS := ${CFLAGS}
# Libraries go in LDLIBS rather than LDFLAGS: make's built-in link rule puts
# LDFLAGS before the object files and LDLIBS after them, and the linker needs
# -lsimdutf to come after the objects that reference it.
LDLIBS := -lsimdutf
# No recipe given: this relies on make's built-in rules to compile and link.
main: main.o utf8_incremental.o
#include <simdutf.h>
// XXX: The version of simdutf I have installed is outdated, so this is a local
// stand-in that returns the same results as simdutf::trim_partial_utf8.
//
// Returns the length of the longest prefix of input[0..length) that does not
// end in a possibly-truncated multi-byte sequence. Only the last (up to) 3
// bytes are examined, and only lead bytes are looked for; the prefix is NOT
// validated here.
static inline size_t trim_partial_utf8(const char *input, size_t length) {
	// lead_min[k-1] is the smallest lead byte whose sequence is longer than
	// k bytes, i.e. one that is still incomplete when it sits k bytes from
	// the end:
	//   >= 0xc0: 2-, 3- and 4-byte characters (incomplete with 1 byte present)
	//   >= 0xe0: 3- and 4-byte characters     (incomplete with 2 bytes present)
	//   >= 0xf0: 4-byte characters            (incomplete with 3 bytes present)
	static const uint8_t lead_min[3] = { 0xc0, 0xe0, 0xf0 };

	// Walk backwards from the end, nearest byte first, never past the start.
	for (size_t back = 1; back <= 3 && back <= length; back++) {
		const uint8_t byte = static_cast<uint8_t>(input[length - back]);
		if (byte >= lead_min[back - 1]) {
			return length - back;
		}
	}
	return length;
}
extern "C" {
#include "utf8_incremental.h"
/*
Validate one chunk of a UTF-8 stream, carrying any trailing partial character
over to the next call.

XXX: this API is janky! "buf" must have at least 3 spare bytes in front of it, which may be written to,
ready for the next iteration. This seems like a necessary sacrifice, to avoid large buffer copies.

Parameters:
state  0 for the first chunk, afterwards the value returned by the previous call
       (the number of carried-over bytes stored immediately before "buf").
buf    Start of the new data; buf[-3..-1] must be addressable and writable.
len    Number of new bytes at "buf" (may be 0).

Return value:
<0 Definitely invalid UTF-8
==0 Definitely valid and complete string, up to this point.
>0 Incomplete string, but maybe-valid thus far (represents number of bytes we think overflowed)
*/
int validate_utf8_incremental(int state, char *buf, size_t len)
{
	if ((state < 0) || (state > 3)) return -1; // propagate existing error, or raise a new one due to invalid state

	// calculate "adjusted" buffer info (the adjusted buffer will include previous overflow bytes, if present)
	char *buf_adj = buf - state;
	size_t len_adj = len + static_cast<size_t>(state);

	// validate everything except a possibly-truncated character at the very end
	size_t partial_len = trim_partial_utf8(buf_adj, len_adj);
	if (!simdutf::validate_utf8(buf_adj, partial_len)) return -1;

	// copy overflow bytes ready for next iteration
	size_t overflow_count = len_adj - partial_len; // always <= 3, since at most 3 bytes get trimmed

	// The overflow bytes are the tail of the adjusted buffer and must end up
	// immediately before "buf". When len < overflow_count the two ranges
	// overlap (part of the tail is the previous carry-over), and because the
	// destination never lies above the source, the copy has to run from the
	// lowest address upwards so no byte is overwritten before it is read.
	const char *src = buf_adj + partial_len;
	char *dst = buf - overflow_count;
	for (size_t i = 0; i < overflow_count; i++) {
		dst[i] = src[i];
	}

	return static_cast<int>(overflow_count);
}
} // extern "C"
#ifndef UTF8_INCREMENTAL_H
#define UTF8_INCREMENTAL_H

#include <stddef.h> /* size_t */

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Validate one chunk of a UTF-8 stream.
 *
 * state  0 for the first chunk, afterwards the value returned by the
 *        previous call.
 * buf    Start of the new data. At least 3 bytes directly in front of buf
 *        must be writable: they hold a partial trailing character that is
 *        carried over between calls.
 * len    Number of new bytes at buf.
 *
 * Returns <0 if the data is definitely invalid, 0 if everything so far is
 * valid and complete, or >0 (the number of carried-over bytes) if the data
 * is valid so far but ends in the middle of a character.
 */
int validate_utf8_incremental(int state, char *buf, size_t len);

#ifdef __cplusplus
}
#endif

#endif /* UTF8_INCREMENTAL_H */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment