DavidBuchanan314/Makefile

## main.c
#include <stdio.h>

#include "utf8_incremental.h"

// Backing store for the stdin read buffer: 0x20000 bytes, aligned to 0x10000,
// which maybe helps io performance.
// main() only reads into the upper 0x10000 bytes. The lower half is wasted,
// except for its last 3 bytes: validate_utf8_incremental() requires 3 writable
// spare bytes directly in front of the buffer it is given.
static char aligned_buf[0x20000] __attribute__ ((aligned (0x10000)));

// Reads stdin in 0x10000-byte chunks and validates it as UTF-8 incrementally.
// Prints "Success!" and returns 0 when end-of-file was reached with no error
// and no incomplete trailing sequence; otherwise prints the final validator
// state and returns -1.
int main()
{
	// Upper half of aligned_buf; the bytes just below it are the validator's
	// carry-over area.
	char *const chunk = aligned_buf + 0x10000;
	const size_t chunk_cap = 0x10000;

	int state = 0;
	size_t got;

	// Keep going while the validator has not failed and the last read filled
	// the whole chunk (a short read means eof or io error).
	do {
		got = fread(chunk, 1, chunk_cap, stdin);
		state = validate_utf8_incremental(state, chunk, got);
	} while ((state >= 0) && (got == chunk_cap));

	// A short read caused by an io error leaves feof() false, so it is
	// reported as a failure too.
	if ((state != 0) || !feof(stdin)) {
		printf("failed :(\n%d\n", state);
		return -1;
	}

	printf("Success!\n");
	return 0;
}

## Makefile
# Warning and optimisation flags, shared by the C compile (main.c) and the
# C++ compile (utf8_incremental.cpp).
CFLAGS := -Wall -Wextra -Wpedantic -O3
CXXFLAGS := ${CFLAGS}
# Libraries go in LDLIBS, not LDFLAGS: make's built-in link rule puts LDFLAGS
# before the object files and LDLIBS after them, and the linker may only
# resolve symbols from libraries that appear after the objects needing them
# (static libraries, or shared ones under --as-needed).
LDLIBS := -lsimdutf

# No recipe: the objects and the final link are produced by make's implicit rules.
main: main.o utf8_incremental.o

## utf8_incremental.cpp
#include <simdutf.h>

// XXX: The version of simdutf I have installed is outdated and does not
// provide simdutf::trim_partial_utf8, so an equivalent is defined locally.
//
// Returns the length of the longest prefix of input[0..length) that does not
// end with the start of an unfinished multi-byte character. Only the last
// (up to) 3 bytes are examined, looking for lead bytes; nothing is validated.
static inline size_t trim_partial_utf8(const char *input, size_t length) {
  // lead_floor[k-1] is the smallest byte value that announces a character
  // longer than k bytes; such a byte found k bytes from the end starts a
  // character that cannot be complete yet.
  static const unsigned char lead_floor[3] = { 0xc0, 0xe0, 0xf0 };

  // Never look further back than the input actually reaches.
  const size_t window = (length < 3) ? length : 3;

  // Nearest-to-the-end byte is checked first.
  for (size_t back = 1; back <= window; back++) {
    if ((uint8_t)input[length - back] >= lead_floor[back - 1]) {
      return length - back;
    }
  }
  return length;
}

extern "C" {

#include "utf8_incremental.h"

/*
Validate one chunk of a UTF-8 stream, carrying any unfinished trailing
character over to the next call.

XXX: this API is janky! "buf" must have at least 3 spare bytes in front of it, which may be written to,
ready for the next iteration. This seems like a necessary sacrifice, to avoid large buffer copies.

Parameters:
 state  0 for the first chunk; afterwards the value returned by the previous call.
        The previous call left that many carry-over bytes at buf[-state .. -1],
        so the caller must place the new chunk at the same "buf" and leave those bytes alone.
 buf    start of the new chunk.
 len    number of new bytes at "buf" (may be 0).

Return value:
 <0 Definitely invalid UTF-8
==0 Definitely valid and complete string, up to this point.
 >0 Incomplete string, but maybe-valid thus far (represents number of bytes we think overflowed)
*/
int validate_utf8_incremental(int state, char *buf, size_t len)
{
	if ((state < 0) || (state > 3)) return -1; // propagate existing error, or raise a new one due to invalid state

	// calculate "adjusted" buffer info (the adjusted buffer will include previous overflow bytes, if present)
	char *buf_adj = buf - state;
	size_t len_adj = len + (size_t)state;
	size_t partial_len = trim_partial_utf8(buf_adj, len_adj);

	if (!simdutf::validate_utf8(buf_adj, partial_len)) return -1;

	// move the trimmed tail (at most 3 bytes) to just in front of buf, ready for next iteration
	size_t overflow_count = len_adj - partial_len;
	const char *src = buf_adj + partial_len;
	char *dst = buf - overflow_count;
	// Copy front-to-back. dst never lies after src (they are exactly "len" bytes
	// apart), and when len < overflow_count the two ranges overlap: copying
	// back-to-front would overwrite carried bytes before they have been read.
	for (size_t i = 0; i < overflow_count; i++) {
		dst[i] = src[i];
	}
	return (int)overflow_count;
}

} // extern "C"

## utf8_incremental.h
#ifndef UTF8_INCREMENTAL_H
#define UTF8_INCREMENTAL_H

#include <stddef.h> /* size_t */

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Validate one chunk of a UTF-8 stream.
 *
 * state: 0 for the first chunk, afterwards the previous return value.
 * buf:   the new chunk. It must have at least 3 writable spare bytes in front
 *        of it; the function may store an unfinished trailing character there
 *        for the next call, so the next chunk must be placed at the same buf.
 * len:   number of new bytes at buf.
 *
 * Returns <0 for definitely invalid UTF-8, 0 for valid and complete so far,
 * >0 for maybe-valid but incomplete (the number of bytes carried over).
 */
int validate_utf8_incremental(int state, char *buf, size_t len);

#ifdef __cplusplus
}
#endif

#endif /* UTF8_INCREMENTAL_H */
	#include <stdio.h>

	#include "utf8_incremental.h"

	// Backing store for the stdin read buffer: 0x20000 bytes, aligned to 0x10000,
	// which maybe helps io performance.
	// main() only reads into the upper 0x10000 bytes. The lower half is wasted,
	// except for its last 3 bytes: validate_utf8_incremental() requires 3 writable
	// spare bytes directly in front of the buffer it is given.
	static char aligned_buf[0x20000] __attribute__ ((aligned (0x10000)));

	// Reads stdin in 0x10000-byte chunks and validates it as UTF-8 incrementally.
	// Prints "Success!" and returns 0 when end-of-file was reached with no error
	// and no incomplete trailing sequence; otherwise prints the final validator
	// state and returns -1.
	int main()
	{
		// Upper half of aligned_buf; the bytes just below it are the validator's
		// carry-over area.
		char *const chunk = aligned_buf + 0x10000;
		const size_t chunk_cap = 0x10000;

		int state = 0;
		size_t got;

		// Keep going while the validator has not failed and the last read filled
		// the whole chunk (a short read means eof or io error).
		do {
			got = fread(chunk, 1, chunk_cap, stdin);
			state = validate_utf8_incremental(state, chunk, got);
		} while ((state >= 0) && (got == chunk_cap));

		// A short read caused by an io error leaves feof() false, so it is
		// reported as a failure too.
		if ((state != 0) || !feof(stdin)) {
			printf("failed :(\n%d\n", state);
			return -1;
		}

		printf("Success!\n");
		return 0;
	}
	# Warning and optimisation flags, shared by the C and C++ compiles.
	CFLAGS := -Wall -Wextra -Wpedantic -O3
	CXXFLAGS := ${CFLAGS}
	# Libraries go in LDLIBS, not LDFLAGS: make's built-in link rule puts LDFLAGS
	# before the object files and LDLIBS after them, and the linker may only
	# resolve symbols from libraries that appear after the objects needing them.
	LDLIBS := -lsimdutf

	# No recipe: the objects and the final link are produced by make's implicit rules.
	main: main.o utf8_incremental.o
	#include <simdutf.h>

	// XXX: The version of simdutf I have installed is outdated and does not
	// provide simdutf::trim_partial_utf8, so an equivalent is defined locally.
	//
	// Returns the length of the longest prefix of input[0..length) that does not
	// end with the start of an unfinished multi-byte character. Only the last
	// (up to) 3 bytes are examined, looking for lead bytes; nothing is validated.
	static inline size_t trim_partial_utf8(const char *input, size_t length) {
		// lead_floor[k-1] is the smallest byte value that announces a character
		// longer than k bytes; such a byte found k bytes from the end starts a
		// character that cannot be complete yet.
		static const unsigned char lead_floor[3] = { 0xc0, 0xe0, 0xf0 };

		// Never look further back than the input actually reaches.
		const size_t window = (length < 3) ? length : 3;

		// Nearest-to-the-end byte is checked first.
		for (size_t back = 1; back <= window; back++) {
			if ((uint8_t)input[length - back] >= lead_floor[back - 1]) {
				return length - back;
			}
		}
		return length;
	}

	extern "C" {

	#include "utf8_incremental.h"

	/*
	Validate one chunk of a UTF-8 stream, carrying any unfinished trailing
	character over to the next call.

	XXX: this API is janky! "buf" must have at least 3 spare bytes in front of it, which may be written to,
	ready for the next iteration. This seems like a necessary sacrifice, to avoid large buffer copies.

	Parameters:
	 state  0 for the first chunk; afterwards the value returned by the previous call.
	        The previous call left that many carry-over bytes at buf[-state .. -1],
	        so the caller must place the new chunk at the same "buf" and leave those bytes alone.
	 buf    start of the new chunk.
	 len    number of new bytes at "buf" (may be 0).

	Return value:
	 <0 Definitely invalid UTF-8
	==0 Definitely valid and complete string, up to this point.
	 >0 Incomplete string, but maybe-valid thus far (represents number of bytes we think overflowed)
	*/
	int validate_utf8_incremental(int state, char *buf, size_t len)
	{
		if ((state < 0) || (state > 3)) return -1; // propagate existing error, or raise a new one due to invalid state

		// calculate "adjusted" buffer info (the adjusted buffer will include previous overflow bytes, if present)
		char *buf_adj = buf - state;
		size_t len_adj = len + (size_t)state;
		size_t partial_len = trim_partial_utf8(buf_adj, len_adj);

		if (!simdutf::validate_utf8(buf_adj, partial_len)) return -1;

		// move the trimmed tail (at most 3 bytes) to just in front of buf, ready for next iteration
		size_t overflow_count = len_adj - partial_len;
		const char *src = buf_adj + partial_len;
		char *dst = buf - overflow_count;
		// Copy front-to-back. dst never lies after src (they are exactly "len" bytes
		// apart), and when len < overflow_count the two ranges overlap: copying
		// back-to-front would overwrite carried bytes before they have been read.
		for (size_t i = 0; i < overflow_count; i++) {
			dst[i] = src[i];
		}
		return (int)overflow_count;
	}

	} // extern "C"