RealNeGate/my_c_lexer.c

## my_c_lexer.c
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <x86intrin.h>

enum {
    TOKEN_ACCESSOR = '.',

    TOKEN_PLUS = '+',
    TOKEN_MINUS = '-',
    TOKEN_TIMES = '*',
    TOKEN_SLASH = '/',
    TOKEN_PERCENT = '%',
    TOKEN_ASSIGN = '=',

    TOKEN_HASH = '#',

    TOKEN_BRACKET_OPEN = '[',
    TOKEN_BRACKET_CLOSE = ']',

    TOKEN_PAREN_OPEN = '(',
    TOKEN_PAREN_CLOSE = ')',

    TOKEN_BRACE_OPEN = '{',
    TOKEN_BRACE_CLOSE = '}',

    TOKEN_IDENTIFIER = 256,
	TOKEN_NUMBER,
    TOKEN_STRING,

    TOKEN_INVALID,

    TOKEN_INCREMENT,    /* ++  */
    TOKEN_DECREMENT,    /* --  */

    TOKEN_ARROW,        /* ->  */

    TOKEN_DOUBLE_HASH   = '#' + 256,  /* ##  */
    TOKEN_PLUS_EQUAL    = '+' + 256,  /* +=  */
    TOKEN_MINUS_EQUAL   = '-' + 256,  /* -=  */
    TOKEN_TIMES_EQUAL   = '*' + 256,  /* *=  */
    TOKEN_SLASH_EQUAL   = '/' + 256,  /* /=  */
    TOKEN_PERCENT_EQUAL = '%' + 256,  /* /=  */
    TOKEN_EQUALITY      = '=' + 256,  /* ==  */
};

typedef struct Lexer {
    const char* current;

    // current token info
    int token_type;
    const char* token_start;
    const char* token_end;
} Lexer;

static _Alignas(64) uint8_t identifier_char_tbl[] = {
	0x00,0x00,0x00,0x00,
	0x10,0x00,0xff,0x03,
	0xfe,0xff,0xff,0x87,
	0xfe,0xff,0xff,0x07,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00
};

__attribute__((always_inline))
static bool is_identifier_not_first_char(char ch) {
	size_t i = ch;
	size_t index = i / 8;
	size_t shift = i & 7;
	return (identifier_char_tbl[index] >> shift) & 1;
}

static _Alignas(64) uint8_t space_char_tbl[] = {
	0x00,0x26,0x00,0x00,
	0x01,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00
};

__attribute__((always_inline))
static bool is_space(char ch) {
	size_t i = ch;
	size_t index = i / 8;
	size_t shift = i & 7;
	return (space_char_tbl[index] >> shift) & 1;
}

static void read(Lexer* restrict l) {
    const char* current = l->current;

    // branchless space skip
    current += (*current == ' ');

    redo_lex:;
    const char* start = current;
    switch (*start) {
        case '\0':
        l->token_type = '\0';
        break;
        case '\r':
        current++;

        // it's expected these are next to each other because
        // Windows, fast path a fallthrough
        if (*current != '\n') goto redo_lex;
        case '\n': {
            current++;

            // Do a branchless SIMD skip of up to 16 tabs after a newline.
            __m128i chars = _mm_loadu_si128((__m128i *)current);
            int len = __builtin_ffs(~_mm_movemask_epi8(_mm_cmpeq_epi8(chars, _mm_set1_epi8('\t'))));
            current += (len ? len - 1 : 0);
            goto redo_lex;
        }
        case ' ':
        case '\t':
        case '\v':
        // slow path
        do {
            current++;
        } while (is_space(*current));
        goto redo_lex;
        case '\"':
        current++;

        do {
            __m128i chars = _mm_loadu_si128((__m128i *)current);
            int len = __builtin_ffs(_mm_movemask_epi8(_mm_cmpeq_epi8(chars, _mm_set1_epi8('\"'))));

            if (len) {
                current += len;
                if (current[-1] == '\"' && current[-2] != '\\') break;
            } else {
                current += 16;
            }
        } while (*current);

        l->token_type = TOKEN_STRING;
        break;
        case '+':
        current++;

        if (*current == '-') {
            current++;
            l->token_type = TOKEN_DECREMENT;
            break;
        } else if (*current == '=') {
            current++;
            l->token_type = TOKEN_PLUS_EQUAL;
            break;
        }

        l->token_type = TOKEN_PLUS;
        break;
        case '-':
        current++;

        if (*current == '-') {
            current++;
            l->token_type = TOKEN_DECREMENT;
            break;
        } else if (*current == '>') {
            current++;
            l->token_type = TOKEN_ARROW;
            break;
        }

        l->token_type = TOKEN_MINUS;
        break;
        case '*':
        case '/':
        case '&':
        case '%':
        case '|':
        case '^':
        case '=': {
            int t = *current++;
            if (*current == '=') {
                current++;
                t += 256;
            }

            l->token_type = t;
            break;
        }
        case '#':
        current++;

        if (*current == '#') {
            current++;
            l->token_type = TOKEN_DOUBLE_HASH;
            break;
        }

        l->token_type = TOKEN_HASH;
        goto redo_lex;
        case '~':
        case '!':
        case '.':
        case ';':
        case ',':
        case '(':
        case ')':
        case '[':
        case ']':
        case '{':
        case '}':
        current++;
        l->token_type = *start;
        break;
        case 'A' ... 'Z':
        case 'a' ... 'z':
        case '_': case '$': {
            __builtin_prefetch(identifier_char_tbl);
            do {
                current++;
            } while (is_identifier_not_first_char(*((unsigned char*)current)));

            l->token_type = TOKEN_IDENTIFIER;
            break;
        }
		case '0' ... '9':
		do {
			current++;
		} while (*current >= '0' && *current <= '9');
        l->token_type = TOKEN_NUMBER;
		break;
        default:
        l->token_type = TOKEN_INVALID;
        abort();
        break;
    }

    l->token_start = start;
    l->token_end = current;
    l->current = current;
}

int main() {
    const char* apple = "int main(int argc, char* argv[]) {\n"
                        "\n"
                        "\tint x = 16;\n"
                        "\t\n"
                        "\t\tx *= 64;\n"
                        "\tx %= 16;\n"
                        "\t\n"
                        "\treturn 0;\n"
                        "}\n";

    Lexer l = { apple };
    do {
        read(&l);
        printf("%.*s\n", (int)(l.token_end - l.token_start), l.token_start);
    } while (l.token_type);

    return 0;
}
	#include <stdlib.h>
	#include <stdio.h>
	#include <stdint.h>
	#include <stdbool.h>
	#include <string.h>
	#include <x86intrin.h>

	enum {
	TOKEN_ACCESSOR = '.',

	TOKEN_PLUS = '+',
	TOKEN_MINUS = '-',
	TOKEN_TIMES = '*',
	TOKEN_SLASH = '/',
	TOKEN_PERCENT = '%',
	TOKEN_ASSIGN = '=',

	TOKEN_HASH = '#',

	TOKEN_BRACKET_OPEN = '[',
	TOKEN_BRACKET_CLOSE = ']',

	TOKEN_PAREN_OPEN = '(',
	TOKEN_PAREN_CLOSE = ')',

	TOKEN_BRACE_OPEN = '{',
	TOKEN_BRACE_CLOSE = '}',

	TOKEN_IDENTIFIER = 256,
	TOKEN_NUMBER,
	TOKEN_STRING,

	TOKEN_INVALID,

	TOKEN_INCREMENT, /* ++ */
	TOKEN_DECREMENT, /* -- */

	TOKEN_ARROW, /* -> */

	TOKEN_DOUBLE_HASH = '#' + 256, /* ## */
	TOKEN_PLUS_EQUAL = '+' + 256, /* += */
	TOKEN_MINUS_EQUAL = '-' + 256, /* -= */
	TOKEN_TIMES_EQUAL = '' + 256, / = /
	TOKEN_SLASH_EQUAL = '/' + 256, /* /= */
	TOKEN_PERCENT_EQUAL = '%' + 256, /* /= */
	TOKEN_EQUALITY = '=' + 256, /* == */
	};

	typedef struct Lexer {
	const char* current;

	// current token info
	int token_type;
	const char* token_start;
	const char* token_end;
	} Lexer;

	static _Alignas(64) uint8_t identifier_char_tbl[] = {
	0x00,0x00,0x00,0x00,
	0x10,0x00,0xff,0x03,
	0xfe,0xff,0xff,0x87,
	0xfe,0xff,0xff,0x07,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00
	};

	__attribute__((always_inline))
	static bool is_identifier_not_first_char(char ch) {
	size_t i = ch;
	size_t index = i / 8;
	size_t shift = i & 7;
	return (identifier_char_tbl[index] >> shift) & 1;
	}

	static _Alignas(64) uint8_t space_char_tbl[] = {
	0x00,0x26,0x00,0x00,
	0x01,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00,
	0x00,0x00,0x00,0x00
	};

	__attribute__((always_inline))
	static bool is_space(char ch) {
	size_t i = ch;
	size_t index = i / 8;
	size_t shift = i & 7;
	return (space_char_tbl[index] >> shift) & 1;
	}

	static void read(Lexer* restrict l) {
	const char* current = l->current;

	// branchless space skip
	current += (*current == ' ');

	redo_lex:;
	const char* start = current;
	switch (*start) {
	case '\0':
	l->token_type = '\0';
	break;
	case '\r':
	current++;

	// it's expected these are next to each other because
	// Windows, fast path a fallthrough
	if (*current != '\n') goto redo_lex;
	case '\n': {
	current++;

	// Do a branchless SIMD skip of up to 16 tabs after a newline.
	__m128i chars = _mm_loadu_si128((__m128i *)current);
	int len = __builtin_ffs(~_mm_movemask_epi8(_mm_cmpeq_epi8(chars, _mm_set1_epi8('\t'))));
	current += (len ? len - 1 : 0);
	goto redo_lex;
	}
	case ' ':
	case '\t':
	case '\v':
	// slow path
	do {
	current++;
	} while (is_space(*current));
	goto redo_lex;
	case '\"':
	current++;

	do {
	__m128i chars = _mm_loadu_si128((__m128i *)current);
	int len = __builtin_ffs(_mm_movemask_epi8(_mm_cmpeq_epi8(chars, _mm_set1_epi8('\"'))));

	if (len) {
	current += len;
	if (current[-1] == '\"' && current[-2] != '\\') break;
	} else {
	current += 16;
	}
	} while (*current);

	l->token_type = TOKEN_STRING;
	break;
	case '+':
	current++;

	if (*current == '-') {
	current++;
	l->token_type = TOKEN_DECREMENT;
	break;
	} else if (*current == '=') {
	current++;
	l->token_type = TOKEN_PLUS_EQUAL;
	break;
	}

	l->token_type = TOKEN_PLUS;
	break;
	case '-':
	current++;

	if (*current == '-') {
	current++;
	l->token_type = TOKEN_DECREMENT;
	break;
	} else if (*current == '>') {
	current++;
	l->token_type = TOKEN_ARROW;
	break;
	}

	l->token_type = TOKEN_MINUS;
	break;
	case '*':
	case '/':
	case '&':
	case '%':
	case '\|':
	case '^':
	case '=': {
	int t = *current++;
	if (*current == '=') {
	current++;
	t += 256;
	}

	l->token_type = t;
	break;
	}
	case '#':
	current++;

	if (*current == '#') {
	current++;
	l->token_type = TOKEN_DOUBLE_HASH;
	break;
	}

	l->token_type = TOKEN_HASH;
	goto redo_lex;
	case '~':
	case '!':
	case '.':
	case ';':
	case ',':
	case '(':
	case ')':
	case '[':
	case ']':
	case '{':
	case '}':
	current++;
	l->token_type = *start;
	break;
	case 'A' ... 'Z':
	case 'a' ... 'z':
	case '_': case '$': {
	__builtin_prefetch(identifier_char_tbl);
	do {
	current++;
	} while (is_identifier_not_first_char(((unsigned char)current)));

	l->token_type = TOKEN_IDENTIFIER;
	break;
	}
	case '0' ... '9':
	do {
	current++;
	} while (current >= '0' && current <= '9');
	l->token_type = TOKEN_NUMBER;
	break;
	default:
	l->token_type = TOKEN_INVALID;
	abort();
	break;
	}

	l->token_start = start;
	l->token_end = current;
	l->current = current;
	}

	int main() {
	const char* apple = "int main(int argc, char* argv[]) {\n"
	"\n"
	"\tint x = 16;\n"
	"\t\n"
	"\t\tx *= 64;\n"
	"\tx %= 16;\n"
	"\t\n"
	"\treturn 0;\n"
	"}\n";

	Lexer l = { apple };
	do {
	read(&l);
	printf("%.*s\n", (int)(l.token_end - l.token_start), l.token_start);
	} while (l.token_type);

	return 0;
	}