Skip to content

Instantly share code, notes, and snippets.

@RealNeGate
Last active November 24, 2021 04:36
Show Gist options
  • Save RealNeGate/f09124dbe0179bbac5371e3d170a6d44 to your computer and use it in GitHub Desktop.
Save RealNeGate/f09124dbe0179bbac5371e3d170a6d44 to your computer and use it in GitHub Desktop.
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <x86intrin.h>
enum {
TOKEN_ACCESSOR = '.',
TOKEN_PLUS = '+',
TOKEN_MINUS = '-',
TOKEN_TIMES = '*',
TOKEN_SLASH = '/',
TOKEN_PERCENT = '%',
TOKEN_ASSIGN = '=',
TOKEN_HASH = '#',
TOKEN_BRACKET_OPEN = '[',
TOKEN_BRACKET_CLOSE = ']',
TOKEN_PAREN_OPEN = '(',
TOKEN_PAREN_CLOSE = ')',
TOKEN_BRACE_OPEN = '{',
TOKEN_BRACE_CLOSE = '}',
TOKEN_IDENTIFIER = 256,
TOKEN_NUMBER,
TOKEN_STRING,
TOKEN_INVALID,
TOKEN_INCREMENT, /* ++ */
TOKEN_DECREMENT, /* -- */
TOKEN_ARROW, /* -> */
TOKEN_DOUBLE_HASH = '#' + 256, /* ## */
TOKEN_PLUS_EQUAL = '+' + 256, /* += */
TOKEN_MINUS_EQUAL = '-' + 256, /* -= */
TOKEN_TIMES_EQUAL = '*' + 256, /* *= */
TOKEN_SLASH_EQUAL = '/' + 256, /* /= */
TOKEN_PERCENT_EQUAL = '%' + 256, /* /= */
TOKEN_EQUALITY = '=' + 256, /* == */
};
typedef struct Lexer {
const char* current;
// current token info
int token_type;
const char* token_start;
const char* token_end;
} Lexer;
static _Alignas(64) uint8_t identifier_char_tbl[] = {
0x00,0x00,0x00,0x00,
0x10,0x00,0xff,0x03,
0xfe,0xff,0xff,0x87,
0xfe,0xff,0xff,0x07,
0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00
};
__attribute__((always_inline))
static bool is_identifier_not_first_char(char ch) {
size_t i = ch;
size_t index = i / 8;
size_t shift = i & 7;
return (identifier_char_tbl[index] >> shift) & 1;
}
static _Alignas(64) uint8_t space_char_tbl[] = {
0x00,0x26,0x00,0x00,
0x01,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00
};
__attribute__((always_inline))
static bool is_space(char ch) {
size_t i = ch;
size_t index = i / 8;
size_t shift = i & 7;
return (space_char_tbl[index] >> shift) & 1;
}
static void read(Lexer* restrict l) {
const char* current = l->current;
// branchless space skip
current += (*current == ' ');
redo_lex:;
const char* start = current;
switch (*start) {
case '\0':
l->token_type = '\0';
break;
case '\r':
current++;
// it's expected these are next to each other because
// Windows, fast path a fallthrough
if (*current != '\n') goto redo_lex;
case '\n': {
current++;
// Do a branchless SIMD skip of up to 16 tabs after a newline.
__m128i chars = _mm_loadu_si128((__m128i *)current);
int len = __builtin_ffs(~_mm_movemask_epi8(_mm_cmpeq_epi8(chars, _mm_set1_epi8('\t'))));
current += (len ? len - 1 : 0);
goto redo_lex;
}
case ' ':
case '\t':
case '\v':
// slow path
do {
current++;
} while (is_space(*current));
goto redo_lex;
case '\"':
current++;
do {
__m128i chars = _mm_loadu_si128((__m128i *)current);
int len = __builtin_ffs(_mm_movemask_epi8(_mm_cmpeq_epi8(chars, _mm_set1_epi8('\"'))));
if (len) {
current += len;
if (current[-1] == '\"' && current[-2] != '\\') break;
} else {
current += 16;
}
} while (*current);
l->token_type = TOKEN_STRING;
break;
case '+':
current++;
if (*current == '-') {
current++;
l->token_type = TOKEN_DECREMENT;
break;
} else if (*current == '=') {
current++;
l->token_type = TOKEN_PLUS_EQUAL;
break;
}
l->token_type = TOKEN_PLUS;
break;
case '-':
current++;
if (*current == '-') {
current++;
l->token_type = TOKEN_DECREMENT;
break;
} else if (*current == '>') {
current++;
l->token_type = TOKEN_ARROW;
break;
}
l->token_type = TOKEN_MINUS;
break;
case '*':
case '/':
case '&':
case '%':
case '|':
case '^':
case '=': {
int t = *current++;
if (*current == '=') {
current++;
t += 256;
}
l->token_type = t;
break;
}
case '#':
current++;
if (*current == '#') {
current++;
l->token_type = TOKEN_DOUBLE_HASH;
break;
}
l->token_type = TOKEN_HASH;
goto redo_lex;
case '~':
case '!':
case '.':
case ';':
case ',':
case '(':
case ')':
case '[':
case ']':
case '{':
case '}':
current++;
l->token_type = *start;
break;
case 'A' ... 'Z':
case 'a' ... 'z':
case '_': case '$': {
__builtin_prefetch(identifier_char_tbl);
do {
current++;
} while (is_identifier_not_first_char(*((unsigned char*)current)));
l->token_type = TOKEN_IDENTIFIER;
break;
}
case '0' ... '9':
do {
current++;
} while (*current >= '0' && *current <= '9');
l->token_type = TOKEN_NUMBER;
break;
default:
l->token_type = TOKEN_INVALID;
abort();
break;
}
l->token_start = start;
l->token_end = current;
l->current = current;
}
int main() {
const char* apple = "int main(int argc, char* argv[]) {\n"
"\n"
"\tint x = 16;\n"
"\t\n"
"\t\tx *= 64;\n"
"\tx %= 16;\n"
"\t\n"
"\treturn 0;\n"
"}\n";
Lexer l = { apple };
do {
read(&l);
printf("%.*s\n", (int)(l.token_end - l.token_start), l.token_start);
} while (l.token_type);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment