@gingerBill
Last active November 5, 2017 11:57
Incomplete tokenizer for WIP language
// NOTE(bill): Requires gb.h v0.25a
// TODO(bill): Unicode support
b32 rune_is_letter(Rune r) {
	return gb_char_is_alpha(cast(char)r) || r == '_';
}

b32 rune_is_digit(Rune r) {
	return gb_is_between(r, '0', '9');
}

b32 rune_is_whitespace(Rune r) {
	switch (r) {
	case ' ':
	case '\t':
	case '\n':
	case '\r':
	case '\f':
	case '\v':
		return true;
	}
	return false;
}
enum TokenType {
	Token_Invalid,
	Token_EOF,

	Token__LiteralBegin,
	Token_Identifier,
	Token_Integer,
	Token_Float,
	Token_Rune,
	Token_String,
	Token__LiteralEnd,

	Token__OperatorBegin,
	Token_Eq,           // =
	Token_Not,          // ! (Unary Boolean)
	Token_Hash,         // #
	Token_Tilde,        // ~ // TODO(bill): Remove
	Token_Add,          // +
	Token_Sub,          // -
	Token_Mul,          // *
	Token_Quo,          // /
	Token_Mod,          // %
	Token_AddAssign,    // +=
	Token_SubAssign,    // -=
	Token_MulAssign,    // *=
	Token_QuoAssign,    // /=
	Token_ModAssign,    // %=
	Token_And,          // &
	Token_Or,           // |
	Token_Xor,          // ^
	Token_AndNot,       // &^
	Token_AndAssign,    // &=
	Token_OrAssign,     // |=
	Token_XorAssign,    // ^=
	Token_AndNotAssign, // &^=
	Token_Increment,    // ++
	Token_Decrement,    // --
	Token_CmpAnd,       // &&
	Token_CmpOr,        // ||
	Token_CmpEq,        // ==
	Token_Lt,           // <
	Token_Gt,           // >
	Token_NotEq,        // !=
	Token_LtEq,         // <=
	Token_GtEq,         // >=
	Token_OpenParen,    // (
	Token_CloseParen,   // )
	Token_OpenBracket,  // [
	Token_CloseBracket, // ]
	Token_OpenBrace,    // {
	Token_CloseBrace,   // }
	Token_Colon,        // :
	Token_Semicolon,    // ;
	Token_Period,       // .
	Token_Comma,        // ,
	Token_Ellipsis,     // ...
	Token__OperatorEnd,

	Token__KeywordBegin,
	Token_switch, // TODO(bill): Or match?
	Token_break,
	Token_continue,
	Token_fallthrough,
	Token_case,
	Token_default,
	Token_if,
	Token_else,
	Token_for,
	Token_proc,
	Token_defer,
	Token_return,
	Token_import,
	Token_struct,
	Token_union,
	Token_enum,
	Token_type,
	Token_inline,
	Token_no_inline,
	Token__KeywordEnd,

	Token_Count,
};
char const *TOKEN_STRINGS[] = {
	"Invalid",
	"EOF",
	"_LiteralBegin",
	"Identifier",
	"Integer",
	"Float",
	"Rune",
	"String",
	"_LiteralEnd",
	"_OperatorBegin",
	"=",
	"!",
	"#",
	"~",
	"+",
	"-",
	"*",
	"/",
	"%",
	"+=",
	"-=",
	"*=",
	"/=",
	"%=",
	"&",
	"|",
	"^",
	"&^",
	"&=",
	"|=",
	"^=",
	"&^=",
	"++",
	"--",
	"&&",
	"||",
	"==",
	"<",
	">",
	"!=",
	"<=",
	">=",
	"(",
	")",
	"[",
	"]",
	"{",
	"}",
	":",
	";",
	".",
	",",
	"...",
	"_OperatorEnd",
	"_KeywordBegin",
	"switch",
	"break",
	"continue",
	"fallthrough",
	"case",
	"default",
	"if",
	"else",
	"for",
	"proc",
	"defer",
	"return",
	"import",
	"struct",
	"union",
	"enum",
	"type",
	"inline",
	"no_inline",
	"_KeywordEnd",
};
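// NOTE: Sketch of a sanity check to keep TOKEN_STRINGS in sync with TokenType;
// assumes gb.h's GB_STATIC_ASSERT and gb_count_of. The table has one entry per
// TokenType value up to, but excluding, Token_Count.
GB_STATIC_ASSERT(gb_count_of(TOKEN_STRINGS) == Token_Count);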
// NOTE(bill): Text is UTF-8, which is why the fields are u8 and not char
struct Token {
	TokenType type;
	u8 *text;
	isize len;
	isize line, column;
};
#define TOK_LIT(token_) (token_).len, (token_).text
char const *token_type_to_string(TokenType type) {
	return TOKEN_STRINGS[type];
}
enum {
	LowestPrec  = 0,
	UnaryPrec   = 6,
	HighestPrec = 7,
};
int token_precedence(Token t) {
	switch (t.type) {
	case Token_CmpOr:
		return 1;
	case Token_CmpAnd:
		return 2;
	case Token_CmpEq:
	case Token_NotEq:
	case Token_Lt:
	case Token_Gt:
	case Token_LtEq:
	case Token_GtEq:
		return 3;
	case Token_Add:
	case Token_Sub:
	case Token_Or:
	case Token_Xor:
		return 4;
	case Token_Mul:
	case Token_Quo:
	case Token_Mod:
	case Token_And:
	case Token_AndNot:
		return 5;
	}
	return LowestPrec;
}
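// Example: with these precedences, "a + b * c" parses as "a + (b * c)" because
// Token_Mul (5) binds tighter than Token_Add (4), and "a == b && c == d" groups
// as "(a == b) && (c == d)" because Token_CmpEq (3) binds tighter than
// Token_CmpAnd (2).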
gb_inline b32 token_is_literal(Token t) {
	return gb_is_between(t.type, Token__LiteralBegin+1, Token__LiteralEnd-1);
}
gb_inline b32 token_is_operator(Token t) {
	return gb_is_between(t.type, Token__OperatorBegin+1, Token__OperatorEnd-1);
}
gb_inline b32 token_is_keyword(Token t) {
	return gb_is_between(t.type, Token__KeywordBegin+1, Token__KeywordEnd-1);
}
struct Tokenizer {
	char *fullpath;
	u8 *start;
	u8 *end;

	Rune curr_rune;  // Rune at `curr`
	u8 * curr;       // Position of the current rune
	u8 * read_curr;  // Position of the next byte to read
	u8 * line;       // Start of the current line
	isize line_count;
};
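// NOTE: Invariant: start <= line <= curr <= read_curr <= end. `curr` points at
// the first byte of `curr_rune` and `read_curr` just past it, so
// read_curr - curr is the byte width of the current rune.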
#define tokenizer_error(t, msg, ...) tokenizer_error_(t, __FUNCTION__, msg, ##__VA_ARGS__)
void tokenizer_error_(Tokenizer *t, char *function, char *msg, ...) {
	va_list va;
	isize column = t->read_curr - t->line + 1;
	if (column < 1)
		column = 1;

	gb_printf_err("%s()\n", function);
	gb_printf_err("%s(%td:%td) ", t->fullpath, t->line_count, column);
	va_start(va, msg);
	gb_printf_err_va(msg, va);
	va_end(va);
	gb_printf_err("\n");
	gb_exit(1);
}
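// Example: a stray '$' (ASCII 36) on line 3, column 14 would print something like
//   tokenizer_get_token()
//   path/to/file.lang(3:14) Illegal character: $ (36)
// and then terminate the process, since tokenizer_error() calls gb_exit(1).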
void advance_to_next_rune(Tokenizer *t) {
	if (t->read_curr < t->end) {
		Rune rune;
		isize width = 1;

		t->curr = t->read_curr;
		if (t->curr_rune == '\n') {
			t->line = t->curr;
			t->line_count++;
		}
		rune = *t->read_curr;
		if (rune == 0) {
			tokenizer_error(t, "Illegal character NUL");
		} else if (rune >= 0x80) { // Not ASCII
			width = gb_utf8_decode(t->read_curr, t->end-t->read_curr, &rune);
			if (rune == GB_RUNE_INVALID && width == 1)
				tokenizer_error(t, "Illegal UTF-8 encoding");
			else if (rune == GB_RUNE_BOM && t->curr-t->start > 0)
				tokenizer_error(t, "Illegal byte order mark");
		}
		t->read_curr += width;
		t->curr_rune = rune;
	} else {
		t->curr = t->end;
		if (t->curr_rune == '\n') {
			t->line = t->curr;
			t->line_count++;
		}
		t->curr_rune = GB_RUNE_EOF;
	}
}
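// Example: scanning the UTF-8 text "héllo" produces curr_rune = 'h' (width 1),
// then U+00E9 'é' (width 2, decoded via gb_utf8_decode), then 'l', 'l', 'o',
// and finally GB_RUNE_EOF once read_curr reaches end.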
b32 init_tokenizer(Tokenizer *t, char *filename) {
	gbFileContents fc = gb_file_read_contents(gb_heap_allocator(), true, filename);
	gb_zero_item(t);
	if (fc.data) {
		t->start = cast(u8 *)fc.data;
		t->line = t->read_curr = t->curr = t->start;
		t->end = t->start + fc.size;
		t->fullpath = gb_path_get_full_name(gb_heap_allocator(), filename);
		t->line_count = 1;

		advance_to_next_rune(t);
		if (t->curr_rune == GB_RUNE_BOM)
			advance_to_next_rune(t); // Ignore BOM at file beginning
		return true;
	}
	return false;
}
gb_inline void destroy_tokenizer(Tokenizer *t) {
	gb_free(gb_heap_allocator(), t->start);
	gb_free(gb_heap_allocator(), t->fullpath); // Allocated by gb_path_get_full_name
}
void tokenizer_skip_whitespace(Tokenizer *t) {
	for (;;) {
		if (rune_is_whitespace(t->curr_rune)) {
			advance_to_next_rune(t);
		} else if (t->curr_rune == '/') {
			if (t->read_curr[0] == '/') { // Line comment //
				while (t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF)
					advance_to_next_rune(t);
			} else if (t->read_curr[0] == '*') { // (Nested) Block comment /**/
				isize comment_scope = 1;
				for (;;) {
					advance_to_next_rune(t);
					if (t->curr_rune == GB_RUNE_EOF)
						tokenizer_error(t, "Block comment not terminated");
					if (t->curr_rune == '/') {
						advance_to_next_rune(t);
						if (t->curr_rune == '*') {
							advance_to_next_rune(t);
							comment_scope++;
						}
					}
					if (t->curr_rune == '*') {
						advance_to_next_rune(t);
						if (t->curr_rune == '/') {
							advance_to_next_rune(t);
							comment_scope--;
						}
					}
					if (comment_scope == 0)
						break;
				}
			} else {
				break;
			}
		} else {
			break;
		}
	}
}
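// Example: because comment_scope counts nesting, the input
//   /* outer /* inner */ still outer */ x
// is skipped in full and tokenizing resumes at `x`, whereas a C-style scanner
// would have stopped at the first "*/".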
gb_inline i32 digit_value(Rune r) {
	if (gb_char_is_digit(r))
		return r - '0';
	if (gb_is_between(r, 'a', 'f'))
		return r - 'a' + 10;
	if (gb_is_between(r, 'A', 'F'))
		return r - 'A' + 10;
	return 16; // NOTE(bill): Larger than highest possible digit value
}
gb_inline void scan_mantissa(Tokenizer *t, i32 base) {
	// NOTE(bill): Allow for underscores in numbers as a number separator
	// TODO(bill): Is this a good idea?
	while (digit_value(t->curr_rune) < base || t->curr_rune == '_')
		advance_to_next_rune(t);
}
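// Example: "1_000_000" is consumed as one run of digits; the underscores remain
// in the token text, so any later literal-to-value conversion must skip them.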
Token scan_number_to_token(Tokenizer *t, b32 seen_decimal_point) {
	Token token = {};
	u8 *start_curr = t->curr;
	if (seen_decimal_point) {
		// NOTE: The caller has already consumed the leading '.',
		// so back up one byte to include it in the token text
		start_curr--;
	}

	token.type = Token_Integer;
	token.text = start_curr;
	token.line = t->line_count;
	token.column = start_curr - t->line + 1;

	if (seen_decimal_point) {
		token.type = Token_Float;
		scan_mantissa(t, 10);
		goto exponent;
	}
	if (t->curr_rune == '0') {
		u8 *prev = t->curr;
		advance_to_next_rune(t);
		if (t->curr_rune == 'b') { // Binary
			advance_to_next_rune(t);
			scan_mantissa(t, 2);
			if (t->curr - prev <= 2)
				token.type = Token_Invalid;
		} else if (t->curr_rune == 'o') { // Octal
			advance_to_next_rune(t);
			scan_mantissa(t, 8);
			if (t->curr - prev <= 2)
				token.type = Token_Invalid;
		} else if (t->curr_rune == 'd') { // Decimal
			advance_to_next_rune(t);
			scan_mantissa(t, 10);
			if (t->curr - prev <= 2)
				token.type = Token_Invalid;
		} else if (t->curr_rune == 'x') { // Hexadecimal
			advance_to_next_rune(t);
			scan_mantissa(t, 16);
			if (t->curr - prev <= 2)
				token.type = Token_Invalid;
		} else {
			seen_decimal_point = false;
			scan_mantissa(t, 10);
			if (t->curr_rune == '.' || t->curr_rune == 'e' || t->curr_rune == 'E') {
				seen_decimal_point = true;
				goto fraction;
			}
		}
		goto end;
	}

	scan_mantissa(t, 10);

fraction:
	if (t->curr_rune == '.') {
		token.type = Token_Float;
		advance_to_next_rune(t);
		scan_mantissa(t, 10);
	}

exponent:
	if (t->curr_rune == 'e' || t->curr_rune == 'E') {
		token.type = Token_Float;
		advance_to_next_rune(t);
		if (t->curr_rune == '-' || t->curr_rune == '+')
			advance_to_next_rune(t);
		scan_mantissa(t, 10);
	}

end:
	token.len = t->curr - token.text;
	return token;
}
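// Examples of literals this routine produces:
//   123      -> Token_Integer
//   0b1010   -> Token_Integer (binary)
//   0o777    -> Token_Integer (octal)
//   0d42     -> Token_Integer (explicit decimal)
//   0x7fff   -> Token_Integer (hexadecimal)
//   0b       -> Token_Invalid (prefix with no digits after it)
//   1.5, .5  -> Token_Float (".5" arrives with seen_decimal_point = true)
//   2e10     -> Token_Float (exponent, optionally signed: 3E-2)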
// NOTE: quote is '"' for string literals and '\'' for rune literals
b32 scan_escape(Tokenizer *t, Rune quote) {
	isize len = 0;
	u32 base = 0, max = 0, x = 0;

	Rune r = t->curr_rune;
	if (r == 'a' ||
	    r == 'b' ||
	    r == 'f' ||
	    r == 'n' ||
	    r == 'r' ||
	    r == 't' ||
	    r == 'v' ||
	    r == '\\' ||
	    r == quote) {
		advance_to_next_rune(t);
		return true;
	} else if (gb_is_between(r, '0', '7')) {
		len = 3; base = 8; max = 255;
	} else if (r == 'x') {
		advance_to_next_rune(t);
		len = 2; base = 16; max = 255;
	} else if (r == 'u') {
		advance_to_next_rune(t);
		len = 4; base = 16; max = GB_RUNE_MAX;
	} else if (r == 'U') {
		advance_to_next_rune(t);
		len = 8; base = 16; max = GB_RUNE_MAX;
	} else {
		if (t->curr_rune < 0)
			tokenizer_error(t, "Escape sequence was not terminated");
		else
			tokenizer_error(t, "Unknown escape sequence");
		return false;
	}

	while (len-- > 0) {
		u32 d = cast(u32)digit_value(t->curr_rune);
		if (d >= base) {
			if (t->curr_rune < 0)
				tokenizer_error(t, "Escape sequence was not terminated");
			else
				tokenizer_error(t, "Illegal character %d in escape sequence", t->curr_rune);
			return false;
		}
		x = x*base + d;
		advance_to_next_rune(t);
	}

	return true;
}
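// Examples of escapes this routine accepts (the leading '\' is consumed by the caller):
//   \n \t \\ \' \"  -> single-character escapes (the quote must match the literal kind)
//   \101            -> 3 octal digits (max 255)
//   \x41            -> 2 hex digits   (max 255)
//   \u00e9          -> 4 hex digits   (up to GB_RUNE_MAX)
//   \U0001f600      -> 8 hex digits   (up to GB_RUNE_MAX)
// NOTE: the accumulated value `x` is never range-checked against `max` yet,
// presumably part of why this tokenizer is marked incomplete.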
gb_inline TokenType token_type_variant2(Tokenizer *t, TokenType a, TokenType b) {
	if (t->curr_rune == '=') {
		advance_to_next_rune(t);
		return b;
	}
	return a;
}

gb_inline TokenType token_type_variant3(Tokenizer *t, TokenType a, TokenType b, Rune ch_c, TokenType c) {
	if (t->curr_rune == '=') {
		advance_to_next_rune(t);
		return b;
	}
	if (t->curr_rune == ch_c) {
		advance_to_next_rune(t);
		return c;
	}
	return a;
}
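// Example: after the caller has consumed '+', token_type_variant3(t, Token_Add,
// Token_AddAssign, '+', Token_Increment) yields Token_AddAssign for "+=",
// Token_Increment for "++", and plain Token_Add otherwise.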
Token tokenizer_get_token(Tokenizer *t) {
	Token token = {};
	Rune curr_rune;

	tokenizer_skip_whitespace(t);

	token.text = t->curr;
	token.len = 1;
	token.line = t->line_count;
	token.column = t->curr - t->line + 1;

	curr_rune = t->curr_rune;
	if (rune_is_letter(curr_rune)) {
		token.type = Token_Identifier;
		while (rune_is_letter(t->curr_rune) || rune_is_digit(t->curr_rune))
			advance_to_next_rune(t);
		token.len = t->curr - token.text;
		// NOTE(bill): All keywords are longer than one character,
		// so single-character identifiers can never be keywords
		if (token.len > 1) {
		#define KWB if (0) {}
		#define KWT(keyword, token_type) else if ((gb_size_of(keyword)-1) == token.len && gb_strncmp((char *)token.text, keyword, token.len) == 0) token.type = token_type
		#define KWE else {}
			KWB
			KWT("switch",      Token_switch);
			KWT("break",       Token_break);
			KWT("continue",    Token_continue);
			KWT("fallthrough", Token_fallthrough);
			KWT("case",        Token_case);
			KWT("default",     Token_default);
			KWT("if",          Token_if);
			KWT("else",        Token_else);
			KWT("for",         Token_for);
			KWT("proc",        Token_proc);
			KWT("defer",       Token_defer);
			KWT("return",      Token_return);
			KWT("import",      Token_import);
			KWT("struct",      Token_struct);
			KWT("union",       Token_union);
			KWT("enum",        Token_enum);
			KWT("type",        Token_type);
			KWT("inline",      Token_inline);
			KWT("no_inline",   Token_no_inline);
			KWE
		#undef KWB
		#undef KWT
		#undef KWE
		}
	} else if (gb_is_between(curr_rune, '0', '9')) {
		token = scan_number_to_token(t, false);
	} else {
		advance_to_next_rune(t);
		switch (curr_rune) {
		case GB_RUNE_EOF:
			token.type = Token_EOF;
			break;

		case '"': // String Literal
			token.type = Token_String;
			for (;;) {
				Rune r = t->curr_rune;
				if (r == '\n' || r < 0) {
					tokenizer_error(t, "String literal not terminated");
					break;
				}
				advance_to_next_rune(t);
				if (r == '"')
					break;
				if (r == '\\')
					scan_escape(t, '"');
			}
			break;

		case '\'': { // Rune Literal
			b32 valid = true;
			isize len = 0;
			token.type = Token_Rune;
			for (;;) {
				Rune r = t->curr_rune;
				if (r == '\n' || r < 0) {
					if (valid)
						tokenizer_error(t, "Rune literal not terminated");
					break;
				}
				advance_to_next_rune(t);
				if (r == '\'')
					break;
				len++;
				if (r == '\\') {
					if (!scan_escape(t, '\''))
						valid = false;
				}
			}
			if (valid && len != 1)
				tokenizer_error(t, "Illegal rune literal");
		} break;

		case '.':
			token.type = Token_Period; // Default
			if (gb_is_between(t->curr_rune, '0', '9')) { // Might be a number
				token = scan_number_to_token(t, true);
			} else if (t->curr_rune == '.') { // Could be an ellipsis
				advance_to_next_rune(t);
				if (t->curr_rune == '.') {
					advance_to_next_rune(t);
					token.type = Token_Ellipsis;
				}
			}
			break;
		case '#': token.type = Token_Hash;      break;
		case '~': token.type = Token_Tilde;     break;
		case ':': token.type = Token_Colon;     break;
		case ';': token.type = Token_Semicolon; break;
		case ',': token.type = Token_Comma;     break;
		case '(': token.type = Token_OpenParen;    break;
		case ')': token.type = Token_CloseParen;   break;
		case '[': token.type = Token_OpenBracket;  break;
		case ']': token.type = Token_CloseBracket; break;
		case '{': token.type = Token_OpenBrace;    break;
		case '}': token.type = Token_CloseBrace;   break;
		case '*': token.type = token_type_variant2(t, Token_Mul, Token_MulAssign); break;
		case '/': token.type = token_type_variant2(t, Token_Quo, Token_QuoAssign); break;
		case '%': token.type = token_type_variant2(t, Token_Mod, Token_ModAssign); break;
		case '^': token.type = token_type_variant2(t, Token_Xor, Token_XorAssign); break;
		case '<': token.type = token_type_variant2(t, Token_Lt,  Token_LtEq);      break;
		case '>': token.type = token_type_variant2(t, Token_Gt,  Token_GtEq);      break;
		case '=': token.type = token_type_variant2(t, Token_Eq,  Token_CmpEq);     break;
		case '!': token.type = token_type_variant2(t, Token_Not, Token_NotEq);     break;

		case '+': token.type = token_type_variant3(t, Token_Add, Token_AddAssign, '+', Token_Increment); break;
		case '-': token.type = token_type_variant3(t, Token_Sub, Token_SubAssign, '-', Token_Decrement); break;
		case '|': token.type = token_type_variant3(t, Token_Or,  Token_OrAssign,  '|', Token_CmpOr);     break;

		case '&':
			if (t->curr_rune == '^') {
				advance_to_next_rune(t);
				token.type = token_type_variant2(t, Token_AndNot, Token_AndNotAssign);
			} else {
				token.type = token_type_variant3(t, Token_And, Token_AndAssign, '&', Token_CmpAnd);
			}
			break;

		default:
			if (curr_rune != GB_RUNE_BOM)
				tokenizer_error(t, "Illegal character: %c (%d)", cast(char)curr_rune, curr_rune);
			token.type = Token_Invalid;
			break;
		}
	}

	token.len = t->curr - token.text;
	return token;
}
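// NOTE: Minimal usage sketch showing how the pieces fit together, assuming a
// source file named "test.lang" exists; prints each token's text and type
// until EOF.
int main(void) {
	Tokenizer t = {};
	if (!init_tokenizer(&t, "test.lang"))
		return 1;
	for (;;) {
		Token token = tokenizer_get_token(&t);
		if (token.type == Token_EOF)
			break;
		// TOK_LIT expands to `token.len, token.text` for the "%.*s" format
		gb_printf("%.*s - %s\n", cast(int)TOK_LIT(token), token_type_to_string(token.type));
	}
	destroy_tokenizer(&t);
	return 0;
}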