Created
April 21, 2023 09:32
-
-
Save OliveIsAWord/13aab4654274b20818f1733291b69882 to your computer and use it in GitHub Desktop.
An approach to parser combinators in C to lex source code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h> | |
#include <stdbool.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <string.h> | |
bool is_whitespace(char c) { | |
return c == ' ' || c == '\n' || c == '\r' || c == '\t'; | |
} | |
bool is_digit(char c) { | |
return '0' <= c && c <= '9'; | |
} | |
bool is_ident_start(char c) { | |
return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || c == '_'; | |
} | |
bool is_ident_continue(char c) { | |
return is_ident_start(c) || is_digit(c); | |
} | |
typedef enum { | |
// a special token that signals the current lexer has failed, but others might succeed | |
TOKEN_FAIL = 0, | |
// the string "int" | |
KEYWORD_INT, | |
// an integer literal, such as "42" | |
LITERAL_INTEGER, | |
// an identifier, such as "tribble_count" | |
TOKEN_IDENT, | |
// the end of the source code | |
TOKEN_EOF = -1, | |
// an error in the source code | |
TOKEN_ERROR = -2, | |
} TokenKind; | |
typedef struct { | |
// string of characters, not assumed to be null-terminated | |
const char* src; | |
// length of string, not including any terminating null byte | |
// user assumes that `src[0..len]` (exclusive) contains no null bytes | |
uintptr_t len; | |
} Span; | |
// assumes `src` is null-terminated | |
Span span_from_str(char* src) { | |
uintptr_t len = strlen(src); | |
Span span = {.src = src, .len = len}; | |
return span; | |
} | |
// assumes `span.len >= n` | |
Span span_increment_by(Span span, uintptr_t n) { | |
Span new_span = {.src = span.src + n, .len = span.len - n}; | |
return new_span; | |
} | |
// assumes `span` is non-empty | |
Span span_increment(Span span) { | |
return span_increment_by(span, 1); | |
} | |
// assumes `span.src` is null-terminated | |
Span span_skip_whitespace(Span span) { | |
while (is_whitespace(*span.src)) span = span_increment(span); | |
return span; | |
} | |
typedef struct { | |
TokenKind kind; | |
// if `kind == TOKEN_FAIL`, `span` is assumed to be uninitialized | |
Span span; | |
} Token; | |
Token lex_ident(Span span) { | |
Token fail = {.kind = TOKEN_FAIL}; | |
if (!is_ident_start(*span.src)) return fail; | |
Span ident_span = {.src = span.src, .len = 0}; | |
do { | |
span = span_increment(span); | |
ident_span.len++; | |
} while (is_ident_continue(*span.src)); | |
Token ident = {.kind = TOKEN_IDENT, .span = ident_span}; | |
return ident; | |
} | |
Token lex_integer(Span span) { | |
Token fail = {.kind = TOKEN_FAIL}; | |
Span int_span = {.src = span.src, .len = 0}; | |
if (*span.src == '-') { | |
int_span.len++; | |
span = span_increment(span); | |
} | |
if (!is_digit(*span.src)) return fail; | |
do { | |
span = span_increment(span); | |
int_span.len++; | |
} while (is_digit(*span.src)); | |
Token integer = {.kind = LITERAL_INTEGER, .span = int_span}; | |
return integer; | |
} | |
Token lex_keyword(Token keyword, Span span) { | |
Token fail = {.kind = TOKEN_FAIL}; | |
Token ident = lex_ident(span); | |
if (ident.kind == TOKEN_FAIL) return fail; | |
if (strncmp(keyword.span.src, ident.span.src, keyword.span.len)) { | |
return fail; | |
} | |
ident.kind = keyword.kind; | |
return ident; | |
} | |
Token lex_keyword_int(Span span) { | |
Span keyword_int_span = span_from_str("int"); | |
Token keyword_int = {.kind = KEYWORD_INT, .span = keyword_int_span}; | |
return lex_keyword(keyword_int, span); | |
} | |
// Infallibly produces a TOKEN_ERROR token of length 1 | |
Token lex_error(Span span) { | |
Span error_span = {.src = span.src, .len=1}; | |
Token error_token = {.kind = TOKEN_ERROR, .span = error_span}; | |
return error_token; | |
} | |
typedef struct { | |
Token token; | |
Span rest; | |
} LexerOutput; | |
// assumes `span` is null terminated | |
LexerOutput lex_next_token(Span span) { | |
span = span_skip_whitespace(span); | |
if (*span.src == '\0') { | |
assert(span.len == 0); | |
Token eof_token = {.kind = TOKEN_EOF, .span = span}; | |
LexerOutput eof = {.token = eof_token, .rest = span}; | |
return eof; | |
} | |
assert(span.len > 0); | |
// lexers assume `span.src` is non-empty and null-terminated | |
Token (*lexers[4])(Span) = { | |
lex_integer, | |
lex_keyword_int, | |
lex_ident, | |
lex_error, | |
}; | |
// assumes at least one lexer succeeds | |
for (int i = 0;; i++) { | |
Token token = (*lexers[i])(span); | |
if (token.kind == TOKEN_FAIL) continue; | |
assert(token.span.len > 0); | |
Span rest = span_increment_by(span, token.span.len); | |
LexerOutput out = {.token = token, .rest = rest}; | |
return out; | |
} | |
} | |
int main(void) { | |
char* src = "Hello, W0rld \n int 42"; | |
Span span = span_from_str(src); | |
Token token; | |
do { | |
LexerOutput out = lex_next_token(span); | |
token = out.token; | |
span = out.rest; | |
printf("%i \"", token.kind); | |
printf("%.*s\"\n", (int)token.span.len, token.span.src); | |
} while (token.kind != TOKEN_EOF); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment