Skip to content

Instantly share code, notes, and snippets.

@OliveIsAWord
Created April 21, 2023 09:32
Show Gist options
  • Save OliveIsAWord/13aab4654274b20818f1733291b69882 to your computer and use it in GitHub Desktop.
Save OliveIsAWord/13aab4654274b20818f1733291b69882 to your computer and use it in GitHub Desktop.
An approach to parser combinators in C to lex source code
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
bool is_whitespace(char c) {
return c == ' ' || c == '\n' || c == '\r' || c == '\t';
}
bool is_digit(char c) {
return '0' <= c && c <= '9';
}
bool is_ident_start(char c) {
return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || c == '_';
}
bool is_ident_continue(char c) {
return is_ident_start(c) || is_digit(c);
}
typedef enum {
// a special token that signals the current lexer has failed, but others might succeed
TOKEN_FAIL = 0,
// the string "int"
KEYWORD_INT,
// an integer literal, such as "42"
LITERAL_INTEGER,
// an identifier, such as "tribble_count"
TOKEN_IDENT,
// the end of the source code
TOKEN_EOF = -1,
// an error in the source code
TOKEN_ERROR = -2,
} TokenKind;
typedef struct {
// string of characters, not assumed to be null-terminated
const char* src;
// length of string, not including any terminating null byte
// user assumes that `src[0..len]` (exclusive) contains no null bytes
uintptr_t len;
} Span;
// assumes `src` is null-terminated
Span span_from_str(char* src) {
uintptr_t len = strlen(src);
Span span = {.src = src, .len = len};
return span;
}
// assumes `span.len >= n`
Span span_increment_by(Span span, uintptr_t n) {
Span new_span = {.src = span.src + n, .len = span.len - n};
return new_span;
}
// assumes `span` is non-empty
Span span_increment(Span span) {
return span_increment_by(span, 1);
}
// assumes `span.src` is null-terminated
Span span_skip_whitespace(Span span) {
while (is_whitespace(*span.src)) span = span_increment(span);
return span;
}
typedef struct {
TokenKind kind;
// if `kind == TOKEN_FAIL`, `span` is assumed to be uninitialized
Span span;
} Token;
Token lex_ident(Span span) {
Token fail = {.kind = TOKEN_FAIL};
if (!is_ident_start(*span.src)) return fail;
Span ident_span = {.src = span.src, .len = 0};
do {
span = span_increment(span);
ident_span.len++;
} while (is_ident_continue(*span.src));
Token ident = {.kind = TOKEN_IDENT, .span = ident_span};
return ident;
}
Token lex_integer(Span span) {
Token fail = {.kind = TOKEN_FAIL};
Span int_span = {.src = span.src, .len = 0};
if (*span.src == '-') {
int_span.len++;
span = span_increment(span);
}
if (!is_digit(*span.src)) return fail;
do {
span = span_increment(span);
int_span.len++;
} while (is_digit(*span.src));
Token integer = {.kind = LITERAL_INTEGER, .span = int_span};
return integer;
}
Token lex_keyword(Token keyword, Span span) {
Token fail = {.kind = TOKEN_FAIL};
Token ident = lex_ident(span);
if (ident.kind == TOKEN_FAIL) return fail;
if (strncmp(keyword.span.src, ident.span.src, keyword.span.len)) {
return fail;
}
ident.kind = keyword.kind;
return ident;
}
Token lex_keyword_int(Span span) {
Span keyword_int_span = span_from_str("int");
Token keyword_int = {.kind = KEYWORD_INT, .span = keyword_int_span};
return lex_keyword(keyword_int, span);
}
// Infallibly produces a TOKEN_ERROR token of length 1
Token lex_error(Span span) {
Span error_span = {.src = span.src, .len=1};
Token error_token = {.kind = TOKEN_ERROR, .span = error_span};
return error_token;
}
typedef struct {
Token token;
Span rest;
} LexerOutput;
// assumes `span` is null terminated
LexerOutput lex_next_token(Span span) {
span = span_skip_whitespace(span);
if (*span.src == '\0') {
assert(span.len == 0);
Token eof_token = {.kind = TOKEN_EOF, .span = span};
LexerOutput eof = {.token = eof_token, .rest = span};
return eof;
}
assert(span.len > 0);
// lexers assume `span.src` is non-empty and null-terminated
Token (*lexers[4])(Span) = {
lex_integer,
lex_keyword_int,
lex_ident,
lex_error,
};
// assumes at least one lexer succeeds
for (int i = 0;; i++) {
Token token = (*lexers[i])(span);
if (token.kind == TOKEN_FAIL) continue;
assert(token.span.len > 0);
Span rest = span_increment_by(span, token.span.len);
LexerOutput out = {.token = token, .rest = rest};
return out;
}
}
int main(void) {
char* src = "Hello, W0rld \n int 42";
Span span = span_from_str(src);
Token token;
do {
LexerOutput out = lex_next_token(span);
token = out.token;
span = out.rest;
printf("%i \"", token.kind);
printf("%.*s\"\n", (int)token.span.len, token.span.src);
} while (token.kind != TOKEN_EOF);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment