Skip to content

Instantly share code, notes, and snippets.

@cellularmitosis
Last active April 22, 2020 15:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cellularmitosis/d50785c855508a586eb8db9e5ef3e1dc to your computer and use it in GitHub Desktop.
Save cellularmitosis/d50785c855508a586eb8db9e5ef3e1dc to your computer and use it in GitHub Desktop.
Fooling around with recursive descent in C!
test: parse
./parse
parse: parse.c
gcc -Wall -Werror -o parse parse.c
clean:
rm -f parse
.PHONY: clean
$ make
./parse
test1: 1
<EXPR>
<INT(1)>
test2: 1+2
<EXPR>
<INT(1)>
<PLUS>
<INT(2)>
test3: 1+23+7
<EXPR>
<INT(1)>
<PLUS>
<INT(23)>
<PLUS>
<INT(7)>
test4: n=1000001, elapsed time: 0.090578 seconds (11040219 tokens/sec)
$ cat /proc/cpuinfo | grep name | head -n1
model name : Intel(R) Core(TM) i3-2350M CPU @ 2.30GHz
$ make
./parse
test1: 1
<EXPR>
<INT(1)>
test2: 1+2
<EXPR>
<INT(1)>
<PLUS>
<INT(2)>
test3: 1+23+7
<EXPR>
<INT(1)>
<PLUS>
<INT(23)>
<PLUS>
<INT(7)>
test4: n=1000001, elapsed time: 0.163313 seconds (6123217 tokens/sec)
$ cat /proc/cpuinfo | grep name | head -n1
model name : Intel(R) Core(TM) i3-2350M CPU @ 2.30GHz
// Grammar 1, using "regex"-style notation:
//
// <expr> -> <int> ( <plus> <int> )*
//
// Grammar 2, using left-recursion (not compatible with recursive descent):
//
// <expr> -> <expr> <plus> <int>
// | <int>
//
// Grammar 3, using right-recursion:
//
// <expr> -> <int> <expr2>
// <expr2> -> <plus> <int> <expr2>
// | <>
#include <stdlib.h>
#include <stdbool.h>
#include <assert.h>
#include <string.h> // for memcpy()
#include <stdio.h> // for fprintf()
void pindent(uint indent) {
for (uint i = 0; i < indent; i++) {
printf(" ");
}
}
// MARK: - token_type_t
enum _token_type_t {
TOK_INT = 0,
TOK_PLUS,
token_type_t_NUM_CASES,
};
typedef enum _token_type_t token_type_t;
char* token_type_names[token_type_t_NUM_CASES] = {
"INT",
"PLUS"
};
// MARK: - token_t
struct _token_t {
token_type_t type;
size_t offset;
size_t length;
};
typedef struct _token_t token_t;
token_t* token_create(token_type_t type, size_t offset, size_t length) {
token_t* t = (token_t*)malloc(sizeof(token_t));
assert(t != NULL);
t->type = type;
t->offset = offset;
t->length = length;
return t;
}
// MARK: - tokens_cursor_t
struct _tokens_cursor_t {
token_t** tokens;
size_t index;
size_t length;
};
typedef struct _tokens_cursor_t tokens_cursor_t;
// to be run from within each tokens_cursor_t mutation function.
void _tokens_cursor_sanity_check(tokens_cursor_t* c) {
assert(c != NULL);
assert(c->tokens != NULL);
assert(!(c->index > c->length));
}
tokens_cursor_t* tokens_cursor_create(token_t** tokens, size_t length) {
tokens_cursor_t* c = (tokens_cursor_t*)malloc(sizeof(tokens_cursor_t));
assert(c != NULL);
c->tokens = tokens;
c->length = length;
c->index = 0;
_tokens_cursor_sanity_check(c);
return c;
}
bool tokens_cursor_eof(tokens_cursor_t* c) {
return c->index == c->length;
}
token_t* tokens_cursor_peek(tokens_cursor_t* c) {
if (tokens_cursor_eof(c)) {
return NULL;
} else {
return c->tokens[c->index];
}
}
bool tokens_cursor_try_consume(tokens_cursor_t* c, token_type_t type, token_t** token_out) {
token_t* token = tokens_cursor_peek(c);
if (token == NULL || token->type != type) {
return false;
}
c->index += 1;
_tokens_cursor_sanity_check(c);
*token_out = token;
return true;
}
void tokens_cursor_save(tokens_cursor_t* c, tokens_cursor_t* savepoint) {
memcpy(savepoint, c, sizeof(tokens_cursor_t));
}
void tokens_cursor_restore(tokens_cursor_t* c, tokens_cursor_t* savepoint) {
memcpy(c, savepoint, sizeof(tokens_cursor_t));
}
// MARK: - parse_tree_type_t
enum _parse_tree_type_t {
PTREE_EXPR = 0,
PTREE_INT,
PTREE_PLUS,
parse_tree_type_t_NUM_CASES,
};
typedef enum _parse_tree_type_t parse_tree_type_t;
char* parse_tree_type_names[parse_tree_type_t_NUM_CASES] = {
"EXPR",
"INT",
"PLUS"
};
// MARK: - parse_tree_t
struct _parse_tree_list_t {
struct _parse_tree_t* tree;
struct _parse_tree_list_t* next;
};
typedef struct _parse_tree_list_t parse_tree_list_t;
struct _parse_tree_t {
parse_tree_type_t type;
token_t* token;
struct _parse_tree_list_t* children;
struct _parse_tree_list_t* _tail;
};
typedef struct _parse_tree_t parse_tree_t;
parse_tree_list_t* parse_tree_list_create(parse_tree_t* tree) {
parse_tree_list_t* child = (parse_tree_list_t*)malloc(sizeof(parse_tree_list_t));
assert(child != NULL);
child->tree = tree;
child->next = NULL;
return child;
}
parse_tree_list_t* ptl_pool = NULL;
int ptl_pool_batch_size = 256;
int ptl_pool_count = 0;
int ptl_pool_next = 0;
parse_tree_list_t* parse_tree_list_create2(parse_tree_t* tree) {
if (ptl_pool_next == ptl_pool_count) {
ptl_pool = (parse_tree_list_t*)malloc(ptl_pool_batch_size * sizeof(parse_tree_list_t));
ptl_pool_count = ptl_pool_batch_size;
ptl_pool_next = 0;
}
parse_tree_list_t* child = ptl_pool + ptl_pool_next;
ptl_pool_next++;
assert(child != NULL);
child->tree = tree;
child->next = NULL;
return child;
}
parse_tree_t* parse_tree_create(parse_tree_type_t type, token_t* token) {
parse_tree_t* t = (parse_tree_t*)malloc(sizeof(parse_tree_t));
assert(t != NULL);
t->type = type;
t->token = token;
t->children = NULL;
return t;
}
parse_tree_t* pt_pool = NULL;
int pt_pool_batch_size = 256;
int pt_pool_count = 0;
int pt_pool_next = 0;
parse_tree_t* parse_tree_create2(parse_tree_type_t type, token_t* token) {
if (pt_pool_next == pt_pool_count) {
pt_pool = (parse_tree_t*)malloc(pt_pool_batch_size * sizeof(parse_tree_t));
pt_pool_count = pt_pool_batch_size;
pt_pool_next = 0;
}
parse_tree_t* t = pt_pool + pt_pool_next;
pt_pool_next++;
assert(t != NULL);
t->type = type;
t->token = token;
t->children = NULL;
return t;
}
bool parse_tree_append(parse_tree_t* tree, parse_tree_type_t type, token_t* token) {
parse_tree_t* tree_node = parse_tree_create2(type, token);
parse_tree_list_t* list_node = parse_tree_list_create2(tree_node);
if (tree->children == NULL) {
tree->children = list_node;
} else {
tree->_tail->next = list_node;
}
tree->_tail = list_node;
return true;
}
void parse_tree_free(parse_tree_t** tree) {
// FIXME this needs to be reimplemented to handle pooled memory allocation
// parse_tree_list_t* list_node = (*tree)->children;
// parse_tree_list_t* next_node = NULL;
// while(list_node != NULL) {
// next_node = list_node->next;
// parse_tree_free(&(list_node->tree));
// free(list_node);
// list_node = next_node;
// }
// free(*tree);
// *tree = NULL;
}
// MARK: - recursive-descent parser
bool _parse_terminal(tokens_cursor_t* c, parse_tree_t* t, token_type_t tk_type, parse_tree_type_t pt_type) {
token_t* token = NULL;
return tokens_cursor_try_consume(c, tk_type, &token)
&& parse_tree_append(t, pt_type, token);
}
bool _parse_plus(tokens_cursor_t* c, parse_tree_t* t) {
return _parse_terminal(c, t, TOK_PLUS, PTREE_PLUS);
}
bool _parse_int(tokens_cursor_t* c, parse_tree_t* t) {
return _parse_terminal(c, t, TOK_INT, PTREE_INT);
}
// <expr> -> <int> ( <plus> <int> )*
bool _parse_expr(tokens_cursor_t* c, parse_tree_t** t_out) {
parse_tree_t* ptree = parse_tree_create2(PTREE_EXPR, NULL);
if (!_parse_int(c, ptree)) {
return false;
}
tokens_cursor_t savepoint;
while (true) {
tokens_cursor_save(c, &savepoint);
if (!(_parse_plus(c, ptree) && _parse_int(c, ptree))) {
tokens_cursor_restore(c, &savepoint);
break;
}
}
*t_out = ptree;
return true;
}
bool parse(tokens_cursor_t* c, parse_tree_t** tree_out) {
return _parse_expr(c, tree_out);
}
// MARK: - Parse tree printing
void _parse_tree_print(parse_tree_t* tree, char* input, uint indent) {
parse_tree_list_t* children_list = NULL;
switch(tree->type) {
case PTREE_EXPR:
pindent(indent);
printf("<%s>\n", parse_tree_type_names[tree->type]);
children_list = tree->children;
while(children_list != NULL) {
_parse_tree_print(children_list->tree, input, indent + 1);
children_list = children_list->next;
}
break;
case PTREE_INT:
pindent(indent);
printf("<%s(", parse_tree_type_names[tree->type]);
char buf[128];
assert(tree->token->length < sizeof(buf));
snprintf(buf, tree->token->length + 1, input + tree->token->offset);
printf("%s", buf);
printf(")>\n");
break;
case PTREE_PLUS:
pindent(indent);
printf("<%s>\n", parse_tree_type_names[tree->type]);
break;
default:
assert(false);
break;
}
}
void parse_tree_print(parse_tree_t* tree, char* input) {
_parse_tree_print(tree, input, 0);
}
// MARK: - main
void test1() {
char* input = "1";
printf("\ntest1: %s\n", input);
token_t t1 = {TOK_INT,0,1};
token_t* tokens[1] = {&t1};
tokens_cursor_t cursor;
cursor.tokens = tokens;
cursor.index = 0;
cursor.length = 1;
parse_tree_t* tree = NULL;
assert(parse(&cursor, &tree));
parse_tree_print(tree, input);
parse_tree_free(&tree);
}
void test2() {
char* input = "1+2";
printf("\ntest2: %s\n", input);
token_t t1 = {TOK_INT, 0,1};
token_t t2 = {TOK_PLUS,1,1};
token_t t3 = {TOK_INT, 2,1};
token_t* tokens[3] = {&t1,&t2,&t3};
tokens_cursor_t cursor;
cursor.tokens = tokens;
cursor.index = 0;
cursor.length = 3;
parse_tree_t* tree = NULL;
assert(parse(&cursor, &tree));
parse_tree_print(tree, input);
parse_tree_free(&tree);
}
void test3() {
char* input = "1+23+7";
printf("\ntest3: %s\n", input);
token_t t1 = {TOK_INT, 0,1};
token_t t2 = {TOK_PLUS,1,1};
token_t t3 = {TOK_INT, 2,2};
token_t t4 = {TOK_PLUS,4,1};
token_t t5 = {TOK_INT, 5,1};
token_t* tokens[5] = {&t1,&t2,&t3,&t4,&t5};
tokens_cursor_t cursor;
cursor.tokens = tokens;
cursor.index = 0;
cursor.length = 5;
parse_tree_t* tree = NULL;
assert(parse(&cursor, &tree));
parse_tree_print(tree, input);
parse_tree_free(&tree);
}
#include <sys/time.h> // for gettimeofdat()
void test4() {
uint num_tokens = 1000001;
token_t* tokens[num_tokens];
tokens[0] = token_create(TOK_INT, 0, 1);
for (uint i = 1; i < num_tokens-1; i++) {
tokens[i] = token_create(TOK_PLUS, i, 1);
i++;
tokens[i] = token_create(TOK_INT, i, 1);
}
tokens_cursor_t cursor;
cursor.tokens = tokens;
cursor.index = 0;
cursor.length = num_tokens;
parse_tree_t* tree = NULL;
struct timeval then;
struct timeval now;
double elapsed;
gettimeofday(&then, NULL);
bool ret = parse(&cursor, &tree);
gettimeofday(&now, NULL);
assert(ret);
// thanks to https://stackoverflow.com/a/2150334/7543271
elapsed = (now.tv_sec - then.tv_sec) + (now.tv_usec - then.tv_usec) / 1000000.0;
printf("\ntest4: n=%i, elapsed time: %f seconds (%.0f tokens/sec)\n", num_tokens, elapsed, num_tokens/elapsed);
parse_tree_free(&tree);
}
int main() {
test1();
test2();
test3();
test4();
return EXIT_SUCCESS;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment