Blog 2018/11/5
<- previous | index | next ->
Blog 2018/11/5
<- previous | index | next ->
test: parse | |
./parse | |
parse: parse.c | |
gcc -Wall -Werror -o parse parse.c | |
clean: | |
rm -f parse | |
.PHONY: clean |
$ make | |
./parse | |
test1: 1 | |
<EXPR> | |
<INT(1)> | |
test2: 1+2 | |
<EXPR> | |
<INT(1)> | |
<PLUS> | |
<INT(2)> | |
test3: 1+23+7 | |
<EXPR> | |
<INT(1)> | |
<PLUS> | |
<INT(23)> | |
<PLUS> | |
<INT(7)> | |
test4: n=1000001, elapsed time: 0.090578 seconds (11040219 tokens/sec) | |
$ cat /proc/cpuinfo | grep name | head -n1 | |
model name : Intel(R) Core(TM) i3-2350M CPU @ 2.30GHz |
$ make | |
./parse | |
test1: 1 | |
<EXPR> | |
<INT(1)> | |
test2: 1+2 | |
<EXPR> | |
<INT(1)> | |
<PLUS> | |
<INT(2)> | |
test3: 1+23+7 | |
<EXPR> | |
<INT(1)> | |
<PLUS> | |
<INT(23)> | |
<PLUS> | |
<INT(7)> | |
test4: n=1000001, elapsed time: 0.163313 seconds (6123217 tokens/sec) | |
$ cat /proc/cpuinfo | grep name | head -n1 | |
model name : Intel(R) Core(TM) i3-2350M CPU @ 2.30GHz |
// Grammar 1, using "regex"-style notation: | |
// | |
// <expr> -> <int> ( <plus> <int> )* | |
// | |
// Grammar 2, using left-recursion (not compatible with recursive descent): | |
// | |
// <expr> -> <expr> <plus> <int> | |
// | <int> | |
// | |
// Grammar 3, using right-recursion: | |
// | |
// <expr> -> <int> <expr2> | |
// <expr2> -> <plus> <int> <expr2> | |
// | <> | |
#include <stdlib.h> | |
#include <stdbool.h> | |
#include <assert.h> | |
#include <string.h> // for memcpy() | |
#include <stdio.h> // for fprintf() | |
void pindent(uint indent) { | |
for (uint i = 0; i < indent; i++) { | |
printf(" "); | |
} | |
} | |
// MARK: - token_type_t | |
enum _token_type_t { | |
TOK_INT = 0, | |
TOK_PLUS, | |
token_type_t_NUM_CASES, | |
}; | |
typedef enum _token_type_t token_type_t; | |
char* token_type_names[token_type_t_NUM_CASES] = { | |
"INT", | |
"PLUS" | |
}; | |
// MARK: - token_t | |
struct _token_t { | |
token_type_t type; | |
size_t offset; | |
size_t length; | |
}; | |
typedef struct _token_t token_t; | |
token_t* token_create(token_type_t type, size_t offset, size_t length) { | |
token_t* t = (token_t*)malloc(sizeof(token_t)); | |
assert(t != NULL); | |
t->type = type; | |
t->offset = offset; | |
t->length = length; | |
return t; | |
} | |
// MARK: - tokens_cursor_t | |
struct _tokens_cursor_t { | |
token_t** tokens; | |
size_t index; | |
size_t length; | |
}; | |
typedef struct _tokens_cursor_t tokens_cursor_t; | |
// to be run from within each tokens_cursor_t mutation function. | |
void _tokens_cursor_sanity_check(tokens_cursor_t* c) { | |
assert(c != NULL); | |
assert(c->tokens != NULL); | |
assert(!(c->index > c->length)); | |
} | |
tokens_cursor_t* tokens_cursor_create(token_t** tokens, size_t length) { | |
tokens_cursor_t* c = (tokens_cursor_t*)malloc(sizeof(tokens_cursor_t)); | |
assert(c != NULL); | |
c->tokens = tokens; | |
c->length = length; | |
c->index = 0; | |
_tokens_cursor_sanity_check(c); | |
return c; | |
} | |
bool tokens_cursor_eof(tokens_cursor_t* c) { | |
return c->index == c->length; | |
} | |
token_t* tokens_cursor_peek(tokens_cursor_t* c) { | |
if (tokens_cursor_eof(c)) { | |
return NULL; | |
} else { | |
return c->tokens[c->index]; | |
} | |
} | |
bool tokens_cursor_try_consume(tokens_cursor_t* c, token_type_t type, token_t** token_out) { | |
token_t* token = tokens_cursor_peek(c); | |
if (token == NULL || token->type != type) { | |
return false; | |
} | |
c->index += 1; | |
_tokens_cursor_sanity_check(c); | |
*token_out = token; | |
return true; | |
} | |
void tokens_cursor_save(tokens_cursor_t* c, tokens_cursor_t* savepoint) { | |
memcpy(savepoint, c, sizeof(tokens_cursor_t)); | |
} | |
void tokens_cursor_restore(tokens_cursor_t* c, tokens_cursor_t* savepoint) { | |
memcpy(c, savepoint, sizeof(tokens_cursor_t)); | |
} | |
// MARK: - parse_tree_type_t | |
enum _parse_tree_type_t { | |
PTREE_EXPR = 0, | |
PTREE_INT, | |
PTREE_PLUS, | |
parse_tree_type_t_NUM_CASES, | |
}; | |
typedef enum _parse_tree_type_t parse_tree_type_t; | |
char* parse_tree_type_names[parse_tree_type_t_NUM_CASES] = { | |
"EXPR", | |
"INT", | |
"PLUS" | |
}; | |
// MARK: - parse_tree_t | |
struct _parse_tree_list_t { | |
struct _parse_tree_t* tree; | |
struct _parse_tree_list_t* next; | |
}; | |
typedef struct _parse_tree_list_t parse_tree_list_t; | |
struct _parse_tree_t { | |
parse_tree_type_t type; | |
token_t* token; | |
struct _parse_tree_list_t* children; | |
struct _parse_tree_list_t* _tail; | |
}; | |
typedef struct _parse_tree_t parse_tree_t; | |
parse_tree_list_t* parse_tree_list_create(parse_tree_t* tree) { | |
parse_tree_list_t* child = (parse_tree_list_t*)malloc(sizeof(parse_tree_list_t)); | |
assert(child != NULL); | |
child->tree = tree; | |
child->next = NULL; | |
return child; | |
} | |
parse_tree_list_t* ptl_pool = NULL; | |
int ptl_pool_batch_size = 256; | |
int ptl_pool_count = 0; | |
int ptl_pool_next = 0; | |
parse_tree_list_t* parse_tree_list_create2(parse_tree_t* tree) { | |
if (ptl_pool_next == ptl_pool_count) { | |
ptl_pool = (parse_tree_list_t*)malloc(ptl_pool_batch_size * sizeof(parse_tree_list_t)); | |
ptl_pool_count = ptl_pool_batch_size; | |
ptl_pool_next = 0; | |
} | |
parse_tree_list_t* child = ptl_pool + ptl_pool_next; | |
ptl_pool_next++; | |
assert(child != NULL); | |
child->tree = tree; | |
child->next = NULL; | |
return child; | |
} | |
parse_tree_t* parse_tree_create(parse_tree_type_t type, token_t* token) { | |
parse_tree_t* t = (parse_tree_t*)malloc(sizeof(parse_tree_t)); | |
assert(t != NULL); | |
t->type = type; | |
t->token = token; | |
t->children = NULL; | |
return t; | |
} | |
parse_tree_t* pt_pool = NULL; | |
int pt_pool_batch_size = 256; | |
int pt_pool_count = 0; | |
int pt_pool_next = 0; | |
parse_tree_t* parse_tree_create2(parse_tree_type_t type, token_t* token) { | |
if (pt_pool_next == pt_pool_count) { | |
pt_pool = (parse_tree_t*)malloc(pt_pool_batch_size * sizeof(parse_tree_t)); | |
pt_pool_count = pt_pool_batch_size; | |
pt_pool_next = 0; | |
} | |
parse_tree_t* t = pt_pool + pt_pool_next; | |
pt_pool_next++; | |
assert(t != NULL); | |
t->type = type; | |
t->token = token; | |
t->children = NULL; | |
return t; | |
} | |
bool parse_tree_append(parse_tree_t* tree, parse_tree_type_t type, token_t* token) { | |
parse_tree_t* tree_node = parse_tree_create2(type, token); | |
parse_tree_list_t* list_node = parse_tree_list_create2(tree_node); | |
if (tree->children == NULL) { | |
tree->children = list_node; | |
} else { | |
tree->_tail->next = list_node; | |
} | |
tree->_tail = list_node; | |
return true; | |
} | |
void parse_tree_free(parse_tree_t** tree) { | |
// FIXME this needs to be reimplemented to handle pooled memory allocation | |
// parse_tree_list_t* list_node = (*tree)->children; | |
// parse_tree_list_t* next_node = NULL; | |
// while(list_node != NULL) { | |
// next_node = list_node->next; | |
// parse_tree_free(&(list_node->tree)); | |
// free(list_node); | |
// list_node = next_node; | |
// } | |
// free(*tree); | |
// *tree = NULL; | |
} | |
// MARK: - recursive-descent parser | |
bool _parse_terminal(tokens_cursor_t* c, parse_tree_t* t, token_type_t tk_type, parse_tree_type_t pt_type) { | |
token_t* token = NULL; | |
return tokens_cursor_try_consume(c, tk_type, &token) | |
&& parse_tree_append(t, pt_type, token); | |
} | |
bool _parse_plus(tokens_cursor_t* c, parse_tree_t* t) { | |
return _parse_terminal(c, t, TOK_PLUS, PTREE_PLUS); | |
} | |
bool _parse_int(tokens_cursor_t* c, parse_tree_t* t) { | |
return _parse_terminal(c, t, TOK_INT, PTREE_INT); | |
} | |
// <expr> -> <int> ( <plus> <int> )* | |
bool _parse_expr(tokens_cursor_t* c, parse_tree_t** t_out) { | |
parse_tree_t* ptree = parse_tree_create2(PTREE_EXPR, NULL); | |
if (!_parse_int(c, ptree)) { | |
return false; | |
} | |
tokens_cursor_t savepoint; | |
while (true) { | |
tokens_cursor_save(c, &savepoint); | |
if (!(_parse_plus(c, ptree) && _parse_int(c, ptree))) { | |
tokens_cursor_restore(c, &savepoint); | |
break; | |
} | |
} | |
*t_out = ptree; | |
return true; | |
} | |
bool parse(tokens_cursor_t* c, parse_tree_t** tree_out) { | |
return _parse_expr(c, tree_out); | |
} | |
// MARK: - Parse tree printing | |
void _parse_tree_print(parse_tree_t* tree, char* input, uint indent) { | |
parse_tree_list_t* children_list = NULL; | |
switch(tree->type) { | |
case PTREE_EXPR: | |
pindent(indent); | |
printf("<%s>\n", parse_tree_type_names[tree->type]); | |
children_list = tree->children; | |
while(children_list != NULL) { | |
_parse_tree_print(children_list->tree, input, indent + 1); | |
children_list = children_list->next; | |
} | |
break; | |
case PTREE_INT: | |
pindent(indent); | |
printf("<%s(", parse_tree_type_names[tree->type]); | |
char buf[128]; | |
assert(tree->token->length < sizeof(buf)); | |
snprintf(buf, tree->token->length + 1, input + tree->token->offset); | |
printf("%s", buf); | |
printf(")>\n"); | |
break; | |
case PTREE_PLUS: | |
pindent(indent); | |
printf("<%s>\n", parse_tree_type_names[tree->type]); | |
break; | |
default: | |
assert(false); | |
break; | |
} | |
} | |
void parse_tree_print(parse_tree_t* tree, char* input) { | |
_parse_tree_print(tree, input, 0); | |
} | |
// MARK: - main | |
void test1() { | |
char* input = "1"; | |
printf("\ntest1: %s\n", input); | |
token_t t1 = {TOK_INT,0,1}; | |
token_t* tokens[1] = {&t1}; | |
tokens_cursor_t cursor; | |
cursor.tokens = tokens; | |
cursor.index = 0; | |
cursor.length = 1; | |
parse_tree_t* tree = NULL; | |
assert(parse(&cursor, &tree)); | |
parse_tree_print(tree, input); | |
parse_tree_free(&tree); | |
} | |
void test2() { | |
char* input = "1+2"; | |
printf("\ntest2: %s\n", input); | |
token_t t1 = {TOK_INT, 0,1}; | |
token_t t2 = {TOK_PLUS,1,1}; | |
token_t t3 = {TOK_INT, 2,1}; | |
token_t* tokens[3] = {&t1,&t2,&t3}; | |
tokens_cursor_t cursor; | |
cursor.tokens = tokens; | |
cursor.index = 0; | |
cursor.length = 3; | |
parse_tree_t* tree = NULL; | |
assert(parse(&cursor, &tree)); | |
parse_tree_print(tree, input); | |
parse_tree_free(&tree); | |
} | |
void test3() { | |
char* input = "1+23+7"; | |
printf("\ntest3: %s\n", input); | |
token_t t1 = {TOK_INT, 0,1}; | |
token_t t2 = {TOK_PLUS,1,1}; | |
token_t t3 = {TOK_INT, 2,2}; | |
token_t t4 = {TOK_PLUS,4,1}; | |
token_t t5 = {TOK_INT, 5,1}; | |
token_t* tokens[5] = {&t1,&t2,&t3,&t4,&t5}; | |
tokens_cursor_t cursor; | |
cursor.tokens = tokens; | |
cursor.index = 0; | |
cursor.length = 5; | |
parse_tree_t* tree = NULL; | |
assert(parse(&cursor, &tree)); | |
parse_tree_print(tree, input); | |
parse_tree_free(&tree); | |
} | |
#include <sys/time.h> // for gettimeofdat() | |
void test4() { | |
uint num_tokens = 1000001; | |
token_t* tokens[num_tokens]; | |
tokens[0] = token_create(TOK_INT, 0, 1); | |
for (uint i = 1; i < num_tokens-1; i++) { | |
tokens[i] = token_create(TOK_PLUS, i, 1); | |
i++; | |
tokens[i] = token_create(TOK_INT, i, 1); | |
} | |
tokens_cursor_t cursor; | |
cursor.tokens = tokens; | |
cursor.index = 0; | |
cursor.length = num_tokens; | |
parse_tree_t* tree = NULL; | |
struct timeval then; | |
struct timeval now; | |
double elapsed; | |
gettimeofday(&then, NULL); | |
bool ret = parse(&cursor, &tree); | |
gettimeofday(&now, NULL); | |
assert(ret); | |
// thanks to https://stackoverflow.com/a/2150334/7543271 | |
elapsed = (now.tv_sec - then.tv_sec) + (now.tv_usec - then.tv_usec) / 1000000.0; | |
printf("\ntest4: n=%i, elapsed time: %f seconds (%.0f tokens/sec)\n", num_tokens, elapsed, num_tokens/elapsed); | |
parse_tree_free(&tree); | |
} | |
int main() { | |
test1(); | |
test2(); | |
test3(); | |
test4(); | |
return EXIT_SUCCESS; | |
} |