Apparently a Lisp tokenizer
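The file below is a single C program: it mmap(2)s the file named on the command line, splits it into OPENING_PAREN, CLOSING_PAREN and ATOM tokens (each token records only its start offset into the input), skips ";" comments to the end of the line, and prints the tokens one per line. A usage sketch follows the code.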
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <ctype.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>

/* Grow the token array in chunks of this many entries. */
#define TOKENS_CHUNK 50
/* Tokenizer state */
enum tokenizer_state {
    NOT_IN_TOKEN,
    IN_TOKEN
};

/* Tokens */
enum token_kind {
    OPENING_PAREN,
    CLOSING_PAREN,
    ATOM
};

typedef struct token {
    enum token_kind type;
    size_t offset;
} token;
/* Tokenization result */
enum tokenize_status {
    ALLOC_ERROR,
    PARSE_ERROR,
    SUCCESS
};

typedef struct tokenize_result {
    enum tokenize_status status;
    token *tokens;
    size_t length;
} tokenize_result;

/* Append a token; the caller must ensure the array has room for it. */
void append_token(tokenize_result *res, enum token_kind type, size_t offset) {
    res->tokens[res->length].type = type;
    res->tokens[res->length].offset = offset;
    res->length += 1;
}
tokenize_result tokenize(char *input) {
    /* current char index */
    size_t i = 0;
    enum tokenizer_state state = NOT_IN_TOKEN;
    tokenize_result result = {0};
    /* allocated size of the array */
    size_t length = TOKENS_CHUNK;
    /* length of the used part */
    result.length = 0;
    result.status = SUCCESS;
    result.tokens = malloc(sizeof (token) * length);
    if(result.tokens == NULL) {
        result.status = ALLOC_ERROR;
        return result;
    }
    while (input[i] != '\0') {
        if(result.length >= length) {
            length = length + TOKENS_CHUNK;
            /* realloc into a temporary so the old array is not leaked
               (and can still be freed by the caller) if it fails */
            token *grown = realloc(result.tokens, sizeof (token) * length);
            if(grown == NULL) {
                result.status = ALLOC_ERROR;
                return result;
            }
            result.tokens = grown;
        }
        switch(input[i]) {
        case '(':
            append_token(&result, OPENING_PAREN, i);
            state = NOT_IN_TOKEN;
            break;
        case ')':
            append_token(&result, CLOSING_PAREN, i);
            state = NOT_IN_TOKEN;
            break;
        case ';':
            /* a comment runs to the end of the line */
            while(input[i] != '\0' && input[i] != '\n') {
                i++;
            }
            state = NOT_IN_TOKEN;
            /* if the comment ran up to the terminator, re-check the loop
               condition instead of stepping past the NUL below */
            if(input[i] == '\0') {
                continue;
            }
            break;
        case '\t':
        case '\r':
        case '\n':
        case ' ':
            state = NOT_IN_TOKEN;
            break;
        default:
            /* any other character starts or continues an atom */
            if(state == NOT_IN_TOKEN) {
                append_token(&result, ATOM, i);
            }
            state = IN_TOKEN;
            break;
        }
        i++;
    }
    return result;
}
int main(int argc, char *argv[]) {
    if(argc != 2) {
        fprintf(stderr, "Usage: %s <file>\n", argv[0]);
        return 1;
    }

    int fd;
    char *input;
    struct stat st;

    fd = open(argv[1], O_RDONLY);
    if(fd == -1) {
        fprintf(stderr, "Could not open file \"%s\"\n", argv[1]);
        return 1;
    }
    if (fstat(fd, &st) == -1) {
        fprintf(stderr, "Could not stat file \"%s\"\n", argv[1]);
        close(fd);
        return 1;
    }

    /* Note: tokenize() expects a NUL-terminated string. The mapping is only
       NUL-terminated because mmap zero-fills the rest of the last page,
       which breaks for files whose size is an exact multiple of the page
       size (and mmap fails outright for empty files). */
    input = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (input == MAP_FAILED) {
        fprintf(stderr, "Could not read file \"%s\"\n", argv[1]);
    } else {
        printf("-- input --\n %s\n-- input --\n\n", input);
        tokenize_result res = tokenize(input);
        if(res.status == SUCCESS) {
            for(size_t i = 0; i < res.length; i++) {
                switch(res.tokens[i].type) {
                case ATOM:
                    /* an ATOM only stores its start offset; re-scan the
                       input until a delimiter ends it */
                    for(size_t j = res.tokens[i].offset;
                        input[j] != '\0' && input[j] != '(' && input[j] != ')'
                            && !isspace((unsigned char) input[j]);
                        j++) {
                        printf("%c", input[j]);
                    }
                    printf("\n");
                    break;
                default:
                    printf("%c\n", input[res.tokens[i].offset]);
                }
            }
            free(res.tokens);
        } else {
            fprintf(stderr, "An error occurred: %u\n", res.status);
            if(res.tokens != NULL) {
                free(res.tokens);
            }
        }
        munmap(input, st.st_size);
    }
    close(fd);
    return 0;
}
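A usage sketch, assuming the file is saved as tokenizer.c (the gist does not name it) and a POSIX system, since the code relies on mmap(2). The sample file and the resulting token listing are illustrative, not taken from the gist:

$ cc -o tokenizer tokenizer.c
$ printf '(define (id x) x) ; identity\n' > test.lisp
$ ./tokenizer test.lisp
-- input --
 (define (id x) x) ; identity

-- input --

(
define
(
id
x
)
x
)

Note that the tokens never copy any text: parentheses and atoms alike are printed straight from the mapped input via their recorded offsets, and the comment never produces a token at all.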