Skip to content

Instantly share code, notes, and snippets.

@davidmaamoaix
Created August 13, 2023 07:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save davidmaamoaix/ae9b3f920ed0347285fc13cb5af7757e to your computer and use it in GitHub Desktop.
Save davidmaamoaix/ae9b3f920ed0347285fc13cb5af7757e to your computer and use it in GitHub Desktop.
A mini regular expression engine written in C0.
#use <conio>
#use <string>
/*
Walmart version of a regex matcher. Only supports the following regex:
- character class (e.g. '[a-zA-Z0-9_]', '[^a-z]', '[531-]')
- choice '|'
- match any '.'
- match one or more '+'
- match any times '*'
- optional '?'
- shorthands: '\d', '\D', '\w', '\W', '\s', '\S'
(TODO: do the "match m-n occurences" thing)
After including this file, the following functions can be used:
- `re_match_string`: determines whether a given regex matches a given
input completely (i.e. no trailing input after DFA finishes)
- `re_match_length`: returns the length of the substring (starting from
the given `begin_index`) in the input that matches the given regex
Some notes:
- shorthands and escaped characters should be typed with two backslashes
like: '\\d', '\\?' (one backslash isn't valid C0 syntax for character
literals)
- special characters such as '\t', '\n' and '\r' uses one backslash (as
C0 supports them)
- all quantifiers are greedy
- everything only works on ASCII inputs
- capturing group also doesn't exist (parentheses are used for operator
precedence only)
*/
struct re_node_t;
struct re_edge_t {
bool is_epsilon;
struct re_node_t *target;
bool[] bucket_set;
};
typedef struct re_edge_t re_edge_t;
/*struct re_node_t {
int state_id;
bool is_term_state;
int n_edges;
re_edge_t[] edges;
};
typedef struct re_node_t re_node_t;*/
struct re_state_t {
bool is_term_state;
int[][] edges; // edges[ord][<number of to-states>]
int[] n_edges; // n_edges[ord] = <number of resulting states>
};
typedef struct re_state_t re_state_t;
struct re_ctx_t {
int pc;
string regex;
int n_states;
re_state_t *[] states;
bool is_test; // true if doing lookahead; do not alter ctx
};
typedef struct re_ctx_t re_ctx_t;
struct re_dfa_t {
int n_states;
re_state_t *[] states;
};
typedef struct re_dfa_t re_dfa_t;
struct re_match_ctx_t {
int pc;
string input;
int n_active;
bool[] active;
int last_term; // last index that reaches the terminal state of the DFA
};
typedef struct re_match_ctx_t re_match_ctx_t;
bool re_parse_or(re_ctx_t *ctx, int from, int to);
bool re_parse_seq(re_ctx_t *ctx, int from, int to);
bool re_parse_term(re_ctx_t *ctx, int from, int to);
bool re_parse_atom(re_ctx_t *ctx, int from, int to);
bool re_parse_class(re_ctx_t *ctx, int from, int to);
// dumb shorthand since expression in an assertion must be pure
void re_assert(bool cond) {
//@assert cond;
}
int re_new_state(re_ctx_t *ctx, bool is_term_state) {
if (ctx->is_test) return -1;
int state_id = ctx->n_states;
re_state_t *state = alloc(re_state_t);
state->is_term_state = is_term_state;
state->edges = alloc_array(int[], 129); // 128 for ASCII, 1 for epsilon
state->n_edges = alloc_array(int, 129);
// record state into id -> state mapping
ctx->states[state_id] = state;
ctx->n_states++;
return state_id;
}
void re_lazy_init_transition(re_state_t *from_state, int chr) {
if (from_state->n_edges[chr] == 0) {
from_state->edges[chr] = alloc_array(int, 5);
}
}
/* re_ctx_t *re_dummy_ctx(re_ctx_t *real_ctx) {
re_ctx_t *dummy = alloc(re_ctx_t);
dummy->n_states = 0;
dummy->regex = real_ctx->regex;
dummy->pc = real_ctx->pc;
dummy->is_dummy = true;
return dummy;
}*/
bool re_eof_ctx(re_ctx_t *ctx)
//@requires ctx->pc <= string_length(ctx->regex);
{
return ctx->pc == string_length(ctx->regex);
}
char re_peek_ctx(re_ctx_t *ctx)
//@requires !re_eof_ctx(ctx);
{
return string_charat(ctx->regex, ctx->pc);
}
bool re_is_sep_char(re_ctx_t *ctx)
//@requires ctx->pc <= string_length(ctx->regex);
{
if (re_eof_ctx(ctx)) return true;
char peek = re_peek_ctx(ctx);
return peek == ')' || peek == '|';
}
bool re_consume(char expect, re_ctx_t *ctx)
//@requires ctx->pc <= string_length(ctx->regex);
{
// explicit check for more explicit error message (thus the loose @requires)
if (re_eof_ctx(ctx)) {
println("Regex Syntax Error: unexpected end of input");
return false;
}
char peek = re_peek_ctx(ctx);
bool match = peek == expect;
if (match) ctx->pc++;
else {
print("Regex Syntax Error: unexpected character '");
printchar(peek);
print("', expecting '");
printchar(expect);
println("'");
}
return match;
}
void re_add_epsilon(re_ctx_t *ctx, int from, int to) {
if (ctx->is_test) return;
re_state_t *from_state = ctx->states[from];
re_lazy_init_transition(from_state, 128);
from_state->edges[128][from_state->n_edges[128]] = to;
from_state->n_edges[128]++;
}
bool re_consume_class_char(re_ctx_t *ctx, char *out) {
char peek = re_peek_ctx(ctx);
ctx->pc++;
if (peek == '\\') {
if (re_eof_ctx(ctx)) {
println("Regex Syntax Error: unexpected EOF at escaped char");
return false;
}
peek = re_peek_ctx(ctx);
ctx->pc++;
if (peek != ']' && peek != '-' && peek != '^' && peek != '\\') {
print("Regex Syntax Error: unexpected escaped char '");
printchar(peek);
println("' in character class, expecting ']', '-' or '^'");
return false;
}
}
*out = peek;
return true;
}
void re_char_bucket_invert(bool[] bucket)
//@requires \length(bucket) == 128;
{
for (int i = 0; i < 128; i++) bucket[i] = !bucket[i];
}
void re_char_bucket_digits(bool[] bucket)
//@requires \length(bucket) == 128;
{
for (int i = 48; i < 48 + 10; i++) bucket[i] = true;
}
void re_char_bucket_alpha(bool[] bucket)
//@requires \length(bucket) == 128;
{
for (int i = 65; i < 65 + 26; i++) bucket[i] = true;
for (int i = 97; i < 97 + 26; i++) bucket[i] = true;
}
void re_char_bucket_whitespace(bool[] bucket)
//@requires \length(bucket) == 128;
{
bucket[char_ord(' ')] = true;
bucket[char_ord('\f')] = true;
bucket[char_ord('\n')] = true;
bucket[char_ord('\r')] = true;
bucket[char_ord('\t')] = true;
bucket[char_ord('\v')] = true;
}
bool re_parse_class(re_ctx_t *ctx, int from, int to) {
re_consume('[', ctx);
if (re_eof_ctx(ctx)) {
println("Regex Syntax Error: unexpected EOF before closing ']'");
return false;
}
bool inverted = false;
if (re_peek_ctx(ctx) == '^') {
inverted = true;
ctx->pc++;
}
bool[] char_bucket = alloc_array(bool, 128);
while (!re_eof_ctx(ctx) && re_peek_ctx(ctx) != ']') {
char *peek_ptr = alloc(char);
if (!re_consume_class_char(ctx, peek_ptr)) return false;
// parse character range
if (
ctx->pc + 1 < string_length(ctx->regex) &&
re_peek_ctx(ctx) == '-' &&
string_charat(ctx->regex, ctx->pc + 1) != ']'
) {
ctx->pc++;
char *to_ptr = alloc(char);
if (!re_consume_class_char(ctx, to_ptr)) return false;
int from_char = char_ord(*peek_ptr);
int to_char = char_ord(*to_ptr);
if (to_char < from_char) {
print("Regex Syntax Error: bad character range [");
printchar(*peek_ptr);
printchar('-');
printchar(*to_ptr);
println("]");
}
for (int i = from_char; i <= to_char; i++) {
char_bucket[i] = true;
}
} else {
char_bucket[char_ord(*peek_ptr)] = true;
}
}
if (inverted) re_char_bucket_invert(char_bucket);
if (!ctx->is_test) {
re_state_t *from_state = ctx->states[from];
for (int chr = 0; chr < 128; chr++) {
if (char_bucket[chr]) {
re_lazy_init_transition(from_state, chr);
from_state->edges[chr][from_state->n_edges[chr]] = to;
from_state->n_edges[chr]++;
}
}
}
re_consume(']', ctx);
return true;
}
bool re_parse_atom(re_ctx_t *ctx, int from, int to)
//@requires !re_eof_ctx(ctx);
{
char peek = re_peek_ctx(ctx);
if (peek == '(') {
ctx->pc++;
if (!re_parse_or(ctx, from, to)) return false;
if (!re_consume(')', ctx)) return false;
} else if (peek == '[') {
if (!re_parse_class(ctx, from, to)) return false;
} else if (peek == '?' || peek == '+' || peek == '*') {
println("Regex Syntax Error: misplaced '?', '*' or '+'");
return false;
} else {
ctx->pc++;
bool[] bucket = alloc_array(bool, 128);
if (peek == '\\') {
peek = re_peek_ctx(ctx);
if (
peek != '+' && peek != '.' && peek != '*' &&
peek != '?' && peek != '|' && peek != '\\' &&
peek != '(' && peek != ')' && peek != '[' &&
peek != ']' && peek != 'd' && peek != 'D' &&
peek != 'w' && peek != 'W' && peek != 's' &&
peek != 'S'
) {
print("Regex Syntax Error: invalid escaped char '\\");
printchar(peek);
println("'");
return false;
}
if (peek == 'd') {
re_char_bucket_digits(bucket);
} else if (peek == 'D') {
re_char_bucket_digits(bucket);
re_char_bucket_invert(bucket);
} else if (peek == 'w') {
bucket[char_ord('_')] = true;
re_char_bucket_alpha(bucket);
re_char_bucket_digits(bucket);
} else if (peek == 'W') {
bucket[char_ord('_')] = true;
re_char_bucket_alpha(bucket);
re_char_bucket_digits(bucket);
re_char_bucket_invert(bucket);
} else if (peek == 's') {
re_char_bucket_whitespace(bucket);
} else if (peek == 'S') {
re_char_bucket_whitespace(bucket);
re_char_bucket_invert(bucket);
} else {
bucket[char_ord(peek)] = true;
}
ctx->pc++;
} else {
bucket[char_ord(peek)] = true;
}
if (!ctx->is_test) {
re_state_t *from_state = ctx->states[from];
for (int chr = 0; chr < 128; chr++) {
if (bucket[chr]) {
re_lazy_init_transition(from_state, chr);
from_state->edges[chr][from_state->n_edges[chr]] = to;
from_state->n_edges[chr]++;
}
}
}
}
return true;
}
// term := group | group postfix
bool re_parse_term(re_ctx_t *ctx, int from, int to)
//@requires !re_eof_ctx(ctx);
{
// in case of '+', the same edge needs to be generated twice
int pc_store = ctx->pc;
bool test_store = ctx->is_test;
ctx->is_test = true;
int dummy_from = re_new_state(ctx, false);
int dummy_to = re_new_state(ctx, false);
if (!re_parse_atom(ctx, dummy_from, dummy_to)) return false;
ctx->is_test = test_store;
if (re_eof_ctx(ctx)) {
ctx->pc = pc_store;
return re_parse_atom(ctx, from, to);
}
char peek = re_peek_ctx(ctx);
ctx->pc = pc_store;
if (peek == '?') {
re_add_epsilon(ctx, from, to);
re_assert(re_parse_atom(ctx, from, to));
ctx->pc++;
} else if (peek == '+') {
int interm = re_new_state(ctx, false);
re_assert(re_parse_atom(ctx, from, interm));
ctx->pc = pc_store;
re_assert(re_parse_atom(ctx, interm, interm));
re_add_epsilon(ctx, interm, to);
ctx->pc++;
} else if (peek == '*') {
int interm = re_new_state(ctx, false);
re_assert(re_parse_atom(ctx, interm, interm));
re_add_epsilon(ctx, from, interm);
re_add_epsilon(ctx, interm, to);
ctx->pc++;
} else {
re_assert(re_parse_atom(ctx, from, to));
}
return true;
}
// seq := term | term seq | ''
bool re_parse_seq(re_ctx_t *ctx, int from, int to)
//@requires ctx->pc <= string_length(ctx->regex);
{
if (re_is_sep_char(ctx)) {
re_add_epsilon(ctx, from, to);
return true;
}
// ugly backtracking to eliminate one unnecessary epsilon transition
int pc_store = ctx->pc;
bool test_store = ctx->is_test;
ctx->is_test = true;
int dummy_from = re_new_state(ctx, false);
int dummy_to = re_new_state(ctx, false);
if (!re_parse_term(ctx, dummy_from, dummy_to)) return false;
ctx->is_test = test_store;
if (re_is_sep_char(ctx)) {
ctx->pc = pc_store;
return re_parse_term(ctx, from, to);
} else {
ctx->pc = pc_store;
int next = re_new_state(ctx, false);
return re_parse_term(ctx, from, next) && re_parse_seq(ctx, next, to);
}
}
// or := seq | seq '|' or
bool re_parse_or(re_ctx_t *ctx, int from, int to)
//@requires ctx->pc <= string_length(ctx->regex);
{
if (!re_parse_seq(ctx, from, to)) return false;
while (!re_eof_ctx(ctx) && re_peek_ctx(ctx) == '|') {
ctx->pc++;
if (!re_parse_seq(ctx, from, to)) return false;
}
return true;
}
re_dfa_t *re_parse_regex(string regex) {
re_ctx_t *ctx = alloc(re_ctx_t);
ctx->n_states = 0;
ctx->states = alloc_array(re_state_t *, 6969);
ctx->pc = 0;
ctx->regex = regex;
int start = re_new_state(ctx, false);
int end = re_new_state(ctx, true);
if (!re_parse_or(ctx, start, end)) {
println("REGEX PARSE FAILED");
return NULL;
}
if (!re_eof_ctx(ctx)) {
print("Regex Syntax Error: parse ended at char ");
printint(ctx->pc);
printchar('\n');
println("REGEX PARSE FAILED");
return NULL;
}
re_dfa_t *dfa = alloc(re_dfa_t);
dfa->n_states = ctx->n_states;
dfa->states = ctx->states;
return dfa;
}
void re_print_char(int ord) {
if (ord == 128) {
// 128 reserved for epsilon
print("<epsilon>");
} else {
char chr = char_chr(ord);
if (chr == ' ') print("<space>");
else if (chr == '\t') print("<tab>");
else if (chr == '\n') print("<br>");
else printchar(chr);
}
}
void re_print_state(int id, re_dfa_t *dfa) {
re_state_t *state = dfa->states[id];
print("State ");
printint(id);
if (state->is_term_state) print(" (FINISH)");
println(":");
int total_edges = 0;
for (int i = 0; i < 129; i++) total_edges += state->n_edges[i];
if (total_edges == 0) {
println("\t(No out-going transitions)");
} else {
bool[] state_bucket = alloc_array(bool, dfa->n_states);
bool[][] trans_bucket = alloc_array(bool[], dfa->n_states);
for (int chr = 0; chr < 129; chr++) {
for (int e = 0; e < state->n_edges[chr]; e++) {
int to_state = state->edges[chr][e];
if (!state_bucket[to_state]) {
trans_bucket[to_state] = alloc_array(bool, 129);
}
state_bucket[to_state] = true;
trans_bucket[to_state][chr] = true;
}
}
for (int i = 0; i < dfa->n_states; i++) {
if (state_bucket[i]) {
print("\tState ");
printint(i);
print(": ");
for (int chr = 0; chr < 129; chr++) {
if (trans_bucket[i][chr]) {
re_print_char(chr);
print(" ");
}
}
println("");
}
}
}
println("");
}
void re_print_dfa(re_dfa_t *re) {
print("Printing DFA with ");
printint(re->n_states);
println(" States");
println("");
for (int i = 0; i < re->n_states; i++) re_print_state(i, re);
println("END OF DFA");
}
/*
Performs all epsilon transitions from state `id`. Returns the amount of
added active states as a result of this action.
*/
int re_prop_epsilon(int id, re_dfa_t *re, bool[] active, re_match_ctx_t *ctx) {
re_state_t *state = re->states[id];
int n_new_states = 0;
for (int i = 0; i < state->n_edges[128]; i++) {
int to = state->edges[128][i];
if (!active[to]) {
active[to] = true;
n_new_states++;
n_new_states += re_prop_epsilon(to, re, active, ctx);
if (re->states[to]->is_term_state) {
ctx->last_term = ctx->pc + 1;
}
}
}
return n_new_states;
}
/*
Matches the given regular expression object on the given input, starting at
index `begin_index`. Returns the length of the matched sequence starting
from the given position.
Example:
re = "a(b|c)d"
input = "acdef"
begin_index = 0
This returns 3 as the DFA only matches the prefix "acd".
*/
int re_run_dfa(re_dfa_t *re, string input, int begin_index) {
re_match_ctx_t *ctx = alloc(re_match_ctx_t);
ctx->active = alloc_array(bool, re->n_states);
ctx->active[0] = true;
ctx->n_active = 1;
ctx->input = input;
// `re_prop_epsilon` records a terminal state as `ctx->pc + 1`, so setting
// `ctx->pc` to -1 is a quick hack to prevent `re_prop_epsilon` from
// recording 0 + 1 = 1 during the epsilon propagation of initial states
ctx->pc = -1;
ctx->last_term = 0;
ctx->n_active += re_prop_epsilon(0, re, ctx->active, ctx);
for (
ctx->pc = 0;
ctx->pc + begin_index < string_length(ctx->input) && ctx->n_active != 0;
ctx->pc++
) {
int n_new_states = 0;
bool[] new_states = alloc_array(bool, re->n_states);
int curr = char_ord(string_charat(input, ctx->pc + begin_index));
// for all currently active states, perform its transformation
for (int i = 0; i < re->n_states; i++) {
if (ctx->active[i]) {
for (int e = 0; e < re->states[i]->n_edges[curr]; e++) {
int to_state = re->states[i]->edges[curr][e];
if (!new_states[to_state]) {
new_states[to_state] = true;
n_new_states++;
n_new_states += re_prop_epsilon(
to_state, re, new_states, ctx
);
if (re->states[to_state]->is_term_state) {
ctx->last_term = ctx->pc + 1;
}
}
}
}
}
ctx->n_active = n_new_states;
ctx->active = new_states;
}
return ctx->last_term;
}
/*
Determines whether the given input `input` matches the regular expression
`regex`.
*/
bool re_match_string(string regex, string input) {
re_dfa_t *re = re_parse_regex(regex);
if (re == NULL) return false; // regex error; see stdout for error message
return re_run_dfa(re, input, 0) == string_length(input);
}
/*
Returns the length of the substring (starting from the given `begin_index`)
in the given input that matches the given regex.
*/
int re_match_length(string regex, string input, int begin_index) {
re_dfa_t *re = re_parse_regex(regex);
if (re == NULL) return -1; // regex error; see stdout for error message
return re_run_dfa(re, input, begin_index);
}
int main() {
//@assert re_match_string("a|ac|dc", "a");
//@assert !re_match_string("a|ac|dc", "d");
//@assert re_match_string("[a-zA-Z_][a-zA-Z0-9_]*", "sCoTtY_dOg");
//@assert re_match_string("\\s*[a-zA-Z_][a-zA-Z0-9_]*", " \tsCoTtY_dOg");
//@assert !re_match_string("[a-zA-Z_][a-zA-Z0-9_]*", "1SCOTTY");
//@assert re_match_string("abcdef|(gh)+ijkl", "abcdef");
//@assert re_match_string("abcdef|(gh)+ijkl", "ghghghghghijkl");
//@assert re_match_string("(a1(cd|ef|gh))", "a1cd");
//@assert !re_match_string("abcdef|ghijkl", "abdef");
//@assert re_match_length("[a-z]+", "%&*carnegie72", 3) == 8;
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment