Skip to content

Instantly share code, notes, and snippets.

@ecnelises
Last active January 13, 2016 10:00
Show Gist options
  • Save ecnelises/50b318a44fc743b2182b to your computer and use it in GitHub Desktop.
Save ecnelises/50b318a44fc743b2182b to your computer and use it in GitHub Desktop.
支持基本正则运算的正则引擎
/*
* regex.c
* Qiu Chaofan, 2016/1/12
*
* Regular expression engine.
*
* Features:
* Support matching for | * ? + . ()
* No error handling is considered yet.
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdbool.h>
#define STATE_ANYCHAR -7 /* . */
#define STATE_OR -2 /* | */
#define STATE_QMARK -3 /* ? */
#define STATE_ASTERISK -4 /* * */
#define STATE_PLUS -5 /* + */
#define STATE_JOIN -6 /* ( */
/*
* Use of STATE_JOIN is for tackling parentheses.
* For example, if we parse '(a(b|c)d)', it should be like this:
* a
* |
* ---
* | |
* JOIN NULL
* | |
* d OR
* ||
* bc
*/
struct state {
int c;
struct state *lc;
struct state *rc;
};
typedef struct state * qregex_t;
qregex_t qregex_start(const char *pattern);
void qregex_end(qregex_t item);
void qregex_match_print(const char *text, qregex_t pattern);
struct state *parse_regex(const char *pattern, const char *end);
const char *match_from_tree_end(const char *text, struct state *pattern_tree);
qregex_t qregex_start(const char *pattern)
{
return parse_regex(pattern, pattern + strlen(pattern));
}
void qregex_end(qregex_t item)
{
if (item == NULL) {
return;
}
qregex_end(item->lc);
qregex_end(item->rc);
free(item);
}
void qregex_match_print(const char *text, qregex_t pattern)
{
if (pattern == NULL) {
puts(text);
}
const char *tmp = NULL;
while (*text != '\0') {
if ((tmp = match_from_tree_end(text, pattern)) == NULL) {
++text;
} else {
while (text != tmp) {
putchar(*text++);
}
putchar('\n');
}
}
}
/*
* Generate a regex tree from a regex string.
*/
struct state *parse_regex(const char *pattern, const char *end)
{
if (pattern == end) {
return NULL;
}
bool once_out = false;
int paren = 0;
struct state *current = NULL, *tmp = NULL;
const char *or_pos = NULL;
if (*pattern != '(') {
once_out = true;
}
/* Find whether there's an 'or' sign. */
for (const char *i = pattern; i < end; ++i) {
if (*i == '(') {
++paren;
} else if (*i == ')') {
--paren;
} else if (*i == '|' && paren == 0) {
or_pos = i;
once_out = true;
break;
}
if (paren == 0 && i < end - 1) {
once_out = true;
}
}
if (or_pos != NULL) {
current = malloc(sizeof(struct state));
current->c = STATE_OR;
current->lc = parse_regex(pattern, or_pos);
current->rc = parse_regex(++or_pos, end);
return current;
} else if (!once_out) {
return parse_regex(++pattern, --end);
} else {
if (*pattern == '(') {
const char *find_rparen = pattern;
paren = 0;
for (find_rparen = pattern; find_rparen < end; ++find_rparen) {
if (*find_rparen == '(') {
++paren;
} else if (*find_rparen == ')') {
if (--paren == 0) {
break;
}
}
}
tmp = parse_regex(++pattern, find_rparen);
current = malloc(sizeof(struct state));
current->c = STATE_JOIN;
current->rc = tmp;
current->lc = NULL;
pattern = ++find_rparen;
} else {
current = malloc(sizeof(struct state));
if (*pattern == '.') {
current->c = STATE_ANYCHAR;
++pattern;
} else {
current->c = *pattern++;
}
current->rc = NULL;
}
if (*pattern != '\0' && strchr("+*?", *pattern) != NULL) {
tmp = current;
tmp->lc = NULL;
current = malloc(sizeof(struct state));
if (*pattern == '+') {
current->c = STATE_PLUS;
} else if (*pattern == '*') {
current->c = STATE_ASTERISK;
} else if (*pattern == '?') {
current->c = STATE_QMARK;
}
current->lc = tmp;
current->rc = parse_regex(++pattern, end);
} else {
current->lc = parse_regex(pattern, end);
}
}
return current;
}
/*
* Return the end of matching.
* If matching failed, return null.
*/
const char *match_from_tree_end(const char *text, struct state *pattern_tree)
{
if (pattern_tree == NULL) {
return text;
}
const char *res = NULL;
switch (pattern_tree->c) {
case STATE_OR:
if ((res = match_from_tree_end(text, pattern_tree->lc)) != NULL) {
return res;
} else {
return match_from_tree_end(text, pattern_tree->rc);
}
case STATE_QMARK:
res = text;
if ((res = match_from_tree_end(res, pattern_tree->lc)) != NULL) {
text = res;
}
return match_from_tree_end(text, pattern_tree->rc);
case STATE_PLUS:
if ((text = match_from_tree_end(text, pattern_tree->lc)) == NULL) {
return NULL;
}
/* fall through */
case STATE_ASTERISK:
res = text;
while ((res = match_from_tree_end(res, pattern_tree->lc)) != NULL) {
text = res;
}
return match_from_tree_end(text, pattern_tree->rc);
case STATE_JOIN:
if ((text = match_from_tree_end(text, pattern_tree->rc)) == NULL) {
return NULL;
}
return match_from_tree_end(text, pattern_tree->lc);
case STATE_ANYCHAR:
if (*text != '\0') {
return match_from_tree_end(++text, pattern_tree->lc);
} else {
return NULL;
}
default:
if (*text == pattern_tree->c) {
return match_from_tree_end(++text, pattern_tree->lc);
} else {
return NULL;
}
}
}
int main(int argc, char *argv[])
{
qregex_t s = qregex_start("colo.r");
qregex_match_print("color twinkle colour cowl colou colorr", s);
/*
* If it runs well, it will print:
*
* colour
* colorr
*
* into the screen.
*/
qregex_end(s);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment