Skip to content

Instantly share code, notes, and snippets.

@luchy0120
Last active August 21, 2019 03:54
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save luchy0120/badcc9ce807e359067b0e0b17271963b to your computer and use it in GitHub Desktop.
Save luchy0120/badcc9ce807e359067b0e0b17271963b to your computer and use it in GitHub Desktop.
NFA Thompson Construction by Russ Cox
/*
* Regular expression implementation.
* Supports only ( | ) * + ?. No escapes.
* Compiles to NFA and then simulates NFA
* using Thompson's algorithm.
*
* See also http://swtch.com/~rsc/regexp/ and
* Thompson, Ken. Regular Expression Search Algorithm,
* Communications of the ACM 11(6) (June 1968), pp. 419-422.
*
* Copyright (c) 2007 Russ Cox.
* Can be distributed under the MIT license, see bottom of file.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/*
* Convert infix regexp re to postfix notation.
* Insert . as explicit concatenation operator.
* Cheesy parser, return static buffer.
*/
char*
re2post(char *re)
{
int nalt, natom;
static char buf[8000];
char *dst;
struct {
int nalt;
int natom;
} paren[100], *p;
p = paren;
dst = buf;
nalt = 0;
natom = 0;
if(strlen(re) >= sizeof buf/2)
return NULL;
for(; *re; re++){
switch(*re){
case '(':
if(natom > 1){
--natom;
*dst++ = '.';
}
if(p >= paren+100)
return NULL;
p->nalt = nalt;
p->natom = natom;
p++;
nalt = 0;
natom = 0;
break;
case '|':
if(natom == 0)
return NULL;
while(--natom > 0)
*dst++ = '.';
nalt++;
break;
case ')':
if(p == paren)
return NULL;
if(natom == 0)
return NULL;
while(--natom > 0)
*dst++ = '.';
for(; nalt > 0; nalt--)
*dst++ = '|';
--p;
nalt = p->nalt;
natom = p->natom;
natom++;
break;
case '*':
case '+':
case '?':
if(natom == 0)
return NULL;
*dst++ = *re;
break;
default:
if(natom > 1){
--natom;
*dst++ = '.';
}
*dst++ = *re;
natom++;
break;
}
}
if(p != paren)
return NULL;
while(--natom > 0)
*dst++ = '.';
for(; nalt > 0; nalt--)
*dst++ = '|';
*dst = 0;
return buf;
}
/*
* Represents an NFA state plus zero or one or two arrows exiting.
* if c == Match, no arrows out; matching state.
* If c == Split, unlabeled arrows to out and out1 (if != NULL).
* If c < 256, labeled arrow with character c to out.
*/
enum
{
Match = 256,
Split = 257
};
typedef struct State State;
struct State
{
int c;
State *out;
State *out1;
int lastlist;
};
State matchstate = { Match }; /* matching state */
int nstate;
/* Allocate and initialize State */
State*
state(int c, State *out, State *out1)
{
State *s;
nstate++;
s = malloc(sizeof *s);
s->lastlist = 0;
s->c = c;
s->out = out;
s->out1 = out1;
return s;
}
/*
* A partially built NFA without the matching state filled in.
* Frag.start points at the start state.
* Frag.out is a list of places that need to be set to the
* next state for this fragment.
*/
typedef struct Frag Frag;
typedef union Ptrlist Ptrlist;
struct Frag
{
State *start;
Ptrlist *out;
};
/* Initialize Frag struct. */
Frag
frag(State *start, Ptrlist *out)
{
Frag n = { start, out };
return n;
}
/*
* Since the out pointers in the list are always
* uninitialized, we use the pointers themselves
* as storage for the Ptrlists.
*/
union Ptrlist
{
Ptrlist *next;
State *s;
};
/* Create singleton list containing just outp. */
Ptrlist*
list1(State **outp)
{
Ptrlist *l;
l = (Ptrlist*)outp;
l->next = NULL;
return l;
}
/* Patch the list of states at out to point to start. */
void
patch(Ptrlist *l, State *s)
{
Ptrlist *next;
for(; l; l=next){
next = l->next;
l->s = s;
}
}
/* Join the two lists l1 and l2, returning the combination. */
Ptrlist*
append(Ptrlist *l1, Ptrlist *l2)
{
Ptrlist *oldl1;
oldl1 = l1;
while(l1->next)
l1 = l1->next;
l1->next = l2;
return oldl1;
}
/*
* Convert postfix regular expression to NFA.
* Return start state.
*/
State*
post2nfa(char *postfix)
{
char *p;
Frag stack[1000], *stackp, e1, e2, e;
State *s;
// fprintf(stderr, "postfix: %s\n", postfix);
if(postfix == NULL)
return NULL;
#define push(s) *stackp++ = s
#define pop() *--stackp
stackp = stack;
for(p=postfix; *p; p++){
switch(*p){
default:
s = state(*p, NULL, NULL);
push(frag(s, list1(&s->out)));
break;
case '.': /* catenate */
e2 = pop();
e1 = pop();
patch(e1.out, e2.start);
push(frag(e1.start, e2.out));
break;
case '|': /* alternate */
e2 = pop();
e1 = pop();
s = state(Split, e1.start, e2.start);
push(frag(s, append(e1.out, e2.out)));
break;
case '?': /* zero or one */
e = pop();
s = state(Split, e.start, NULL);
push(frag(s, append(e.out, list1(&s->out1))));
break;
case '*': /* zero or more */
e = pop();
s = state(Split, e.start, NULL);
patch(e.out, s);
push(frag(s, list1(&s->out1)));
break;
case '+': /* one or more */
e = pop();
s = state(Split, e.start, NULL);
patch(e.out, s);
push(frag(e.start, list1(&s->out1)));
break;
}
}
e = pop();
if(stackp != stack)
return NULL;
patch(e.out, &matchstate);
return e.start;
#undef pop
#undef push
}
typedef struct List List;
struct List
{
State **s;
int n;
};
List l1, l2;
static int listid;
void addstate(List*, State*);
void step(List*, int, List*);
/* Compute initial state list */
List*
startlist(State *start, List *l)
{
l->n = 0;
listid++;
addstate(l, start);
return l;
}
/* Check whether state list contains a match. */
int
ismatch(List *l)
{
int i;
for(i=0; i<l->n; i++)
if(l->s[i] == &matchstate)
return 1;
return 0;
}
/* Add s to l, following unlabeled arrows. */
void
addstate(List *l, State *s)
{
if(s == NULL || s->lastlist == listid)
return;
s->lastlist = listid;
if(s->c == Split){
/* follow unlabeled arrows */
addstate(l, s->out);
addstate(l, s->out1);
return;
}
l->s[l->n++] = s;
}
/*
* Step the NFA from the states in clist
* past the character c,
* to create next NFA state set nlist.
*/
void
step(List *clist, int c, List *nlist)
{
int i;
State *s;
listid++;
nlist->n = 0;
for(i=0; i<clist->n; i++){
s = clist->s[i];
if(s->c == c)
addstate(nlist, s->out);
}
}
/* Run NFA to determine whether it matches s. */
int
match(State *start, char *s)
{
int i, c;
List *clist, *nlist, *t;
clist = startlist(start, &l1);
nlist = &l2;
for(; *s; s++){
c = *s & 0xFF;
step(clist, c, nlist);
t = clist; clist = nlist; nlist = t; /* swap clist, nlist */
}
return ismatch(clist);
}
int
main(int argc, char **argv)
{
int i;
char *post;
State *start;
if(argc < 3){
fprintf(stderr, "usage: nfa regexp string...\n");
return 1;
}
post = re2post(argv[1]);
if(post == NULL){
fprintf(stderr, "bad regexp %s\n", argv[1]);
return 1;
}
start = post2nfa(post);
if(start == NULL){
fprintf(stderr, "error in post2nfa %s\n", post);
return 1;
}
l1.s = malloc(nstate*sizeof l1.s[0]);
l2.s = malloc(nstate*sizeof l2.s[0]);
for(i=2; i<argc; i++)
if(match(start, argv[i]))
printf("%s\n", argv[i]);
return 0;
}
/*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the
* Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute,
* sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall
* be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
* KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
* WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
* PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
* OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment