Skip to content

Instantly share code, notes, and snippets.

@pyrocat101
Created April 23, 2014 07:30
Show Gist options
  • Save pyrocat101/11205673 to your computer and use it in GitHub Desktop.
Save pyrocat101/11205673 to your computer and use it in GitHub Desktop.
Regex Parser (Python version)
from collections import deque
class SyntaxError(Exception):
pass
class RegexParser(object):
"""
Alt ::= Concat ('|' Alt)*
| Concat
Concat ::= Star Star*
Star ::= Prime '*'
| Prime
Prime ::= <<letters>> | <<digits>> | '(' Alt ')'
>>> RegexParser('a').parse()
['<<Lit>>', 'a']
>>> RegexParser('(ab)').parse()
['<<Concat>>', ['<<Lit>>', 'a'], ['<<Lit>>', 'b']]
>>> RegexParser('(ab)*').parse()
['<<Star>>', ['<<Concat>>', ['<<Lit>>', 'a'], ['<<Lit>>', 'b']]]
>>> RegexParser('(ab)*(c|d)*(e|f)').parse()
['<<Concat>>', ['<<Star>>', ['<<Concat>>', ['<<Lit>>', 'a'], ['<<Lit>>', 'b']]], ['<<Star>>', ['<<Alt>>', ['<<Lit>>', 'c'], ['<<Lit>>', 'd']]], ['<<Alt>>', ['<<Lit>>', 'e'], ['<<Lit>>', 'f']]]
"""
def __init__(self, input):
self.input = deque(input)
self.input.append('\0')
self.get_sym()
def get_sym(self):
self.sym = self.input.popleft()
def accept(self, s):
if self.sym == s:
self.get_sym()
return True
else:
return False
def expect(self, s):
if self.accept(s):
return True
else:
return SyntaxError
def alt(self):
A = [self.concat()]
while self.accept('|'):
A.append(self.concat())
if len(A) > 1:
return ["<<Alt>>"] + A
else:
return A[0]
def concat(self):
C = [self.star()]
while self.sym.isalnum() or self.sym is '(':
C.append(self.star())
if len(C) > 1:
return ["<<Concat>>"] + C
else:
return C[0]
def star(self):
P = [self.prime()]
if self.accept('*'):
return ["<<Star>>"] + P
else:
return P[0]
def prime(self):
if self.sym.isalnum():
c = self.sym
self.get_sym()
return ["<<Lit>>", c]
else:
self.expect('(')
r = self.alt()
self.expect(')')
return r
def parse(self):
tree = self.alt()
if self.sym is not '\0':
raise SyntaxError
return tree
def expect(s, c):
if s[0] is not c:
raise SyntaxError
else:
return s[1:]
def alt(s):
A, s = concat(s)
A = [A]
while s[0] == '|':
a, s = concat(s[1:])
A.append(a)
if len(A) > 1:
return (["<<Alt>>"] + A, s)
else:
return (A[0], s)
def concat(s):
C, s = star(s)
C = [C]
while s[0].isalnum() or s[0] is '(':
c, s = star(s)
C.append(c)
if len(C) > 1:
return (["<<Concat>>"] + C, s)
else:
return (C[0], s)
def star(s):
P, s = prime(s)
P = [P]
if s[0] == '*':
return (["<<Star>>"] + P, s[1:])
else:
return (P[0], s)
def prime(s):
if s[0].isalnum():
return (["<<Lit>>", s[0]], s[1:])
else:
s = expect(s, '(')
r, s = alt(s)
s = expect(s, ')')
return (r, s)
def parse(s):
"""
>>> parse('a')
['<<Lit>>', 'a']
>>> parse('(ab)')
['<<Concat>>', ['<<Lit>>', 'a'], ['<<Lit>>', 'b']]
>>> parse('(ab)*')
['<<Star>>', ['<<Concat>>', ['<<Lit>>', 'a'], ['<<Lit>>', 'b']]]
>>> parse('(ab)*(c|d)*(e|f)')
['<<Concat>>', ['<<Star>>', ['<<Concat>>', ['<<Lit>>', 'a'], ['<<Lit>>', 'b']]], ['<<Star>>', ['<<Alt>>', ['<<Lit>>', 'c'], ['<<Lit>>', 'd']]], ['<<Alt>>', ['<<Lit>>', 'e'], ['<<Lit>>', 'f']]]
"""
s = list(s) + ['\0']
tree, s = alt(s)
if s != ['\0']:
raise SyntaxError
else:
return tree
if __name__ == '__main__':
import doctest
doctest.testmod()
@wong2
Copy link

wong2 commented Apr 23, 2014

don't understand but feel sharp

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment