Created
April 23, 2014 07:30
-
-
Save pyrocat101/11205673 to your computer and use it in GitHub Desktop.
Regex Parser (Python version)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import deque | |
class SyntaxError(Exception): | |
pass | |
class RegexParser(object): | |
""" | |
Alt ::= Concat ('|' Alt)* | |
| Concat | |
Concat ::= Star Star* | |
Star ::= Prime '*' | |
| Prime | |
Prime ::= <<letters>> | <<digits>> | '(' Alt ')' | |
>>> RegexParser('a').parse() | |
['<<Lit>>', 'a'] | |
>>> RegexParser('(ab)').parse() | |
['<<Concat>>', ['<<Lit>>', 'a'], ['<<Lit>>', 'b']] | |
>>> RegexParser('(ab)*').parse() | |
['<<Star>>', ['<<Concat>>', ['<<Lit>>', 'a'], ['<<Lit>>', 'b']]] | |
>>> RegexParser('(ab)*(c|d)*(e|f)').parse() | |
['<<Concat>>', ['<<Star>>', ['<<Concat>>', ['<<Lit>>', 'a'], ['<<Lit>>', 'b']]], ['<<Star>>', ['<<Alt>>', ['<<Lit>>', 'c'], ['<<Lit>>', 'd']]], ['<<Alt>>', ['<<Lit>>', 'e'], ['<<Lit>>', 'f']]] | |
""" | |
def __init__(self, input): | |
self.input = deque(input) | |
self.input.append('\0') | |
self.get_sym() | |
def get_sym(self): | |
self.sym = self.input.popleft() | |
def accept(self, s): | |
if self.sym == s: | |
self.get_sym() | |
return True | |
else: | |
return False | |
def expect(self, s): | |
if self.accept(s): | |
return True | |
else: | |
return SyntaxError | |
def alt(self): | |
A = [self.concat()] | |
while self.accept('|'): | |
A.append(self.concat()) | |
if len(A) > 1: | |
return ["<<Alt>>"] + A | |
else: | |
return A[0] | |
def concat(self): | |
C = [self.star()] | |
while self.sym.isalnum() or self.sym is '(': | |
C.append(self.star()) | |
if len(C) > 1: | |
return ["<<Concat>>"] + C | |
else: | |
return C[0] | |
def star(self): | |
P = [self.prime()] | |
if self.accept('*'): | |
return ["<<Star>>"] + P | |
else: | |
return P[0] | |
def prime(self): | |
if self.sym.isalnum(): | |
c = self.sym | |
self.get_sym() | |
return ["<<Lit>>", c] | |
else: | |
self.expect('(') | |
r = self.alt() | |
self.expect(')') | |
return r | |
def parse(self): | |
tree = self.alt() | |
if self.sym is not '\0': | |
raise SyntaxError | |
return tree | |
def expect(s, c): | |
if s[0] is not c: | |
raise SyntaxError | |
else: | |
return s[1:] | |
def alt(s): | |
A, s = concat(s) | |
A = [A] | |
while s[0] == '|': | |
a, s = concat(s[1:]) | |
A.append(a) | |
if len(A) > 1: | |
return (["<<Alt>>"] + A, s) | |
else: | |
return (A[0], s) | |
def concat(s): | |
C, s = star(s) | |
C = [C] | |
while s[0].isalnum() or s[0] is '(': | |
c, s = star(s) | |
C.append(c) | |
if len(C) > 1: | |
return (["<<Concat>>"] + C, s) | |
else: | |
return (C[0], s) | |
def star(s): | |
P, s = prime(s) | |
P = [P] | |
if s[0] == '*': | |
return (["<<Star>>"] + P, s[1:]) | |
else: | |
return (P[0], s) | |
def prime(s): | |
if s[0].isalnum(): | |
return (["<<Lit>>", s[0]], s[1:]) | |
else: | |
s = expect(s, '(') | |
r, s = alt(s) | |
s = expect(s, ')') | |
return (r, s) | |
def parse(s): | |
""" | |
>>> parse('a') | |
['<<Lit>>', 'a'] | |
>>> parse('(ab)') | |
['<<Concat>>', ['<<Lit>>', 'a'], ['<<Lit>>', 'b']] | |
>>> parse('(ab)*') | |
['<<Star>>', ['<<Concat>>', ['<<Lit>>', 'a'], ['<<Lit>>', 'b']]] | |
>>> parse('(ab)*(c|d)*(e|f)') | |
['<<Concat>>', ['<<Star>>', ['<<Concat>>', ['<<Lit>>', 'a'], ['<<Lit>>', 'b']]], ['<<Star>>', ['<<Alt>>', ['<<Lit>>', 'c'], ['<<Lit>>', 'd']]], ['<<Alt>>', ['<<Lit>>', 'e'], ['<<Lit>>', 'f']]] | |
""" | |
s = list(s) + ['\0'] | |
tree, s = alt(s) | |
if s != ['\0']: | |
raise SyntaxError | |
else: | |
return tree | |
if __name__ == '__main__': | |
import doctest | |
doctest.testmod() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
don't understand but feel sharp