Last active
December 29, 2020 13:09
-
-
Save ThiefMaster/4344896 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
class ParseError(ValueError): | |
pass | |
class Regex(object): | |
def __init__(self, tree): | |
self.tree = tree | |
def __str__(self): | |
return str(self.tree) | |
def __repr__(self): | |
return repr(self.tree) | |
def test(self, data, debug=False): | |
top = self.tree | |
if debug: | |
print ' == %r' % top | |
if not data: | |
# empty input => initial regex must accept epsilon | |
return top.accepts_epsilon | |
for char in data: | |
# create the derivates | |
top = top.derive(char) | |
if debug: | |
print '%s => %r' % (char, top) | |
if top.is_empty_grammar: | |
# abort early. it's a complete failure anyway | |
if debug: | |
print 'Empty grammar' | |
return False | |
if debug: | |
print 'Finished test loop' | |
return top.accepts_epsilon | |
class RegexObject(object): | |
accepts_epsilon = False | |
is_epsilon = False | |
is_empty_grammar = False | |
def __init__(self, content): | |
self.content = content | |
self.shallow = False | |
def __repr__(self): | |
return '%s(%r)' % (type(self).__name__, self.content) | |
def optimize(self): | |
if hasattr(self.content, 'optimize'): | |
self.content.optimize() | |
def derive(self, char): | |
raise NotImplementedError | |
class EMPTY_GRAMMAR(RegexObject): | |
accepts_epsilon = False | |
is_epsilon = False | |
is_empty_grammar = True | |
def __init__(self): | |
RegexObject.__init__(self, None) | |
def __repr__(self): | |
return 'EMPTY_GRAMMAR' | |
def derive(self, char): | |
# no chance to leave this state... | |
return EMPTY_GRAMMAR | |
EMPTY_GRAMMAR = EMPTY_GRAMMAR() | |
class EPSILON(RegexObject): | |
accepts_epsilon = True | |
is_epsilon = True | |
def __init__(self): | |
RegexObject.__init__(self, None) | |
def __repr__(self): | |
return 'EPSILON' | |
def __str__(self): | |
return '#' | |
def derive(self, char): | |
# epsilon => empty | |
return EMPTY_GRAMMAR | |
EPSILON = EPSILON() | |
class RegexList(RegexObject, list): | |
def __init__(self, *data): | |
self.content = self | |
self += data | |
def __repr__(self): | |
return '%s(%s)' % ( | |
type(self).__name__, | |
', '.join(map(repr, self)) | |
) | |
@property | |
def shallow(self): | |
return len(self) == 1 | |
def optimize(self): | |
"""Replace single-element lists with their element""" | |
self[:] = (x[0] if x.shallow else x for x in self) | |
for x in self: | |
x.optimize() | |
class Seq(RegexList): | |
def __str__(self): | |
if self.shallow: | |
return str(self[0]) | |
return '(%s)' % ''.join(map(str, self)) | |
@staticmethod | |
def _concat(prefix, suffix): | |
# helper to create a new optimized sequence of teo items | |
if prefix.is_epsilon: | |
return suffix | |
elif suffix.is_epsilon: | |
return prefix | |
if prefix.is_empty_grammar or suffix.is_empty_grammar: | |
return EMPTY_GRAMMAR | |
return Seq(prefix, suffix) | |
@property | |
def accepts_epsilon(self): | |
# it accepts epsilon if all items accept epsilon | |
return all(x.accepts_epsilon for x in self) | |
@property | |
def is_epsilon(self): | |
# it's like epsilon if it only contains epsilons | |
return all(x.is_epsilon for x in self) | |
@property | |
def is_empty_grammar(self): | |
# empty grammar if any element is the empty grammar | |
return any(x.is_empty_grammar for x in self) | |
def derive(self, char): | |
# single-element list can be replaced with the element itself | |
if self.shallow: | |
return self[0].derive(char) | |
# split in (prefix, suffix...) for ease | |
prefix = self[0] | |
suffix = Seq(*self[1:]) if len(self) > 2 else self[1] | |
res = Seq._concat(prefix.derive(char), suffix) | |
if not prefix.accepts_epsilon: | |
return res | |
else: | |
return Choice._between(res, suffix.derive(char)) | |
class Choice(RegexList): | |
def __str__(self): | |
return '(%s)' % '|'.join(map(str, self)) | |
@staticmethod | |
def _between(first, second): | |
# helper to create a new optimized choice between two items | |
if first.is_empty_grammar: | |
return second | |
elif second.is_empty_grammar: | |
return first | |
return Choice(first, second) | |
@property | |
def accepts_epsilon(self): | |
# it accepts the empty string if any option accepts it | |
return any(x.accepts_epsilon for x in self) | |
@property | |
def is_epsilon(self): | |
# it's like epsilon if any option is epsilon | |
return any(x.is_epsilon for x in self) | |
@property | |
def is_empty_grammar(self): | |
# empty grammar if the choice consists of nothing but empty grammars | |
return all(x.is_empty_grammar for x in self) | |
def derive(self, char): | |
# (a | b)\c => a\c | b\c | |
derived = (x.derive(char) for x in self) | |
# get rid of empty grammars; we don't need them in a choice | |
return Choice(*(x for x in derived if not x.is_empty_grammar)) | |
class Char(RegexObject): | |
def __str__(self): | |
return self.content | |
def __repr__(self): | |
return repr(self.content) | |
def derive(self, char): | |
return EPSILON if self.content == char else EMPTY_GRAMMAR | |
class Star(RegexObject): | |
accepts_epsilon = True | |
def __str__(self): | |
return str(self.content) + '*' | |
@property | |
def is_epsilon(self): | |
return self.content.is_epsilon | |
def derive(self, char): | |
# r* => r\c r* | |
return Seq._concat(self.content.derive(char), self) | |
def _parse(regex, _parens=False): | |
pos = 0 | |
choice = None | |
current = Seq() | |
while pos < len(regex): | |
char = regex[pos] | |
if char == '(': | |
seq, offset = _parse(regex[pos + 1:], True) | |
pos += offset | |
current.append(seq) | |
elif char == ')' and _parens: | |
pos += 1 | |
break | |
elif char == '|': | |
if choice: | |
choice.append(current) | |
else: | |
choice = Choice(current) | |
current = Seq() | |
elif char == '*': | |
current[-1] = Star(current[-1]) | |
elif char.isalnum(): | |
current.append(Char(char)) | |
else: | |
raise ParseError("Unexpected: '%s'" % char) | |
pos += 1 | |
if choice: | |
choice.append(current) | |
if any(c.is_epsilon for c in choice): | |
raise ParseError('Invalid choice') | |
return choice, pos | |
return current, pos | |
def parse(regex): | |
tree = _parse(regex)[0] | |
tree.optimize() | |
return Regex(tree) | |
if __name__ == '__main__': | |
if len(sys.argv) < 3: | |
print 'Usage: %s REGEX INPUT' % sys.argv[0] | |
sys.exit(1) | |
debug = len(sys.argv) == 4 and sys.argv[3] == '-d' | |
regex = sys.argv[1] | |
input = sys.argv[2] | |
tree = parse(regex) | |
if debug: | |
print 'Tree: %r' % tree | |
print 'Str: %s' % tree | |
print 'Orig: %s' % regex | |
print 'Trying to match with input "%s"' % input | |
print tree.test(input, debug) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment