Skip to content

Instantly share code, notes, and snippets.

@ThiefMaster
Last active December 29, 2020 13:09
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ThiefMaster/4344896 to your computer and use it in GitHub Desktop.
Save ThiefMaster/4344896 to your computer and use it in GitHub Desktop.
import sys
class ParseError(ValueError):
pass
class Regex(object):
def __init__(self, tree):
self.tree = tree
def __str__(self):
return str(self.tree)
def __repr__(self):
return repr(self.tree)
def test(self, data, debug=False):
top = self.tree
if debug:
print ' == %r' % top
if not data:
# empty input => initial regex must accept epsilon
return top.accepts_epsilon
for char in data:
# create the derivates
top = top.derive(char)
if debug:
print '%s => %r' % (char, top)
if top.is_empty_grammar:
# abort early. it's a complete failure anyway
if debug:
print 'Empty grammar'
return False
if debug:
print 'Finished test loop'
return top.accepts_epsilon
class RegexObject(object):
accepts_epsilon = False
is_epsilon = False
is_empty_grammar = False
def __init__(self, content):
self.content = content
self.shallow = False
def __repr__(self):
return '%s(%r)' % (type(self).__name__, self.content)
def optimize(self):
if hasattr(self.content, 'optimize'):
self.content.optimize()
def derive(self, char):
raise NotImplementedError
class EMPTY_GRAMMAR(RegexObject):
accepts_epsilon = False
is_epsilon = False
is_empty_grammar = True
def __init__(self):
RegexObject.__init__(self, None)
def __repr__(self):
return 'EMPTY_GRAMMAR'
def derive(self, char):
# no chance to leave this state...
return EMPTY_GRAMMAR
EMPTY_GRAMMAR = EMPTY_GRAMMAR()
class EPSILON(RegexObject):
accepts_epsilon = True
is_epsilon = True
def __init__(self):
RegexObject.__init__(self, None)
def __repr__(self):
return 'EPSILON'
def __str__(self):
return '#'
def derive(self, char):
# epsilon => empty
return EMPTY_GRAMMAR
EPSILON = EPSILON()
class RegexList(RegexObject, list):
def __init__(self, *data):
self.content = self
self += data
def __repr__(self):
return '%s(%s)' % (
type(self).__name__,
', '.join(map(repr, self))
)
@property
def shallow(self):
return len(self) == 1
def optimize(self):
"""Replace single-element lists with their element"""
self[:] = (x[0] if x.shallow else x for x in self)
for x in self:
x.optimize()
class Seq(RegexList):
def __str__(self):
if self.shallow:
return str(self[0])
return '(%s)' % ''.join(map(str, self))
@staticmethod
def _concat(prefix, suffix):
# helper to create a new optimized sequence of teo items
if prefix.is_epsilon:
return suffix
elif suffix.is_epsilon:
return prefix
if prefix.is_empty_grammar or suffix.is_empty_grammar:
return EMPTY_GRAMMAR
return Seq(prefix, suffix)
@property
def accepts_epsilon(self):
# it accepts epsilon if all items accept epsilon
return all(x.accepts_epsilon for x in self)
@property
def is_epsilon(self):
# it's like epsilon if it only contains epsilons
return all(x.is_epsilon for x in self)
@property
def is_empty_grammar(self):
# empty grammar if any element is the empty grammar
return any(x.is_empty_grammar for x in self)
def derive(self, char):
# single-element list can be replaced with the element itself
if self.shallow:
return self[0].derive(char)
# split in (prefix, suffix...) for ease
prefix = self[0]
suffix = Seq(*self[1:]) if len(self) > 2 else self[1]
res = Seq._concat(prefix.derive(char), suffix)
if not prefix.accepts_epsilon:
return res
else:
return Choice._between(res, suffix.derive(char))
class Choice(RegexList):
def __str__(self):
return '(%s)' % '|'.join(map(str, self))
@staticmethod
def _between(first, second):
# helper to create a new optimized choice between two items
if first.is_empty_grammar:
return second
elif second.is_empty_grammar:
return first
return Choice(first, second)
@property
def accepts_epsilon(self):
# it accepts the empty string if any option accepts it
return any(x.accepts_epsilon for x in self)
@property
def is_epsilon(self):
# it's like epsilon if any option is epsilon
return any(x.is_epsilon for x in self)
@property
def is_empty_grammar(self):
# empty grammar if the choice consists of nothing but empty grammars
return all(x.is_empty_grammar for x in self)
def derive(self, char):
# (a | b)\c => a\c | b\c
derived = (x.derive(char) for x in self)
# get rid of empty grammars; we don't need them in a choice
return Choice(*(x for x in derived if not x.is_empty_grammar))
class Char(RegexObject):
def __str__(self):
return self.content
def __repr__(self):
return repr(self.content)
def derive(self, char):
return EPSILON if self.content == char else EMPTY_GRAMMAR
class Star(RegexObject):
accepts_epsilon = True
def __str__(self):
return str(self.content) + '*'
@property
def is_epsilon(self):
return self.content.is_epsilon
def derive(self, char):
# r* => r\c r*
return Seq._concat(self.content.derive(char), self)
def _parse(regex, _parens=False):
pos = 0
choice = None
current = Seq()
while pos < len(regex):
char = regex[pos]
if char == '(':
seq, offset = _parse(regex[pos + 1:], True)
pos += offset
current.append(seq)
elif char == ')' and _parens:
pos += 1
break
elif char == '|':
if choice:
choice.append(current)
else:
choice = Choice(current)
current = Seq()
elif char == '*':
current[-1] = Star(current[-1])
elif char.isalnum():
current.append(Char(char))
else:
raise ParseError("Unexpected: '%s'" % char)
pos += 1
if choice:
choice.append(current)
if any(c.is_epsilon for c in choice):
raise ParseError('Invalid choice')
return choice, pos
return current, pos
def parse(regex):
tree = _parse(regex)[0]
tree.optimize()
return Regex(tree)
if __name__ == '__main__':
if len(sys.argv) < 3:
print 'Usage: %s REGEX INPUT' % sys.argv[0]
sys.exit(1)
debug = len(sys.argv) == 4 and sys.argv[3] == '-d'
regex = sys.argv[1]
input = sys.argv[2]
tree = parse(regex)
if debug:
print 'Tree: %r' % tree
print 'Str: %s' % tree
print 'Orig: %s' % regex
print
print 'Trying to match with input "%s"' % input
print tree.test(input, debug)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment