Skip to content

Instantly share code, notes, and snippets.

@bcho
Created August 15, 2014 14:38
Show Gist options
  • Save bcho/8658b99090a777e15016 to your computer and use it in GitHub Desktop.
Save bcho/8658b99090a777e15016 to your computer and use it in GitHub Desktop.
A small nfa regex express engine written in Python.
# coding: utf-8
'''
Support operations:
- character match
- seq
- or
- repeatition
match_char = NFARE.from_char('a')
match_char.match('a') # => True
match_char.match('b') # => False
match_char.match('aa') # => False
a, b = NFARE.from_char('a'), NFARE.from_char('b')
ab = NFARE.rule_seq(a, b)
ab.match('ab') # => True
ab.match('ba') # => False
ab.match('aa') # => False
ab.match('abc') # => False
a, b = NFARE.from_char('a'), NFARE.from_char('b')
a_b = NFARE.rule_or(a, b)
a_b.match('a') # => True
a_b.match('b') # => True
a_b.match('ab') # => False
a_b.match('c') # => False
many_a = NFARE.rule_repeat(NFARE.from_char('a')))))
many_a.match('') # => True
many_a.match('a') # => True
many_a.match('a' * 100) # => True
a, b, c = NFARE.from_char('a'), NFARE.from_char('b'), NFARE.from_char('c')
ab = NFARE.rule_seq(a, b)
many_ab = NFARE.rule_repeat(ab)
many_ab_or_c = NFARE.rule_or(many_ab, c)
many_ab_or_c.match('ab') # => True
many_ab_or_c.match('c') # => True
many_ab_or_c.match('ababab') # => True
many_ab_or_c.match('ccc') # => False
many_ab_or_c.match('abc') # => False
'''
from collections import defaultdict
class NFAState(object):
'''Represents a NFA state.
:param is_final: indicates if the state is final. Defaults to False.
'''
SUPPORTED_CHARSET = list(map(chr, range(128)))
def __init__(self, is_final=False):
# State type.
self.is_final = is_final
# Transition type.
self.on_chars = defaultdict(list)
self.on_empty = []
def add_char_edge(self, char, next_state):
'''Add a char edge to next state.
:param char: edge's char.
:param next_state: next state.
'''
if not isinstance(char, str):
char = str(char)
assert char in self.SUPPORTED_CHARSET
self.on_chars[char].append(next_state)
def add_empty_edge(self, next_state):
'''Add an empty edge to next state.
:param next_state: next state.
'''
self.on_empty.append(next_state)
def match(self, something, visited_states=None):
'''Match something.
:param something: just something.
:param visited_states: visited states.
'''
if visited_states is None:
visited_states = []
# No way out.
if self in visited_states:
return False
visited_states.append(self)
# End of something, check if we reach the final state.
if not something:
# Reach the final state.
if self.is_final:
return True
# See if we can reach the end through an empty edge.
return any([i.match('', visited_states) for i in self.on_empty])
# Eat the character and match the rest.
char = something[0]
for next_state in self.on_chars[char]:
if next_state.match(something[1:]):
return True
# Not works, try to transit to other state.
for next_state in self.on_empty:
if next_state.match(something, visited_states):
return True
# We have tried our best :/
return False
class NFARE(object):
'''Regex object.
:param start: start state.
:param end: end state.
'''
def __init__(self, start, end):
# Start & end state.
self.start_state, self.end_state = start, end
def match(self, something):
return self.start_state.match(something)
@staticmethod
def from_char(c):
return NFARE.rule_char(c)
@staticmethod
def rule_char(c):
assert isinstance(c, str)
start_state = NFAState()
end_state = NFAState(is_final=True)
start_state.add_char_edge(c, end_state)
return NFARE(start_state, end_state)
@staticmethod
def rule_empty():
start_state = NFAState()
end_state = NFAState(is_final=True)
start_state.add_empty_edge(end_state)
return NFARE(start_state, end_state)
@staticmethod
def _union_two_re(first, second):
assert isinstance(first, NFARE)
assert isinstance(second, NFARE)
first.end_state.is_final = False
second.end_state.is_final = True
first.end_state.add_empty_edge(second.start_state)
return NFARE(first.start_state, second.end_state)
@staticmethod
def rule_seq(*res):
assert len(res) >= 2
re = NFARE._union_two_re(res[0], res[1])
for next_re in res[2:]:
re = NFARE._union_two_re(re, next_re)
return re
@staticmethod
def _or_two_re(choice1, choice2):
assert isinstance(choice1, NFARE)
assert isinstance(choice2, NFARE)
choice1.end_state.is_final = False
choice2.end_state.is_final = False
start_state = NFAState()
end_state = NFAState(is_final=True)
start_state.add_empty_edge(choice1.start_state)
start_state.add_empty_edge(choice2.start_state)
choice1.end_state.add_empty_edge(end_state)
choice2.end_state.add_empty_edge(end_state)
return NFARE(start_state, end_state)
@staticmethod
def rule_or(*choices):
assert len(choices) >= 2
re = NFARE._or_two_re(choices[0], choices[1])
for choice in choices[2:]:
re = NFARE._or_two_re(re, choice)
return re
@staticmethod
def rule_repeat(re):
assert isinstance(re, NFARE)
re.end_state.add_empty_edge(re.start_state)
re.start_state.add_empty_edge(re.end_state)
return re
if __name__ == '__main__':
match_char = NFARE.from_char('a')
match_char.match('a') # => True
match_char.match('b') # => False
match_char.match('aa') # => False
a, b = NFARE.from_char('a'), NFARE.from_char('b')
ab = NFARE.rule_seq(a, b)
assert ab.match('ab') # => True
assert not ab.match('ba') # => False
assert not ab.match('aa') # => False
assert not ab.match('abc') # => False
a, b = NFARE.from_char('a'), NFARE.from_char('b')
a_b = NFARE.rule_or(a, b)
assert a_b.match('a') # => True
assert a_b.match('b') # => True
assert not a_b.match('ab') # => False
assert not a_b.match('c') # => False
many_a = NFARE.rule_repeat(NFARE.rule_char('a'))
assert many_a.match('') # => True
assert many_a.match('a') # => True
assert many_a.match('a' * 100) # => True
a, b, c = NFARE.from_char('a'), NFARE.from_char('b'), NFARE.from_char('c')
ab = NFARE.rule_seq(a, b)
many_ab = NFARE.rule_repeat(ab)
many_ab_or_c = NFARE.rule_or(many_ab, c)
assert many_ab_or_c.match('ab') # => True
assert many_ab_or_c.match('c') # => True
assert many_ab_or_c.match('ababab' * 20) # => True
assert not many_ab_or_c.match('ccc') # => False
assert not many_ab_or_c.match('abc') # => False
print('All passed.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment