Created
August 15, 2014 14:38
-
-
Save bcho/8658b99090a777e15016 to your computer and use it in GitHub Desktop.
A small nfa regex express engine written in Python.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
''' | |
Support operations: | |
- character match | |
- seq | |
- or | |
- repeatition | |
match_char = NFARE.from_char('a') | |
match_char.match('a') # => True | |
match_char.match('b') # => False | |
match_char.match('aa') # => False | |
a, b = NFARE.from_char('a'), NFARE.from_char('b') | |
ab = NFARE.rule_seq(a, b) | |
ab.match('ab') # => True | |
ab.match('ba') # => False | |
ab.match('aa') # => False | |
ab.match('abc') # => False | |
a, b = NFARE.from_char('a'), NFARE.from_char('b') | |
a_b = NFARE.rule_or(a, b) | |
a_b.match('a') # => True | |
a_b.match('b') # => True | |
a_b.match('ab') # => False | |
a_b.match('c') # => False | |
many_a = NFARE.rule_repeat(NFARE.from_char('a'))))) | |
many_a.match('') # => True | |
many_a.match('a') # => True | |
many_a.match('a' * 100) # => True | |
a, b, c = NFARE.from_char('a'), NFARE.from_char('b'), NFARE.from_char('c') | |
ab = NFARE.rule_seq(a, b) | |
many_ab = NFARE.rule_repeat(ab) | |
many_ab_or_c = NFARE.rule_or(many_ab, c) | |
many_ab_or_c.match('ab') # => True | |
many_ab_or_c.match('c') # => True | |
many_ab_or_c.match('ababab') # => True | |
many_ab_or_c.match('ccc') # => False | |
many_ab_or_c.match('abc') # => False | |
''' | |
from collections import defaultdict | |
class NFAState(object): | |
'''Represents a NFA state. | |
:param is_final: indicates if the state is final. Defaults to False. | |
''' | |
SUPPORTED_CHARSET = list(map(chr, range(128))) | |
def __init__(self, is_final=False): | |
# State type. | |
self.is_final = is_final | |
# Transition type. | |
self.on_chars = defaultdict(list) | |
self.on_empty = [] | |
def add_char_edge(self, char, next_state): | |
'''Add a char edge to next state. | |
:param char: edge's char. | |
:param next_state: next state. | |
''' | |
if not isinstance(char, str): | |
char = str(char) | |
assert char in self.SUPPORTED_CHARSET | |
self.on_chars[char].append(next_state) | |
def add_empty_edge(self, next_state): | |
'''Add an empty edge to next state. | |
:param next_state: next state. | |
''' | |
self.on_empty.append(next_state) | |
def match(self, something, visited_states=None): | |
'''Match something. | |
:param something: just something. | |
:param visited_states: visited states. | |
''' | |
if visited_states is None: | |
visited_states = [] | |
# No way out. | |
if self in visited_states: | |
return False | |
visited_states.append(self) | |
# End of something, check if we reach the final state. | |
if not something: | |
# Reach the final state. | |
if self.is_final: | |
return True | |
# See if we can reach the end through an empty edge. | |
return any([i.match('', visited_states) for i in self.on_empty]) | |
# Eat the character and match the rest. | |
char = something[0] | |
for next_state in self.on_chars[char]: | |
if next_state.match(something[1:]): | |
return True | |
# Not works, try to transit to other state. | |
for next_state in self.on_empty: | |
if next_state.match(something, visited_states): | |
return True | |
# We have tried our best :/ | |
return False | |
class NFARE(object): | |
'''Regex object. | |
:param start: start state. | |
:param end: end state. | |
''' | |
def __init__(self, start, end): | |
# Start & end state. | |
self.start_state, self.end_state = start, end | |
def match(self, something): | |
return self.start_state.match(something) | |
@staticmethod | |
def from_char(c): | |
return NFARE.rule_char(c) | |
@staticmethod | |
def rule_char(c): | |
assert isinstance(c, str) | |
start_state = NFAState() | |
end_state = NFAState(is_final=True) | |
start_state.add_char_edge(c, end_state) | |
return NFARE(start_state, end_state) | |
@staticmethod | |
def rule_empty(): | |
start_state = NFAState() | |
end_state = NFAState(is_final=True) | |
start_state.add_empty_edge(end_state) | |
return NFARE(start_state, end_state) | |
@staticmethod | |
def _union_two_re(first, second): | |
assert isinstance(first, NFARE) | |
assert isinstance(second, NFARE) | |
first.end_state.is_final = False | |
second.end_state.is_final = True | |
first.end_state.add_empty_edge(second.start_state) | |
return NFARE(first.start_state, second.end_state) | |
@staticmethod | |
def rule_seq(*res): | |
assert len(res) >= 2 | |
re = NFARE._union_two_re(res[0], res[1]) | |
for next_re in res[2:]: | |
re = NFARE._union_two_re(re, next_re) | |
return re | |
@staticmethod | |
def _or_two_re(choice1, choice2): | |
assert isinstance(choice1, NFARE) | |
assert isinstance(choice2, NFARE) | |
choice1.end_state.is_final = False | |
choice2.end_state.is_final = False | |
start_state = NFAState() | |
end_state = NFAState(is_final=True) | |
start_state.add_empty_edge(choice1.start_state) | |
start_state.add_empty_edge(choice2.start_state) | |
choice1.end_state.add_empty_edge(end_state) | |
choice2.end_state.add_empty_edge(end_state) | |
return NFARE(start_state, end_state) | |
@staticmethod | |
def rule_or(*choices): | |
assert len(choices) >= 2 | |
re = NFARE._or_two_re(choices[0], choices[1]) | |
for choice in choices[2:]: | |
re = NFARE._or_two_re(re, choice) | |
return re | |
@staticmethod | |
def rule_repeat(re): | |
assert isinstance(re, NFARE) | |
re.end_state.add_empty_edge(re.start_state) | |
re.start_state.add_empty_edge(re.end_state) | |
return re | |
if __name__ == '__main__': | |
match_char = NFARE.from_char('a') | |
match_char.match('a') # => True | |
match_char.match('b') # => False | |
match_char.match('aa') # => False | |
a, b = NFARE.from_char('a'), NFARE.from_char('b') | |
ab = NFARE.rule_seq(a, b) | |
assert ab.match('ab') # => True | |
assert not ab.match('ba') # => False | |
assert not ab.match('aa') # => False | |
assert not ab.match('abc') # => False | |
a, b = NFARE.from_char('a'), NFARE.from_char('b') | |
a_b = NFARE.rule_or(a, b) | |
assert a_b.match('a') # => True | |
assert a_b.match('b') # => True | |
assert not a_b.match('ab') # => False | |
assert not a_b.match('c') # => False | |
many_a = NFARE.rule_repeat(NFARE.rule_char('a')) | |
assert many_a.match('') # => True | |
assert many_a.match('a') # => True | |
assert many_a.match('a' * 100) # => True | |
a, b, c = NFARE.from_char('a'), NFARE.from_char('b'), NFARE.from_char('c') | |
ab = NFARE.rule_seq(a, b) | |
many_ab = NFARE.rule_repeat(ab) | |
many_ab_or_c = NFARE.rule_or(many_ab, c) | |
assert many_ab_or_c.match('ab') # => True | |
assert many_ab_or_c.match('c') # => True | |
assert many_ab_or_c.match('ababab' * 20) # => True | |
assert not many_ab_or_c.match('ccc') # => False | |
assert not many_ab_or_c.match('abc') # => False | |
print('All passed.') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment