A very simple regular expression engine for learning purposes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Define some special things | |
SPECIAL = '*|' | |
ALPHABET = '10' | |
EPSILON = None | |
# functions | |
def automaton_print(automaton): | |
"Neatly prints the automaton" | |
states, alphabet, start, final, transfer = automaton | |
print('STATES : ', states) | |
print('ALPHABET: ', alphabet) | |
print('START : ', start) | |
print('FINAL : ', final) | |
print('TABLE : ', '(state, symbol)--->{states}') | |
print('--------------------------------------') | |
for key, value in sorted(transfer.items()): | |
print(' ', key, ' ---- > ', value) | |
def merge_tables(table1, table2): | |
""" | |
Merge the transition tables of two NFAs while renaming the states | |
by appending _nfa1 and _nfa2 to the state names | |
""" | |
table = {(key[0]+'_nfa1', key[1]): {i+'_nfa1' for i in value} for key, value in table1.items()} | |
table.update({(key[0]+'_nfa2', key[1]): {i+'_nfa2' for i in value} for key, value in table2.items()}) | |
return table | |
def relabel_nfa(automaton): | |
""" | |
Relabels automaton states and transitions with | |
new numbers | |
""" | |
states, alphabet, start, final, table = automaton | |
state_relabels = {state: str(index + 1) for index, state in enumerate(states)} | |
new_states = {state_relabels[i] for i in states} | |
new_start = state_relabels[start] | |
new_final = {state_relabels[i] for i in final} | |
new_table = {(state_relabels[key[0]], key[1]): {state_relabels[i] for i in value} | |
for key, value in table.items()} | |
new_automaton = (new_states, alphabet, new_start, new_final, new_table) | |
return new_automaton | |
def concat(nfa1, nfa2): | |
"Concatenate the two nfas" | |
states1, _alphabet, start1, final1, table1 = nfa1 | |
states2, _alphabet, start2, final2, table2 = nfa2 | |
table = merge_tables(table1, table2) | |
states = {i+'_nfa1' for i in states1}.union({i+'_nfa2' for i in states2}) | |
new_state_name_map = {name: str(1 + index) for index, name in enumerate(states)} | |
new_state_name_map['end_nfa1'] = new_state_name_map['start_nfa2'] # These are the same states | |
new_state_name_map['start_nfa1'] = 'start' | |
new_state_name_map['end_nfa2'] = 'end' | |
new_states = {new_state_name_map[i] for i in states} | |
# Replace in table | |
new_table = {} | |
for key, value in table.items(): | |
new_key = (new_state_name_map[key[0]], key[1]) | |
if new_key not in new_table: | |
new_table[new_key] = {new_state_name_map[i] for i in value} | |
# Make the NFA | |
NFA = (new_states, ALPHABET, 'start', {'end'}, new_table) | |
return NFA | |
def union(nfa1, nfa2): | |
""" | |
Make union of the two nfa | |
""" | |
# Expand the nfas | |
states1, _alphabet, start1, final1, table1 = nfa1 | |
states2, _alphabet, start2, final2, table2 = nfa2 | |
table = merge_tables(table1, table2) | |
table.update({('start', EPSILON): {'start_nfa1', 'start_nfa2'}, | |
('end_nfa1', EPSILON): {'end'}, | |
('end_nfa2', EPSILON): {'end'}, | |
}) | |
states = {i+'_nfa1' for i in states1}.union({i+'_nfa2' for i in states2}) | |
new_state_name_map = {name: str(1 + index) for index, name in enumerate(states)} | |
new_state_name_map['start'] = 'start' | |
new_state_name_map['end'] = 'end' | |
states = states.union({'start', 'end'}) | |
new_states = {new_state_name_map[i] for i in states} | |
# Replace in table | |
new_table = {} | |
for key, value in table.items(): | |
new_key = (new_state_name_map[key[0]], key[1]) | |
if new_key not in new_table: | |
new_table[new_key] = {new_state_name_map[i] for i in value} | |
# Make the NFA | |
NFA = (new_states, ALPHABET, 'start', {'end'}, new_table) | |
return NFA | |
def kleen(nfa): | |
""" | |
Create kleen closure of this nfa | |
""" | |
states, _alphabet, start, final, table = nfa | |
new_state_name_map = {name: str(1 + index) for index, name in enumerate(states)} | |
new_states = {new_state_name_map[i] for i in states} | |
new_table = {(new_state_name_map[key[0]], key[1]): {new_state_name_map[i] for i in value} | |
for key, value in table.items()} | |
new_states = new_states.union({'start', 'end'}) | |
new_table.update({('start', EPSILON):{'end', new_state_name_map['start']}, | |
(new_state_name_map['end'], EPSILON):{'end', new_state_name_map['start']}, | |
}) | |
NFA = (new_states, ALPHABET, 'start', {'end'}, new_table) | |
return NFA | |
def make_nfa(pattern): | |
""" | |
Make an NFA from a pattern | |
""" | |
if isinstance(pattern, list): | |
pattern = ''.join(pattern) | |
assert all([i in ALPHABET + SPECIAL for i in pattern]) # everything is in ALPHABET + SPECIAL | |
if len(pattern) > 1: | |
head, *tail = pattern | |
if tail[0] not in SPECIAL: | |
NFA = concat(make_nfa(head), make_nfa(tail)) | |
else: | |
symbol, *tail = tail | |
if symbol == '|': | |
other, *tail = tail | |
union_nfa = union(make_nfa(head), make_nfa(other)) | |
NFA = concat(union_nfa, make_nfa(tail)) | |
if symbol == '*': | |
NFA = concat(kleen(make_nfa(head)), | |
make_nfa(tail)) | |
elif len(pattern) == 1: # single match | |
states = {'start', 'end'} | |
start = 'start' | |
final = 'end' | |
table = {('start', pattern): {'end'}} | |
NFA = (states, ALPHABET, start, final, table) | |
else: # empty transition | |
states = {'start', 'end'} | |
start = 'start' | |
final = 'end' | |
table = {('start', EPSILON): {'end'}} | |
NFA = (states, ALPHABET, start, final, table) | |
return NFA | |
def epsilon_closure(state, table): | |
""" | |
find epsilon closure of a given state | |
using the given transition table | |
""" | |
done, not_done = [], [state] | |
# Build a tree with this state as root | |
while not_done: | |
to_do = [] | |
for s in not_done: | |
epsilon_from_s = {tuple(v) for k, v in table.items() if ((k[0] == s) and (k[1] is None))} | |
[to_do.append(i) for j in epsilon_from_s for i in j if i not in done] | |
done.append(s) | |
not_done = to_do | |
return set(done) | |
def to_nfa_without_epsilon_transitions(nfa): | |
"""Removes epsilon transition from nfa""" | |
states, alphabet, start, final, table = nfa | |
closure_map = {i: frozenset(epsilon_closure(i, table)) for i in states} | |
new_states = {closure_map[i] for i in states} | |
new_start = closure_map[start] | |
new_final = {i for i in closure_map.values() if any(((f in i) for f in final))} | |
new_table = {} | |
for new_state in new_states: | |
for symbol in alphabet: | |
key = (new_state, symbol) | |
target = set() | |
for member in new_state: | |
if (member, symbol) in table.keys(): | |
member_targets = table[(member, symbol)] | |
new_targets = {i for i in new_states for mt in member_targets if mt in i} | |
target = target.union(new_targets) | |
if len(target) > 0: | |
new_table[key] = target | |
# Relabel to avoid frozenset floods | |
NFA = (new_states, alphabet, new_start, new_final, new_table) | |
NFA = relabel_nfa(NFA) | |
return NFA | |
def remove_unreachable(nfa): | |
""" | |
Removes unreachable states and transitions | |
from given nfa | |
""" | |
states, alphabet, start, final, table = nfa | |
new_states, new_final, new_table = set(), set(), dict() | |
done, to_do = set(), set([start]) | |
while to_do: | |
targets = set() | |
for state in to_do - done: | |
for symbol in alphabet: | |
key = (state, symbol) | |
if key in table.keys(): | |
new_table[key] = table[key] | |
targets = targets.union(table[key]) | |
done.add(state) | |
to_do = targets | |
new_states = states.intersection(done) | |
new_final = {i for i in final if i in done} | |
NFA = (new_states, alphabet, start, new_final, new_table) | |
return NFA | |
def to_dfa(nfa): | |
""" | |
Convert given NFA to DFA | |
NFA must be non epsilon move | |
""" | |
states, alphabet, start, final, table = nfa | |
new_table = dict() | |
# We create states starting from the start state | |
states_done, states_to_do = set(), [{start}] | |
while states_to_do: | |
target_list = [] | |
for current_states in states_to_do: | |
for symbol in alphabet: | |
target_states = set() | |
for state in current_states: | |
key = (state, symbol) | |
if key in table: | |
targets = table[key] | |
target_states = target_states.union(targets) | |
if (target_states not in states_done) or\ | |
((frozenset(current_states), symbol) not in new_table.keys()): | |
if len(target_states) > 0: | |
new_table[(frozenset(current_states), symbol)] = target_states | |
states_done.add(frozenset(current_states)) | |
if len(target_states) > 0: | |
target_list.append(frozenset(target_states)) | |
states_to_do = set(target_list) - set(states_done) | |
# start and end states | |
new_start = frozenset({start}) | |
new_states = states_done | |
new_final = {i for i in new_states if any(((f in i) for f in final))} | |
state_relabels = {frozenset(state): str(index + 1) for index, state in enumerate(new_states)} | |
new_states = {state_relabels[i] for i in new_states} | |
new_start = state_relabels[new_start] | |
new_final = {state_relabels[i] for i in new_final} | |
new_table = {(state_relabels[key[0]], key[1]): state_relabels[frozenset(value)] | |
for key, value in new_table.items()} | |
DFA = (new_states, alphabet, new_start, new_final, new_table) | |
return DFA | |
def run_dfa(dfa, string): | |
""" | |
Runs the given DFA on the given input string | |
""" | |
states, alphabet, start, final, table = dfa | |
current_state = start | |
for symbol in string: | |
if symbol not in alphabet: | |
return False | |
key = (current_state, symbol) | |
try: | |
current_state = table[key] | |
except KeyError: | |
return False | |
return current_state in final | |
if __name__ == '__main__': | |
pattern = '0|10*' | |
print(pattern) | |
automaton = make_nfa(pattern) | |
automaton = to_nfa_without_epsilon_transitions(automaton) | |
automaton = remove_unreachable(automaton) | |
automaton = to_dfa(automaton) | |
automaton_print(automaton) | |
strings_to_run_on = ['1', | |
'0', | |
'01', | |
'100', | |
'11111' | |
] | |
for string in strings_to_run_on: | |
print(string, run_dfa(automaton, string)) | |
def test(): | |
"Test stuff" | |
def test_epsilon_closure(): | |
state = '1' | |
table = {('1', None): {'2', '3', '4'}, | |
('2', '1'): {'5'}, | |
('3', '1'): {'4'}, | |
('4', None): {'6', '7'}, | |
('4', '1'): {'9'}, | |
('6', None): {'2', '8'}, | |
('7', None): {'1'}, | |
('8', None): {'3'} | |
} | |
x = (epsilon_closure(state, table)) | |
assert set(x) == {'1', '2', '3', '4', '6', '7', '8'} | |
print('Epsilon closure working ok') | |
# Run the tests | |
test() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment