Skip to content

Instantly share code, notes, and snippets.

@p7g
Created August 2, 2022 13:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save p7g/d2c99b185059b54f9fb41ebfe4734041 to your computer and use it in GitHub Desktop.
Save p7g/d2c99b185059b54f9fb41ebfe4734041 to your computer and use it in GitHub Desktop.
Python regular expressions that can be composed into bigger regular expressions
"""
This is a really basic and dumb regexp engine to play with composable regular
expressions.
A Re object is basically just a pointer to some initial state and to some
accepting state. These states can each point to other states, forming an NFA-ε,
which can then be evaluated directly.
The cool thing is that you can just combine Re objects like this:
hello_re = literal("hello")
world_re = literal("world")
space_re = literal(" ") | literal("\t")
spaces_re = space_re + space_re.closure()
hello_world_re = hello_re + spaces_re + world_re
assert hello_world_re.evaluate("hello \t world")
"""
from __future__ import annotations
from collections import defaultdict, deque
class _Epsilon:
def __repr__(self) -> str:
return "ε"
epsilon = _Epsilon()
class State:
def __init__(self, transitions: dict[str | _Epsilon, list[State]]) -> None:
self.transitions = defaultdict(list, transitions)
def _copy(self, accepting_state_mapping, seen=None) -> State:
if self is accepting_state_mapping[0]:
return accepting_state_mapping[1]
seen = seen or {}
copy = State({})
seen[self] = copy
for input_, next_states in self.transitions.items():
for next_state in next_states:
if next_state in seen:
next_state = seen[next_state]
else:
next_state = seen[next_state] = next_state._copy(accepting_state_mapping, seen)
copy.transitions[input_].append(next_state)
return copy
def __repr__(self) -> str:
return f"State({list(self.transitions.keys())})"
class Re:
def __init__(self, initial_state: State, accepting_state: State):
self.initial_state = initial_state
self.accepting_state = accepting_state
def evaluate(self, text: str) -> bool:
states = {self.initial_state}
for c in text:
new_states: set[State] = set()
work = deque(states)
while work:
state = work.popleft()
if c in state.transitions:
new_states.update(state.transitions[c])
if epsilon in state.transitions:
work.extend(state.transitions[epsilon])
states = new_states
work = deque(states)
states = set()
while work:
state = work.popleft()
if epsilon in state.transitions:
work.extend(state.transitions[epsilon])
else:
states.add(state)
return self.accepting_state in states
def _copy(self) -> Re:
accepting_state = State({})
initial_state = self.initial_state._copy((self.accepting_state, accepting_state))
return Re(initial_state, accepting_state)
def __or__(self, other):
if isinstance(other, str):
other = literal(other)
elif isinstance(other, Re):
pass
else:
return NotImplemented
return alternation(self, other)
def __add__(self, other):
if isinstance(other, str):
other = literal(other)
elif isinstance(other, Re):
pass
else:
return NotImplemented
return concatenation(self, other)
def closure(self) -> Re:
return closure(self)
def literal(text: str) -> Re:
current_state = initial_state = State({})
for c in text:
next_state = State({})
current_state.transitions[c] = [next_state]
current_state = next_state
return Re(initial_state, current_state)
def alternation(a: Re, b: Re) -> Re:
a = a._copy()
b = b._copy()
initial_state = State(
{
epsilon: [
a.initial_state,
b.initial_state,
]
},
)
accepting_state = State({})
a.accepting_state.transitions[epsilon] = [accepting_state]
b.accepting_state.transitions[epsilon] = [accepting_state]
return Re(initial_state, accepting_state)
def concatenation(a: Re, b: Re) -> Re:
a = a._copy()
b = b._copy()
initial_state = State({epsilon: [a.initial_state]})
a.accepting_state.transitions[epsilon] = [b.initial_state]
accepting_state = State({})
b.accepting_state.transitions[epsilon] = [accepting_state]
return Re(initial_state, accepting_state)
def closure(a: Re) -> Re:
a = a._copy()
accepting_state = State({})
initial_state = State({epsilon: [a.initial_state, accepting_state]})
a.accepting_state.transitions[epsilon] = [accepting_state, a.initial_state]
return Re(initial_state, accepting_state)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment