p7g/recompose.py

## recompose.py
"""
This is a really basic and dumb regexp engine to play with composable regular
expressions.

A Re object is basically just a pointer to some initial state and to some
accepting state. These states can each point to other states, forming an NFA-ε,
which can then be evaluated directly.

The cool thing is that you can just combine Re objects like this:

hello_re = literal("hello")
world_re = literal("world")
space_re = literal(" ") | literal("\t")
spaces_re = space_re + space_re.closure()
hello_world_re = hello_re + spaces_re + world_re

assert hello_world_re.evaluate("hello \t  world")
"""

from __future__ import annotations
from collections import defaultdict, deque


class _Epsilon:
	def __repr__(self) -> str:
		return "ε"


epsilon = _Epsilon()


class State:
	def __init__(self, transitions: dict[str | _Epsilon, list[State]]) -> None:
		self.transitions = defaultdict(list, transitions)

	def _copy(self, accepting_state_mapping, seen=None) -> State:
		if self is accepting_state_mapping[0]:
			return accepting_state_mapping[1]

		seen = seen or {}
		copy = State({})
		seen[self] = copy

		for input_, next_states in self.transitions.items():
			for next_state in next_states:
				if next_state in seen:
					next_state = seen[next_state]
				else:
					next_state = seen[next_state] = next_state._copy(accepting_state_mapping, seen)
				copy.transitions[input_].append(next_state)

		return copy

	def __repr__(self) -> str:
		return f"State({list(self.transitions.keys())})"


class Re:
	def __init__(self, initial_state: State, accepting_state: State):
		self.initial_state = initial_state
		self.accepting_state = accepting_state

	def evaluate(self, text: str) -> bool:
		states = {self.initial_state}

		for c in text:
			new_states: set[State] = set()

			work = deque(states)
			while work:
				state = work.popleft()
				if c in state.transitions:
					new_states.update(state.transitions[c])
				if epsilon in state.transitions:
					work.extend(state.transitions[epsilon])

			states = new_states

		work = deque(states)
		states = set()
		while work:
			state = work.popleft()
			if epsilon in state.transitions:
				work.extend(state.transitions[epsilon])
			else:
				states.add(state)

		return self.accepting_state in states

	def _copy(self) -> Re:
		accepting_state = State({})
		initial_state = self.initial_state._copy((self.accepting_state, accepting_state))
		return Re(initial_state, accepting_state)

	def __or__(self, other):
		if isinstance(other, str):
			other = literal(other)
		elif isinstance(other, Re):
			pass
		else:
			return NotImplemented

		return alternation(self, other)

	def __add__(self, other):
		if isinstance(other, str):
			other = literal(other)
		elif isinstance(other, Re):
			pass
		else:
			return NotImplemented

		return concatenation(self, other)

	def closure(self) -> Re:
		return closure(self)


def literal(text: str) -> Re:
	current_state = initial_state = State({})

	for c in text:
		next_state = State({})
		current_state.transitions[c] = [next_state]
		current_state = next_state

	return Re(initial_state, current_state)


def alternation(a: Re, b: Re) -> Re:
	a = a._copy()
	b = b._copy()

	initial_state = State(
		{
			epsilon: [
				a.initial_state,
				b.initial_state,
			]
		},
	)

	accepting_state = State({})
	a.accepting_state.transitions[epsilon] = [accepting_state]
	b.accepting_state.transitions[epsilon] = [accepting_state]

	return Re(initial_state, accepting_state)


def concatenation(a: Re, b: Re) -> Re:
	a = a._copy()
	b = b._copy()

	initial_state = State({epsilon: [a.initial_state]})
	a.accepting_state.transitions[epsilon] = [b.initial_state]

	accepting_state = State({})
	b.accepting_state.transitions[epsilon] = [accepting_state]

	return Re(initial_state, accepting_state)


def closure(a: Re) -> Re:
	a = a._copy()

	accepting_state = State({})
	initial_state = State({epsilon: [a.initial_state, accepting_state]})

	a.accepting_state.transitions[epsilon] = [accepting_state, a.initial_state]

	return Re(initial_state, accepting_state)
	"""
	This is a really basic and dumb regexp engine to play with composable regular
	expressions.

	A Re object is basically just a pointer to some initial state and to some
	accepting state. These states can each point to other states, forming an NFA-ε,
	which can then be evaluated directly.

	The cool thing is that you can just combine Re objects like this:

	hello_re = literal("hello")
	world_re = literal("world")
	space_re = literal(" ") \| literal("\t")
	spaces_re = space_re + space_re.closure()
	hello_world_re = hello_re + spaces_re + world_re

	assert hello_world_re.evaluate("hello \t world")
	"""

	from __future__ import annotations
	from collections import defaultdict, deque


	class _Epsilon:
	def __repr__(self) -> str:
	return "ε"


	epsilon = _Epsilon()


	class State:
	def __init__(self, transitions: dict[str \| _Epsilon, list[State]]) -> None:
	self.transitions = defaultdict(list, transitions)

	def _copy(self, accepting_state_mapping, seen=None) -> State:
	if self is accepting_state_mapping[0]:
	return accepting_state_mapping[1]

	seen = seen or {}
	copy = State({})
	seen[self] = copy

	for input_, next_states in self.transitions.items():
	for next_state in next_states:
	if next_state in seen:
	next_state = seen[next_state]
	else:
	next_state = seen[next_state] = next_state._copy(accepting_state_mapping, seen)
	copy.transitions[input_].append(next_state)

	return copy

	def __repr__(self) -> str:
	return f"State({list(self.transitions.keys())})"


	class Re:
	def __init__(self, initial_state: State, accepting_state: State):
	self.initial_state = initial_state
	self.accepting_state = accepting_state

	def evaluate(self, text: str) -> bool:
	states = {self.initial_state}

	for c in text:
	new_states: set[State] = set()

	work = deque(states)
	while work:
	state = work.popleft()
	if c in state.transitions:
	new_states.update(state.transitions[c])
	if epsilon in state.transitions:
	work.extend(state.transitions[epsilon])

	states = new_states

	work = deque(states)
	states = set()
	while work:
	state = work.popleft()
	if epsilon in state.transitions:
	work.extend(state.transitions[epsilon])
	else:
	states.add(state)

	return self.accepting_state in states

	def _copy(self) -> Re:
	accepting_state = State({})
	initial_state = self.initial_state._copy((self.accepting_state, accepting_state))
	return Re(initial_state, accepting_state)

	def __or__(self, other):
	if isinstance(other, str):
	other = literal(other)
	elif isinstance(other, Re):
	pass
	else:
	return NotImplemented

	return alternation(self, other)

	def __add__(self, other):
	if isinstance(other, str):
	other = literal(other)
	elif isinstance(other, Re):
	pass
	else:
	return NotImplemented

	return concatenation(self, other)

	def closure(self) -> Re:
	return closure(self)


	def literal(text: str) -> Re:
	current_state = initial_state = State({})

	for c in text:
	next_state = State({})
	current_state.transitions[c] = [next_state]
	current_state = next_state

	return Re(initial_state, current_state)


	def alternation(a: Re, b: Re) -> Re:
	a = a._copy()
	b = b._copy()

	initial_state = State(
	{
	epsilon: [
	a.initial_state,
	b.initial_state,
	]
	},
	)

	accepting_state = State({})
	a.accepting_state.transitions[epsilon] = [accepting_state]
	b.accepting_state.transitions[epsilon] = [accepting_state]

	return Re(initial_state, accepting_state)


	def concatenation(a: Re, b: Re) -> Re:
	a = a._copy()
	b = b._copy()

	initial_state = State({epsilon: [a.initial_state]})
	a.accepting_state.transitions[epsilon] = [b.initial_state]

	accepting_state = State({})
	b.accepting_state.transitions[epsilon] = [accepting_state]

	return Re(initial_state, accepting_state)


	def closure(a: Re) -> Re:
	a = a._copy()

	accepting_state = State({})
	initial_state = State({epsilon: [a.initial_state, accepting_state]})

	a.accepting_state.transitions[epsilon] = [accepting_state, a.initial_state]

	return Re(initial_state, accepting_state)