mildsunrise/kleene.py

## kleene.py
from typing import TypeVar, Optional
from collections import defaultdict

S = TypeVar('S')  # state node

def kleene_algorithm(out_edges: dict[Optional[S], dict[Optional[S], str]]) -> str:
	''' calculates a regular expression that is equivalent to the NFA given by `out_edges`.

	`out_edges` describes the possible transitions, indexed first by source node and then by destination node.
	a transition is an input symbol in the form of a regex, but it's not limited to a single symbol -- any
	regex is accepted, even the empty regex `''`, meaning we accept something even more general than an NFA-ε.
	the regexes are assumed to be valid syntactically.

	there are two virtual states, start and final. these are both referred by the special value `None`, depending
	on if it appears in the source node or the destination node respectively. because of this scheme, the start
	node can only have outgoing edges, and the final node can only have incoming edges.

	if the NFA accepts nothing, i.e. start and final are unconnected, KeyError is raised.
	the result will be correct even if there are dead states, but the algorithm will be needlessly slower.

	note that the algorithm works in-place on `out_edges` and will reduce it to a graph with (at most)
	a single edge from start to final. the function returns the regex of this edge, for convenience.
	'''

	# build the reversely-indexed dictionary, for efficiency, as the algorithm also has to look at
	# the incoming edges of a node. we'll be careful to keep both dictionaries in sync.
	in_edges = defaultdict(dict)
	for a, edges in out_edges.items():
		for b, expr in edges.items():
			in_edges[b][a] = expr

	# process all nodes except the 2 virtual ones, in an arbitrary order.
	# the resulting regex size varies wildly depending on this order.
	nodes = set(out_edges) - {None}

	for node in nodes:
		# 1. remove the node from the graph, but save its current incoming / outgoing edges
		node_outputs = out_edges.pop(node)
		node_inputs = in_edges.pop(node)

		# 2. if there's a looping edge, remove it and (in step 4) intermix it with all transitions
		loop = node_outputs.get(node)
		loop = f'(?:{loop})*' if loop else ''
		node_outputs.pop(node, None)
		node_inputs.pop(node, None)

		# 3. finish removing the node from the graph (this is actually part of step 1, but it's simpler now that there's no loop)
		for n in node_outputs:
			del in_edges[n][node]
		for n in node_inputs:
			del out_edges[n][node]

		# 4. combine incoming edges with outgoing edges
		for a, a_expr in node_inputs.items():
			for b, b_expr in node_outputs.items():
				# if there's already a transition between these 2 nodes, add a pipe (|)
				expr = out_edges[a].get(b)
				expr = expr + '|' if expr != None else ''
				expr += f'(?:{a_expr})' + loop + f'(?:{b_expr})'
				# add/update graph edge
				out_edges[a][b] = in_edges[b][a] = expr

	return out_edges[None][None]

## kleene.rs
use std::{collections::HashMap, fmt::Write, hash::Hash};

/// calculates a regular expression that is equivalent to the NFA given by `out_edges`.
///
/// `out_edges` describes the possible transitions, indexed first by source node and then by destination node.
/// a transition is an input symbol in the form of a regex, but it's not limited to a single symbol -- any
/// regex is accepted, even the empty regex `''`, meaning we accept something even more general than an NFA-ε.
/// the regexes are assumed to be valid syntactically.
///
/// there are two virtual states, start and final. these are both referred by the special value `None`, depending
/// on if it appears in the source node or the destination node respectively. because of this scheme, the start
/// node can only have outgoing edges, and the final node can only have incoming edges.
///
/// if the NFA accepts nothing, i.e. start and final are unconnected, KeyError is raised.
/// the result will be correct even if there are dead states, but the algorithm will be needlessly slower.
///
/// note that the algorithm works in-place on `out_edges` and will reduce it to a graph with (at most)
/// a single edge from start to final. the function returns the regex of this edge, for convenience.

fn kleene<S: Eq + Hash + Copy>(out_edges: &mut HashMap<Option<S>, HashMap<Option<S>, String>>) -> &str {
	// build the reversely-indexed graph, since the algorithm needs to look up incoming edges efficiently.
	// we need to be careful to keep both in sync.
	let mut in_edges = HashMap::<Option<S>, HashMap<Option<S>, String>>::new();
	for (&a, edges) in out_edges {
		for (&b, expr) in edges {
			in_edges.entry(b).or_default().insert(a, expr.clone());
		}
	}

	// process all nodes except the 2 virtual ones, in an arbitrary order.
	// the resulting regex size varies wildly depending on this order.
	while let Some((&node, _)) = out_edges
		.iter()
		.filter(|&(s, _)| s.is_some())
		// a good heuristic to reduce regex size seems to be picking node with least out-edges
		.min_by_key(|&(_, edges)| edges.len())
	{
		// 1. remove the node from the graph, but save its current incoming / outgoing edges
		let mut node_out_edges = out_edges.remove(&node).unwrap_or_default();
		let mut node_in_edges = in_edges.remove(&node).unwrap_or_default();

		// 2. if there's a looping edge, remove it and (in step 4) intermix it with all transitions
		node_in_edges.remove(&node);
		let loop_expr = node_out_edges
			.remove(&node)
			.map(|e| format!("(?:{e})*"))
			.unwrap_or_default();

		// 3. finish removing the node from the graph (this is actually part of step 1, but it's simpler now that there's no loop)
		for (s, _) in &node_out_edges {
			in_edges.get_mut(s).unwrap().remove(&node);
		}
		for (s, _) in &node_in_edges {
			out_edges.get_mut(s).unwrap().remove(&node);
		}

		// 4. combine incoming edges with outgoing edges
		for (a, a_expr) in &node_in_edges {
			for (b, b_expr) in &node_out_edges {
				// if there's already a transition between these 2 nodes, add a pipe (|)
				let mut expr = out_edges[a].get(b).map(|s| format!("{s}|")).unwrap_or_default();
				write!(expr, "(?:{a_expr}){loop_expr}(?:{b_expr})").unwrap();
				// add/update graph edge
				out_edges.get_mut(a).unwrap().insert(*b, expr.clone());
				in_edges.get_mut(b).unwrap().insert(*a, expr);
			}
		}
	}

	return &out_edges[&None][&None];
}
	from typing import TypeVar, Optional
	from collections import defaultdict

	S = TypeVar('S') # state node

	def kleene_algorithm(out_edges: dict[Optional[S], dict[Optional[S], str]]) -> str:
	''' calculates a regular expression that is equivalent to the NFA given by `out_edges`.

	`out_edges` describes the possible transitions, indexed first by source node and then by destination node.
	a transition is an input symbol in the form of a regex, but it's not limited to a single symbol -- any
	regex is accepted, even the empty regex `''`, meaning we accept something even more general than an NFA-ε.
	the regexes are assumed to be valid syntactically.

	there are two virtual states, start and final. these are both referred by the special value `None`, depending
	on if it appears in the source node or the destination node respectively. because of this scheme, the start
	node can only have outgoing edges, and the final node can only have incoming edges.

	if the NFA accepts nothing, i.e. start and final are unconnected, KeyError is raised.
	the result will be correct even if there are dead states, but the algorithm will be needlessly slower.

	note that the algorithm works in-place on `out_edges` and will reduce it to a graph with (at most)
	a single edge from start to final. the function returns the regex of this edge, for convenience.
	'''

	# build the reversely-indexed dictionary, for efficiency, as the algorithm also has to look at
	# the incoming edges of a node. we'll be careful to keep both dictionaries in sync.
	in_edges = defaultdict(dict)
	for a, edges in out_edges.items():
	for b, expr in edges.items():
	in_edges[b][a] = expr

	# process all nodes except the 2 virtual ones, in an arbitrary order.
	# the resulting regex size varies wildly depending on this order.
	nodes = set(out_edges) - {None}

	for node in nodes:
	# 1. remove the node from the graph, but save its current incoming / outgoing edges
	node_outputs = out_edges.pop(node)
	node_inputs = in_edges.pop(node)

	# 2. if there's a looping edge, remove it and (in step 4) intermix it with all transitions
	loop = node_outputs.get(node)
	loop = f'(?:{loop})*' if loop else ''
	node_outputs.pop(node, None)
	node_inputs.pop(node, None)

	# 3. finish removing the node from the graph (this is actually part of step 1, but it's simpler now that there's no loop)
	for n in node_outputs:
	del in_edges[n][node]
	for n in node_inputs:
	del out_edges[n][node]

	# 4. combine incoming edges with outgoing edges
	for a, a_expr in node_inputs.items():
	for b, b_expr in node_outputs.items():
	# if there's already a transition between these 2 nodes, add a pipe (\|)
	expr = out_edges[a].get(b)
	expr = expr + '\|' if expr != None else ''
	expr += f'(?:{a_expr})' + loop + f'(?:{b_expr})'
	# add/update graph edge
	out_edges[a][b] = in_edges[b][a] = expr

	return out_edges[None][None]
	use std::{collections::HashMap, fmt::Write, hash::Hash};

	/// calculates a regular expression that is equivalent to the NFA given by `out_edges`.
	///
	/// `out_edges` describes the possible transitions, indexed first by source node and then by destination node.
	/// a transition is an input symbol in the form of a regex, but it's not limited to a single symbol -- any
	/// regex is accepted, even the empty regex `''`, meaning we accept something even more general than an NFA-ε.
	/// the regexes are assumed to be valid syntactically.
	///
	/// there are two virtual states, start and final. these are both referred by the special value `None`, depending
	/// on if it appears in the source node or the destination node respectively. because of this scheme, the start
	/// node can only have outgoing edges, and the final node can only have incoming edges.
	///
	/// if the NFA accepts nothing, i.e. start and final are unconnected, KeyError is raised.
	/// the result will be correct even if there are dead states, but the algorithm will be needlessly slower.
	///
	/// note that the algorithm works in-place on `out_edges` and will reduce it to a graph with (at most)
	/// a single edge from start to final. the function returns the regex of this edge, for convenience.

	fn kleene<S: Eq + Hash + Copy>(out_edges: &mut HashMap<Option<S>, HashMap<Option<S>, String>>) -> &str {
	// build the reversely-indexed graph, since the algorithm needs to look up incoming edges efficiently.
	// we need to be careful to keep both in sync.
	let mut in_edges = HashMap::<Option<S>, HashMap<Option<S>, String>>::new();
	for (&a, edges) in out_edges {
	for (&b, expr) in edges {
	in_edges.entry(b).or_default().insert(a, expr.clone());
	}
	}

	// process all nodes except the 2 virtual ones, in an arbitrary order.
	// the resulting regex size varies wildly depending on this order.
	while let Some((&node, _)) = out_edges
	.iter()
	.filter(\|&(s, _)\| s.is_some())
	// a good heuristic to reduce regex size seems to be picking node with least out-edges
	.min_by_key(\|&(_, edges)\| edges.len())
	{
	// 1. remove the node from the graph, but save its current incoming / outgoing edges
	let mut node_out_edges = out_edges.remove(&node).unwrap_or_default();
	let mut node_in_edges = in_edges.remove(&node).unwrap_or_default();

	// 2. if there's a looping edge, remove it and (in step 4) intermix it with all transitions
	node_in_edges.remove(&node);
	let loop_expr = node_out_edges
	.remove(&node)
	.map(\|e\| format!("(?:{e})*"))
	.unwrap_or_default();

	// 3. finish removing the node from the graph (this is actually part of step 1, but it's simpler now that there's no loop)
	for (s, _) in &node_out_edges {
	in_edges.get_mut(s).unwrap().remove(&node);
	}
	for (s, _) in &node_in_edges {
	out_edges.get_mut(s).unwrap().remove(&node);
	}

	// 4. combine incoming edges with outgoing edges
	for (a, a_expr) in &node_in_edges {
	for (b, b_expr) in &node_out_edges {
	// if there's already a transition between these 2 nodes, add a pipe (\|)
	let mut expr = out_edges[a].get(b).map(\|s\| format!("{s}\|")).unwrap_or_default();
	write!(expr, "(?:{a_expr}){loop_expr}(?:{b_expr})").unwrap();
	// add/update graph edge
	out_edges.get_mut(a).unwrap().insert(*b, expr.clone());
	in_edges.get_mut(b).unwrap().insert(*a, expr);
	}
	}
	}

	return &out_edges[&None][&None];
	}