Skip to content

Instantly share code, notes, and snippets.

@mildsunrise
Last active September 30, 2023 15:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mildsunrise/0bec028a30d46d601b9236784093f155 to your computer and use it in GitHub Desktop.
Save mildsunrise/0bec028a30d46d601b9236784093f155 to your computer and use it in GitHub Desktop.
Kleene's algorithm for regexes
from typing import TypeVar, Optional
from collections import defaultdict
S = TypeVar('S') # state node
def kleene_algorithm(out_edges: dict[Optional[S], dict[Optional[S], str]]) -> str:
''' calculates a regular expression that is equivalent to the NFA given by `out_edges`.
`out_edges` describes the possible transitions, indexed first by source node and then by destination node.
a transition is an input symbol in the form of a regex, but it's not limited to a single symbol -- any
regex is accepted, even the empty regex `''`, meaning we accept something even more general than an NFA-ε.
the regexes are assumed to be valid syntactically.
there are two virtual states, start and final. these are both referred by the special value `None`, depending
on if it appears in the source node or the destination node respectively. because of this scheme, the start
node can only have outgoing edges, and the final node can only have incoming edges.
if the NFA accepts nothing, i.e. start and final are unconnected, KeyError is raised.
the result will be correct even if there are dead states, but the algorithm will be needlessly slower.
note that the algorithm works in-place on `out_edges` and will reduce it to a graph with (at most)
a single edge from start to final. the function returns the regex of this edge, for convenience.
'''
# build the reversely-indexed dictionary, for efficiency, as the algorithm also has to look at
# the incoming edges of a node. we'll be careful to keep both dictionaries in sync.
in_edges = defaultdict(dict)
for a, edges in out_edges.items():
for b, expr in edges.items():
in_edges[b][a] = expr
# process all nodes except the 2 virtual ones, in an arbitrary order.
# the resulting regex size varies wildly depending on this order.
nodes = set(out_edges) - {None}
for node in nodes:
# 1. remove the node from the graph, but save its current incoming / outgoing edges
node_outputs = out_edges.pop(node)
node_inputs = in_edges.pop(node)
# 2. if there's a looping edge, remove it and (in step 4) intermix it with all transitions
loop = node_outputs.get(node)
loop = f'(?:{loop})*' if loop else ''
node_outputs.pop(node, None)
node_inputs.pop(node, None)
# 3. finish removing the node from the graph (this is actually part of step 1, but it's simpler now that there's no loop)
for n in node_outputs:
del in_edges[n][node]
for n in node_inputs:
del out_edges[n][node]
# 4. combine incoming edges with outgoing edges
for a, a_expr in node_inputs.items():
for b, b_expr in node_outputs.items():
# if there's already a transition between these 2 nodes, add a pipe (|)
expr = out_edges[a].get(b)
expr = expr + '|' if expr != None else ''
expr += f'(?:{a_expr})' + loop + f'(?:{b_expr})'
# add/update graph edge
out_edges[a][b] = in_edges[b][a] = expr
return out_edges[None][None]
use std::{collections::HashMap, fmt::Write, hash::Hash};
/// calculates a regular expression that is equivalent to the NFA given by `out_edges`.
///
/// `out_edges` describes the possible transitions, indexed first by source node and then by destination node.
/// a transition is an input symbol in the form of a regex, but it's not limited to a single symbol -- any
/// regex is accepted, even the empty regex `''`, meaning we accept something even more general than an NFA-ε.
/// the regexes are assumed to be valid syntactically.
///
/// there are two virtual states, start and final. these are both referred by the special value `None`, depending
/// on if it appears in the source node or the destination node respectively. because of this scheme, the start
/// node can only have outgoing edges, and the final node can only have incoming edges.
///
/// if the NFA accepts nothing, i.e. start and final are unconnected, KeyError is raised.
/// the result will be correct even if there are dead states, but the algorithm will be needlessly slower.
///
/// note that the algorithm works in-place on `out_edges` and will reduce it to a graph with (at most)
/// a single edge from start to final. the function returns the regex of this edge, for convenience.
fn kleene<S: Eq + Hash + Copy>(out_edges: &mut HashMap<Option<S>, HashMap<Option<S>, String>>) -> &str {
// build the reversely-indexed graph, since the algorithm needs to look up incoming edges efficiently.
// we need to be careful to keep both in sync.
let mut in_edges = HashMap::<Option<S>, HashMap<Option<S>, String>>::new();
for (&a, edges) in out_edges {
for (&b, expr) in edges {
in_edges.entry(b).or_default().insert(a, expr.clone());
}
}
// process all nodes except the 2 virtual ones, in an arbitrary order.
// the resulting regex size varies wildly depending on this order.
while let Some((&node, _)) = out_edges
.iter()
.filter(|&(s, _)| s.is_some())
// a good heuristic to reduce regex size seems to be picking node with least out-edges
.min_by_key(|&(_, edges)| edges.len())
{
// 1. remove the node from the graph, but save its current incoming / outgoing edges
let mut node_out_edges = out_edges.remove(&node).unwrap_or_default();
let mut node_in_edges = in_edges.remove(&node).unwrap_or_default();
// 2. if there's a looping edge, remove it and (in step 4) intermix it with all transitions
node_in_edges.remove(&node);
let loop_expr = node_out_edges
.remove(&node)
.map(|e| format!("(?:{e})*"))
.unwrap_or_default();
// 3. finish removing the node from the graph (this is actually part of step 1, but it's simpler now that there's no loop)
for (s, _) in &node_out_edges {
in_edges.get_mut(s).unwrap().remove(&node);
}
for (s, _) in &node_in_edges {
out_edges.get_mut(s).unwrap().remove(&node);
}
// 4. combine incoming edges with outgoing edges
for (a, a_expr) in &node_in_edges {
for (b, b_expr) in &node_out_edges {
// if there's already a transition between these 2 nodes, add a pipe (|)
let mut expr = out_edges[a].get(b).map(|s| format!("{s}|")).unwrap_or_default();
write!(expr, "(?:{a_expr}){loop_expr}(?:{b_expr})").unwrap();
// add/update graph edge
out_edges.get_mut(a).unwrap().insert(*b, expr.clone());
in_edges.get_mut(b).unwrap().insert(*a, expr);
}
}
}
return &out_edges[&None][&None];
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment