Last active
September 30, 2023 15:55
-
-
Save mildsunrise/0bec028a30d46d601b9236784093f155 to your computer and use it in GitHub Desktop.
Kleene's algorithm for regexes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import TypeVar, Optional | |
from collections import defaultdict | |
S = TypeVar('S') # state node | |
def kleene_algorithm(out_edges: dict[Optional[S], dict[Optional[S], str]]) -> str: | |
''' calculates a regular expression that is equivalent to the NFA given by `out_edges`. | |
`out_edges` describes the possible transitions, indexed first by source node and then by destination node. | |
a transition is an input symbol in the form of a regex, but it's not limited to a single symbol -- any | |
regex is accepted, even the empty regex `''`, meaning we accept something even more general than an NFA-ε. | |
the regexes are assumed to be valid syntactically. | |
there are two virtual states, start and final. these are both referred by the special value `None`, depending | |
on if it appears in the source node or the destination node respectively. because of this scheme, the start | |
node can only have outgoing edges, and the final node can only have incoming edges. | |
if the NFA accepts nothing, i.e. start and final are unconnected, KeyError is raised. | |
the result will be correct even if there are dead states, but the algorithm will be needlessly slower. | |
note that the algorithm works in-place on `out_edges` and will reduce it to a graph with (at most) | |
a single edge from start to final. the function returns the regex of this edge, for convenience. | |
''' | |
# build the reversely-indexed dictionary, for efficiency, as the algorithm also has to look at | |
# the incoming edges of a node. we'll be careful to keep both dictionaries in sync. | |
in_edges = defaultdict(dict) | |
for a, edges in out_edges.items(): | |
for b, expr in edges.items(): | |
in_edges[b][a] = expr | |
# process all nodes except the 2 virtual ones, in an arbitrary order. | |
# the resulting regex size varies wildly depending on this order. | |
nodes = set(out_edges) - {None} | |
for node in nodes: | |
# 1. remove the node from the graph, but save its current incoming / outgoing edges | |
node_outputs = out_edges.pop(node) | |
node_inputs = in_edges.pop(node) | |
# 2. if there's a looping edge, remove it and (in step 4) intermix it with all transitions | |
loop = node_outputs.get(node) | |
loop = f'(?:{loop})*' if loop else '' | |
node_outputs.pop(node, None) | |
node_inputs.pop(node, None) | |
# 3. finish removing the node from the graph (this is actually part of step 1, but it's simpler now that there's no loop) | |
for n in node_outputs: | |
del in_edges[n][node] | |
for n in node_inputs: | |
del out_edges[n][node] | |
# 4. combine incoming edges with outgoing edges | |
for a, a_expr in node_inputs.items(): | |
for b, b_expr in node_outputs.items(): | |
# if there's already a transition between these 2 nodes, add a pipe (|) | |
expr = out_edges[a].get(b) | |
expr = expr + '|' if expr != None else '' | |
expr += f'(?:{a_expr})' + loop + f'(?:{b_expr})' | |
# add/update graph edge | |
out_edges[a][b] = in_edges[b][a] = expr | |
return out_edges[None][None] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::{collections::HashMap, fmt::Write, hash::Hash}; | |
/// calculates a regular expression that is equivalent to the NFA given by `out_edges`. | |
/// | |
/// `out_edges` describes the possible transitions, indexed first by source node and then by destination node. | |
/// a transition is an input symbol in the form of a regex, but it's not limited to a single symbol -- any | |
/// regex is accepted, even the empty regex `''`, meaning we accept something even more general than an NFA-ε. | |
/// the regexes are assumed to be valid syntactically. | |
/// | |
/// there are two virtual states, start and final. these are both referred by the special value `None`, depending | |
/// on if it appears in the source node or the destination node respectively. because of this scheme, the start | |
/// node can only have outgoing edges, and the final node can only have incoming edges. | |
/// | |
/// if the NFA accepts nothing, i.e. start and final are unconnected, KeyError is raised. | |
/// the result will be correct even if there are dead states, but the algorithm will be needlessly slower. | |
/// | |
/// note that the algorithm works in-place on `out_edges` and will reduce it to a graph with (at most) | |
/// a single edge from start to final. the function returns the regex of this edge, for convenience. | |
fn kleene<S: Eq + Hash + Copy>(out_edges: &mut HashMap<Option<S>, HashMap<Option<S>, String>>) -> &str { | |
// build the reversely-indexed graph, since the algorithm needs to look up incoming edges efficiently. | |
// we need to be careful to keep both in sync. | |
let mut in_edges = HashMap::<Option<S>, HashMap<Option<S>, String>>::new(); | |
for (&a, edges) in out_edges { | |
for (&b, expr) in edges { | |
in_edges.entry(b).or_default().insert(a, expr.clone()); | |
} | |
} | |
// process all nodes except the 2 virtual ones, in an arbitrary order. | |
// the resulting regex size varies wildly depending on this order. | |
while let Some((&node, _)) = out_edges | |
.iter() | |
.filter(|&(s, _)| s.is_some()) | |
// a good heuristic to reduce regex size seems to be picking node with least out-edges | |
.min_by_key(|&(_, edges)| edges.len()) | |
{ | |
// 1. remove the node from the graph, but save its current incoming / outgoing edges | |
let mut node_out_edges = out_edges.remove(&node).unwrap_or_default(); | |
let mut node_in_edges = in_edges.remove(&node).unwrap_or_default(); | |
// 2. if there's a looping edge, remove it and (in step 4) intermix it with all transitions | |
node_in_edges.remove(&node); | |
let loop_expr = node_out_edges | |
.remove(&node) | |
.map(|e| format!("(?:{e})*")) | |
.unwrap_or_default(); | |
// 3. finish removing the node from the graph (this is actually part of step 1, but it's simpler now that there's no loop) | |
for (s, _) in &node_out_edges { | |
in_edges.get_mut(s).unwrap().remove(&node); | |
} | |
for (s, _) in &node_in_edges { | |
out_edges.get_mut(s).unwrap().remove(&node); | |
} | |
// 4. combine incoming edges with outgoing edges | |
for (a, a_expr) in &node_in_edges { | |
for (b, b_expr) in &node_out_edges { | |
// if there's already a transition between these 2 nodes, add a pipe (|) | |
let mut expr = out_edges[a].get(b).map(|s| format!("{s}|")).unwrap_or_default(); | |
write!(expr, "(?:{a_expr}){loop_expr}(?:{b_expr})").unwrap(); | |
// add/update graph edge | |
out_edges.get_mut(a).unwrap().insert(*b, expr.clone()); | |
in_edges.get_mut(b).unwrap().insert(*a, expr); | |
} | |
} | |
} | |
return &out_edges[&None][&None]; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment