Last active
September 30, 2021 00:01
-
-
Save twfarland/940de76e750adc85b624a6eadcfc7f81 to your computer and use it in GitHub Desktop.
Incremental Construction of Minimal Acyclic Finite-State Automata (DAFSA aka DAWG) - Ocaml implementation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(* An implementation of algorithm 1 from https://aclanthology.org/J00-1002.pdf *) | |
open Printf | |
module IntSet = Set.Make(Int) | |
module CharSet = Set.Make(Char) | |
module IntMap = Map.Make(Int) | |
module CharMap = Map.Make(Char) | |
module StringSet = Set.Make(String) | |
module StringMap = Map.Make(String) | |
type state = int | |
type states = IntSet.t | |
type word = char list | |
type alphabet = CharSet.t | |
type transitions = (state CharMap.t) IntMap.t | |
type register = state StringMap.t | |
type dafsa = | |
{ | |
ids : state Stream.t; | |
start : state; | |
states : states; | |
final : states; | |
alphabet : alphabet; | |
transitions : transitions; | |
register : register; | |
} | |
let rec common_prefix (t : transitions) (s : state) (w : word) : (state * word) = | |
match (IntMap.find_opt s t) with | |
| None -> (s, w) | |
| Some children -> match w with | |
| [] -> (s, w) | |
| c :: suffix -> match (CharMap.find_opt c children) with | |
| None -> (s, w) | |
| Some next -> common_prefix t next suffix | |
let has_children (d : dafsa) (s : state) : bool = | |
IntMap.mem s d.transitions | |
let get_state_key (d : dafsa) (s : state) : string = | |
let is_final = match (IntSet.find_opt s d.final) with | |
| None -> "0_" | |
| Some _ -> "1_" | |
in match (IntMap.find_opt s d.transitions) with | |
| None -> is_final | |
| Some children -> | |
let child_to_str k v acc = sprintf "%s %c_%d" acc k v | |
in CharMap.fold child_to_str children is_final | |
let replace (d : dafsa) (existing : state) (parent : state) (child : state) (c : char) : dafsa = | |
{ | |
d with | |
states = IntSet.remove child d.states; | |
final = IntSet.remove child d.final; | |
transitions = d.transitions | |
|> IntMap.remove child | |
|> IntMap.update parent (fun children -> | |
match children with | |
| None -> Some (CharMap.singleton c existing) | |
| Some siblings -> Some (CharMap.add c existing siblings)) | |
} | |
let register (d : dafsa) (child_key : string) (child : state) : dafsa = | |
{ d with register = StringMap.add child_key child d.register } | |
let replace_or_register (d : dafsa) (parent : state) (child : state) (c : char) : dafsa = | |
let child_key = get_state_key d child in | |
match (StringMap.find_opt child_key d.register) with | |
| Some existing -> replace d existing parent child c | |
| None -> register d child_key child | |
let rec minimize (d: dafsa) (s : state) : dafsa = | |
match (IntMap.find_opt s d.transitions) with | |
| None -> d | |
| Some children -> | |
match (CharMap.max_binding_opt children) with | |
| None -> d | |
| Some (c, child) -> | |
let to_modify = if has_children d child then minimize d child else d | |
in replace_or_register to_modify s child c | |
let rec add_suffix (d : dafsa) (s : state) (w : word) : dafsa = | |
match w with | |
| (c :: tail) -> | |
let child = Stream.next d.ids in | |
let with_child = { | |
d with | |
states = IntSet.add child d.states; | |
alphabet = CharSet.add c d.alphabet; | |
transitions = d.transitions | |
|> IntMap.update s (fun children -> | |
match children with | |
| Some siblings -> Some (CharMap.add c child siblings) | |
| None -> Some (CharMap.singleton c child)) | |
} | |
in add_suffix with_child child tail | |
| [] -> | |
{ d with final = IntSet.add s d.final } | |
let builder (w : word) (d : dafsa) : dafsa = | |
let (lastState, currentSuffix) = common_prefix d.transitions d.start w in | |
let minimized = if has_children d lastState then minimize d lastState else d | |
in add_suffix minimized lastState currentSuffix | |
let blank_dafsa () : dafsa = | |
let ids = Stream.from (fun count -> Some count) | |
in let start = Stream.next ids | |
in { | |
ids; | |
start; | |
states = IntSet.singleton start; | |
final = IntSet.empty; | |
alphabet = CharSet.empty; | |
transitions = IntMap.empty; | |
register = StringMap.empty; | |
} | |
let build (words : word Lwt_stream.t) : dafsa = | |
let seed = blank_dafsa () | |
in let built = Lwt_main.run (Lwt_stream.fold builder words seed) | |
in minimize built seed.start | |
(* example usage *) | |
let valid_char c = c != '\r' && c != '\n' && c != ' ' | |
let explode s = String.to_seq s |> List.of_seq |> List.filter valid_char | |
let implode l = List.to_seq l |> String.of_seq | |
let rec remove_first a ls = | |
match ls with | |
| [] -> [] | |
| (x :: xs) -> if x == a then xs else x :: (remove_first a xs) | |
let n_permutations (d : dafsa) (w : word) : StringSet.t = | |
let rec loop (to_match : word) (matched : string) (s :state) = | |
let ends_here = | |
if (IntSet.mem s d.final) | |
then StringSet.singleton matched | |
else StringSet.empty | |
in | |
let ends_lower = | |
match (IntMap.find_opt s d.transitions) with | |
| None -> StringSet.empty | |
| Some children -> | |
let fold perms c = | |
let rest = remove_first c to_match in | |
match (CharMap.find_opt c children) with | |
| None -> perms | |
| Some child -> | |
StringSet.union perms (loop rest (matched ^ (sprintf "%c" c)) child) | |
in List.fold_left fold StringSet.empty to_match | |
in StringSet.union ends_here ends_lower | |
in loop w "" d.start | |
let wordstream_of_file filename = | |
let open Lwt in | |
let open Lwt_io in | |
let stream = open_file filename ~mode:Input >>= fun input_channel -> | |
return (read_lines input_channel) >>= fun stream -> | |
return (Lwt_stream.map explode stream) | |
in Lwt_main.run stream | |
let print_transitions (d : dafsa) = | |
IntMap.iter (fun state children -> | |
CharMap.iter (fun c next -> | |
printf "%d %c %d\n" state c next | |
) children | |
) d.transitions | |
let print_register (d : dafsa) = | |
StringMap.iter (printf "%s -> %d\n") d.register | |
let () = | |
print_endline "Building dictionary..."; | |
let stream = wordstream_of_file "./collins_scrabble_2019.txt" | |
in let scrabble = build stream | |
in | |
printf | |
"Dictionary built with %d states, %d final states, %d transitions \n" | |
(IntSet.cardinal scrabble.states) | |
(IntSet.cardinal scrabble.final) | |
(IntMap.fold (fun _ children sum -> sum + (CharMap.cardinal children)) scrabble.transitions 0); | |
let rec loop () = | |
print_endline "Enter a word"; | |
let word = read_line () in | |
let perms = n_permutations scrabble (explode word) in | |
StringSet.iter print_endline perms; | |
printf "%d words found\n" (StringSet.cardinal perms); | |
loop () | |
in loop () | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment