Skip to content

Instantly share code, notes, and snippets.

@twfarland
Last active September 30, 2021 00:01
Show Gist options
  • Save twfarland/940de76e750adc85b624a6eadcfc7f81 to your computer and use it in GitHub Desktop.
Save twfarland/940de76e750adc85b624a6eadcfc7f81 to your computer and use it in GitHub Desktop.
Incremental Construction of Minimal Acyclic Finite-State Automata (DAFSA aka DAWG) - Ocaml implementation
(* An implementation of algorithm 1 from https://aclanthology.org/J00-1002.pdf *)
open Printf
module IntSet = Set.Make(Int)
module CharSet = Set.Make(Char)
module IntMap = Map.Make(Int)
module CharMap = Map.Make(Char)
module StringSet = Set.Make(String)
module StringMap = Map.Make(String)
type state = int
type states = IntSet.t
type word = char list
type alphabet = CharSet.t
type transitions = (state CharMap.t) IntMap.t
type register = state StringMap.t
type dafsa =
{
ids : state Stream.t;
start : state;
states : states;
final : states;
alphabet : alphabet;
transitions : transitions;
register : register;
}
let rec common_prefix (t : transitions) (s : state) (w : word) : (state * word) =
match (IntMap.find_opt s t) with
| None -> (s, w)
| Some children -> match w with
| [] -> (s, w)
| c :: suffix -> match (CharMap.find_opt c children) with
| None -> (s, w)
| Some next -> common_prefix t next suffix
let has_children (d : dafsa) (s : state) : bool =
IntMap.mem s d.transitions
let get_state_key (d : dafsa) (s : state) : string =
let is_final = match (IntSet.find_opt s d.final) with
| None -> "0_"
| Some _ -> "1_"
in match (IntMap.find_opt s d.transitions) with
| None -> is_final
| Some children ->
let child_to_str k v acc = sprintf "%s %c_%d" acc k v
in CharMap.fold child_to_str children is_final
let replace (d : dafsa) (existing : state) (parent : state) (child : state) (c : char) : dafsa =
{
d with
states = IntSet.remove child d.states;
final = IntSet.remove child d.final;
transitions = d.transitions
|> IntMap.remove child
|> IntMap.update parent (fun children ->
match children with
| None -> Some (CharMap.singleton c existing)
| Some siblings -> Some (CharMap.add c existing siblings))
}
let register (d : dafsa) (child_key : string) (child : state) : dafsa =
{ d with register = StringMap.add child_key child d.register }
let replace_or_register (d : dafsa) (parent : state) (child : state) (c : char) : dafsa =
let child_key = get_state_key d child in
match (StringMap.find_opt child_key d.register) with
| Some existing -> replace d existing parent child c
| None -> register d child_key child
let rec minimize (d: dafsa) (s : state) : dafsa =
match (IntMap.find_opt s d.transitions) with
| None -> d
| Some children ->
match (CharMap.max_binding_opt children) with
| None -> d
| Some (c, child) ->
let to_modify = if has_children d child then minimize d child else d
in replace_or_register to_modify s child c
let rec add_suffix (d : dafsa) (s : state) (w : word) : dafsa =
match w with
| (c :: tail) ->
let child = Stream.next d.ids in
let with_child = {
d with
states = IntSet.add child d.states;
alphabet = CharSet.add c d.alphabet;
transitions = d.transitions
|> IntMap.update s (fun children ->
match children with
| Some siblings -> Some (CharMap.add c child siblings)
| None -> Some (CharMap.singleton c child))
}
in add_suffix with_child child tail
| [] ->
{ d with final = IntSet.add s d.final }
let builder (w : word) (d : dafsa) : dafsa =
let (lastState, currentSuffix) = common_prefix d.transitions d.start w in
let minimized = if has_children d lastState then minimize d lastState else d
in add_suffix minimized lastState currentSuffix
let blank_dafsa () : dafsa =
let ids = Stream.from (fun count -> Some count)
in let start = Stream.next ids
in {
ids;
start;
states = IntSet.singleton start;
final = IntSet.empty;
alphabet = CharSet.empty;
transitions = IntMap.empty;
register = StringMap.empty;
}
let build (words : word Lwt_stream.t) : dafsa =
let seed = blank_dafsa ()
in let built = Lwt_main.run (Lwt_stream.fold builder words seed)
in minimize built seed.start
(* example usage *)
let valid_char c = c != '\r' && c != '\n' && c != ' '
let explode s = String.to_seq s |> List.of_seq |> List.filter valid_char
let implode l = List.to_seq l |> String.of_seq
let rec remove_first a ls =
match ls with
| [] -> []
| (x :: xs) -> if x == a then xs else x :: (remove_first a xs)
let n_permutations (d : dafsa) (w : word) : StringSet.t =
let rec loop (to_match : word) (matched : string) (s :state) =
let ends_here =
if (IntSet.mem s d.final)
then StringSet.singleton matched
else StringSet.empty
in
let ends_lower =
match (IntMap.find_opt s d.transitions) with
| None -> StringSet.empty
| Some children ->
let fold perms c =
let rest = remove_first c to_match in
match (CharMap.find_opt c children) with
| None -> perms
| Some child ->
StringSet.union perms (loop rest (matched ^ (sprintf "%c" c)) child)
in List.fold_left fold StringSet.empty to_match
in StringSet.union ends_here ends_lower
in loop w "" d.start
let wordstream_of_file filename =
let open Lwt in
let open Lwt_io in
let stream = open_file filename ~mode:Input >>= fun input_channel ->
return (read_lines input_channel) >>= fun stream ->
return (Lwt_stream.map explode stream)
in Lwt_main.run stream
let print_transitions (d : dafsa) =
IntMap.iter (fun state children ->
CharMap.iter (fun c next ->
printf "%d %c %d\n" state c next
) children
) d.transitions
let print_register (d : dafsa) =
StringMap.iter (printf "%s -> %d\n") d.register
let () =
print_endline "Building dictionary...";
let stream = wordstream_of_file "./collins_scrabble_2019.txt"
in let scrabble = build stream
in
printf
"Dictionary built with %d states, %d final states, %d transitions \n"
(IntSet.cardinal scrabble.states)
(IntSet.cardinal scrabble.final)
(IntMap.fold (fun _ children sum -> sum + (CharMap.cardinal children)) scrabble.transitions 0);
let rec loop () =
print_endline "Enter a word";
let word = read_line () in
let perms = n_permutations scrabble (explode word) in
StringSet.iter print_endline perms;
printf "%d words found\n" (StringSet.cardinal perms);
loop ()
in loop ()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment