Skip to content

Instantly share code, notes, and snippets.

@samklr samklr/anagrams.ml
Created Oct 29, 2012

Embed
What would you like to do?
Anagrams
(*s {\bf Anagrams.} The following program finds all the anagrams of a given
set of characters among a dictionary. Such a dictionary can be built by
the following program given a list of files containing words (one per
line). *)
(*s The dictionary is implemented as a \emph{trie}. It is a multi-branching
tree, where branches are labelled with characters. Each node contains
a boolean which says if the word corresponding to the path from the root
belongs to the dictionary. The branches are implemented as maps from
characters to dictionaries. *)
module Cmap = Map.Make(struct type t = char let compare = compare end)
type tree = Node of bool * tree Cmap.t
let empty = Node (false, Cmap.empty)
(*s Insertion of a new word in the trie is just a matter of descending in
the tree, taking the branches corresponding to the successive characters.
Each time a branch does not exist, we continue the insertion in a new
empty tree. When the insertion is done in the subtree, we update the
branching to the new subtree. *)
let add t w =
let n = String.length w in
let rec addrec i (Node (b,m) as t) =
if i = n then
if b then t else Node (true,m)
else
let c = w.[i] in
let br = try Cmap.find c m with Not_found -> empty in
let t' = addrec (succ i) br in
Node (b, Cmap.add c t' m)
in
addrec 0 t
(*s Even if it is not necessary, here is the function [mem] to tests
whether a word belongs or not to the dictionary. *)
let mem t w =
let n = String.length w in
let rec look i (Node (b,m)) =
if i = n then
b
else
try look (succ i) (Cmap.find w.[i] m) with Not_found -> false
in
look 0 t
(*s The algorithm for anagrans is the following. We start from the root
of the tree with all the initial characters. Then, for each
\emph{distinct} character [c], we descend in the corresponding branch,
and apply recursively the algorithm with \emph{one occurrence} of [c] being
removed. When the collection of characters is empty, we simply test
the boolean at the current node. Whenever a branch is missing, we stop
the exploration.
It appears that we need to deal with \emph{multi-sets} of characters.
Indeed, we have to keep the collection of characters which have not yet
been examined, and it may contain repetitions. *)
(*s Multi-sets of characters are implemented as maps from characters to
positive integers. The operations of insertion and deletion are
easily implemented. We also provide a function [ms_of_string] to
build the multi-set corresponding to a given string. *)
let ms_add c m =
try let n = Cmap.find c m in Cmap.add c (succ n) m
with Not_found -> Cmap.add c 1 m
let ms_remove c m =
let n = Cmap.find c m in
if n = 1 then Cmap.remove c m else Cmap.add c (pred n) m
let ms_of_string w =
let n = String.length w in
let rec add i = if i = n then Cmap.empty else ms_add w.[i] (add (succ i)) in
add 0
(*s Then implementing the above algorithm is rather easy. During the
exploration, we keep three values: first, the current path [pref]
from the root of the initial tree, in reverse order; secondly, the
current node being examined, [(b,m)]; and finally, the current
multi-set of characters [s]. *)
let subset = ref true
let rec print_prefix = function
| [] -> ()
| c::l -> print_prefix l; print_char c
let anagram d w =
let rec traverse pref (Node (b,m)) s =
if b && (s = Cmap.empty || !subset) then begin
print_prefix pref; print_newline ()
end;
Cmap.iter
(fun c _ ->
try traverse (c::pref) (Cmap.find c m) (ms_remove c s)
with Not_found -> ()) s
in
traverse [] d (ms_of_string w)
(*s Building the dictionary. The function [add_one_file] read all the
words contained in file [file] and inserts them in the tree [t].
Then function [build_dict] constructs the whole dictionary by
successively inserting the words for the given list of files. *)
let add_one_file t file =
Printf.printf "Reading %s\n" file; flush stdout;
let ch = open_in file in
let rec read t =
try let w = input_line ch in read (add t w) with End_of_file -> t
in
let t' = read t in close_in ch; t'
let build_dict = List.fold_left add_one_file empty
(*s The following function [print_all] prints all the words of a given
dictionary. Only used for checks (option \texttt{-a}). *)
let print_all d =
let rec traverse pref (Node (b, m)) =
if b then begin print_prefix pref; print_newline () end;
Cmap.iter (fun c t -> traverse (c::pref) t) m
in
traverse [] d
(*s The main program. It mainly provides two ways of invoking the program:
first, the option \texttt{-b} will build the dictionary from the given
files and put it in the file ["dict.out"];
secondly, the program invoked with a word on the command line
will print all the anagrams for this word.
Option \texttt{-e} specifies exact anagrams (i.e., with all characters
used). *)
let output_dict d =
let ch = open_out "dict.out" in output_value ch d; close_out ch
let input_dict () =
let ch = open_in "dict.out" in let d = input_value ch in close_in ch; d
let usage () =
prerr_endline "usage:";
prerr_endline " anagram -b files";
prerr_endline " anagram [-e] word"
let main () =
match List.tl (Array.to_list Sys.argv) with
| [] | "-h" :: _ -> usage ()
| "-a" :: _ -> let d = input_dict () in print_all d
| "-b" :: files -> let d = build_dict files in output_dict d
| "-e" :: w :: _ -> subset := false; let d = input_dict () in anagram d w
| w :: _ -> let d = input_dict () in anagram d w
let _ = Printexc.catch main ()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.