Skip to content

Instantly share code, notes, and snippets.

@jbarber
Created March 12, 2015 12:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jbarber/594de6f48e00642ecbb5 to your computer and use it in GitHub Desktop.
Save jbarber/594de6f48e00642ecbb5 to your computer and use it in GitHub Desktop.
Huffman encoding example
(defn freq2leaves
"Convert a sequence of symbols and their frequency into a sequence of hash-maps with keys :sym :freq"
[xs] (map #(hash-map :sym (first %) :freq (second %)) xs))
(defn huffman
"Perform Huffman encoding for the sequence xs of hash maps from freq1leaves.
Return tree of hash-maps with keys :freq and either :symbol (when a leaf) or
:children (when an internal node). :children is a vector of the child nodes."
[xs]
(letfn [(sort-queue [xs] (sort #(compare (:freq %1) (:freq %2)) xs))
(new-node [xs] {:freq (reduce + (map :freq xs))
:children xs})]
(let [ss (sort-queue xs)]
(if (= 1 (count ss))
(first ss)
(huffman (cons
(new-node (take 2 ss))
(drop 2 ss)))))))
(defn huffman2vec
"Convert the result of huffman to a seq of symbols and their encodings."
([x] (huffman2vec x []))
([x t]
(if-not (:children x)
{(:sym x) t}
(conj
(huffman2vec (-> x :children first) (conj t ::0))
(huffman2vec (-> x :children second) (conj t ::1))))))
;; FIXME: What to do if invalid/unknown symbols are present in xs?
(defn encode
"Encode the sequence xs with the dictionary from huffman2vec"
[dict xs] (flatten (map dict xs)))
(defn decode
"Decode the sequence of symbols xs with the dictionary dict (generated
from huffman2vec). Returns nil if unable to find a sequence in dict in
xs."
[dict xs]
(let [inv-dict (clojure.set/map-invert dict)
upper (apply max (map count (vals dict)))
[sym length] (first
(for [length (range 1 (inc upper))
:let [x (into [] (take length xs))]
:while (= length (count x))
:when (get inv-dict x)]
[(get inv-dict x) length]))]
(if length
[sym (drop length xs)])))
;; Load in a dictionary of words (one word per line) and create a frequency
;; table of the letters in the words, then create the huffman tree from the
;; frequencies and encode a random word from the dictionary. Finally, decode
;; the word and check the result is the same as the original.
(let [words (clojure.string/split (slurp "en.spell") #"\n")
freqs (frequencies (flatten (map seq words)))
dict (huffman2vec (huffman (freq2leaves freqs)))
word (seq (rand-nth words))
encoded (encode dict word)
decoded (loop [xs encoded result []]
(if (empty? xs)
result
(let [[sym remaining] (decode dict xs)]
(recur remaining (conj result sym)))))]
(= word decoded))
;; Test the creation of the Huffman tree.
;; n.b. the order in which children appear is not important.
(=
{:B [::1 ::0] :E [::1 ::1 ::1 ::0] :D [::1 ::1 ::1 ::1] :C [::1 ::1 ::0] :A [::0]}
(huffman2vec (huffman (freq2leaves {:A 24 :B 12 :C 10 :D 8 :E 4}))))
;; Test decode behaviour on unknown sequence
(let [freq {:A 24 :B 12 :C 10 :D 8 :E 4}
dict (huffman2vec (huffman (freq2leaves freq)))
ret (decode dict [::1])]
(nil? ret))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment