Created
March 12, 2015 12:50
-
-
Save jbarber/594de6f48e00642ecbb5 to your computer and use it in GitHub Desktop.
Huffman encoding example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defn freq2leaves | |
"Convert a sequence of symbols and their frequency into a sequence of hash-maps with keys :sym :freq" | |
[xs] (map #(hash-map :sym (first %) :freq (second %)) xs)) | |
(defn huffman | |
"Perform Huffman encoding for the sequence xs of hash maps from freq1leaves. | |
Return tree of hash-maps with keys :freq and either :symbol (when a leaf) or | |
:children (when an internal node). :children is a vector of the child nodes." | |
[xs] | |
(letfn [(sort-queue [xs] (sort #(compare (:freq %1) (:freq %2)) xs)) | |
(new-node [xs] {:freq (reduce + (map :freq xs)) | |
:children xs})] | |
(let [ss (sort-queue xs)] | |
(if (= 1 (count ss)) | |
(first ss) | |
(huffman (cons | |
(new-node (take 2 ss)) | |
(drop 2 ss))))))) | |
(defn huffman2vec | |
"Convert the result of huffman to a seq of symbols and their encodings." | |
([x] (huffman2vec x [])) | |
([x t] | |
(if-not (:children x) | |
{(:sym x) t} | |
(conj | |
(huffman2vec (-> x :children first) (conj t ::0)) | |
(huffman2vec (-> x :children second) (conj t ::1)))))) | |
;; FIXME: What to do if invalid/unknown symbols are present in xs? | |
(defn encode | |
"Encode the sequence xs with the dictionary from huffman2vec" | |
[dict xs] (flatten (map dict xs))) | |
(defn decode | |
"Decode the sequence of symbols xs with the dictionary dict (generated | |
from huffman2vec). Returns nil if unable to find a sequence in dict in | |
xs." | |
[dict xs] | |
(let [inv-dict (clojure.set/map-invert dict) | |
upper (apply max (map count (vals dict))) | |
[sym length] (first | |
(for [length (range 1 (inc upper)) | |
:let [x (into [] (take length xs))] | |
:while (= length (count x)) | |
:when (get inv-dict x)] | |
[(get inv-dict x) length]))] | |
(if length | |
[sym (drop length xs)]))) | |
;; Load in a dictionary of words (one word per line) and create a frequency | |
;; table of the letters in the words, then create the huffman tree from the | |
;; frequencies and encode a random word from the dictionary. Finally, decode | |
;; the word and check the result is the same as the original. | |
(let [words (clojure.string/split (slurp "en.spell") #"\n") | |
freqs (frequencies (flatten (map seq words))) | |
dict (huffman2vec (huffman (freq2leaves freqs))) | |
word (seq (rand-nth words)) | |
encoded (encode dict word) | |
decoded (loop [xs encoded result []] | |
(if (empty? xs) | |
result | |
(let [[sym remaining] (decode dict xs)] | |
(recur remaining (conj result sym)))))] | |
(= word decoded)) | |
;; Test the creation of the Huffman tree. | |
;; n.b. the order in which children appear is not important. | |
(= | |
{:B [::1 ::0] :E [::1 ::1 ::1 ::0] :D [::1 ::1 ::1 ::1] :C [::1 ::1 ::0] :A [::0]} | |
(huffman2vec (huffman (freq2leaves {:A 24 :B 12 :C 10 :D 8 :E 4})))) | |
;; Test decode behaviour on unknown sequence | |
(let [freq {:A 24 :B 12 :C 10 :D 8 :E 4} | |
dict (huffman2vec (huffman (freq2leaves freq))) | |
ret (decode dict [::1])] | |
(nil? ret)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment