Skip to content

Instantly share code, notes, and snippets.

@willtim
Created January 21, 2012 00:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save willtim/1650422 to your computer and use it in GitHub Desktop.
Save willtim/1650422 to your computer and use it in GitHub Desktop.
Building a book index
(use '[clojure.core.match :only [match]])
;; quick and simple trie implementation from the description
;; in the book "Purely Functional Data Structures" by Okasaki
(def empty-trie [:trie :value nil :edges {}])
(defn lookup [key trie]
(match [key trie]
[[] [:trie :value nil :edges _]] nil
[[] [:trie :value x :edges _]] x
[[k & ks] [:trie :value v :edges m]] (lookup ks (get m k))
))
(defn bind [key val trie]
(match [key val trie]
[[] x [:trie :value nil :edges m]] [:trie :value x :edges m]
[[k & ks] x [:trie :value v :edges m]]
(let [t (get m k empty-trie)
t* (bind ks x t)]
[:trie :value v :edges (assoc m k t*)])))
;; test the trie
(= 1 (lookup (vec "one")
(bind (vec "one") 1 empty-trie)))
(= 2
(->>
empty-trie
(bind (vec "one") 1)
(bind (vec "two") 2)
(bind (vec "three") 3)
(lookup (vec "two"))))
;;;;;; using this trie to generate an index for a book
;; the book
(def example-book ["Page 1. The quick brown fox jumps over the lazy dog"
"Page 2. All good things must come to and end"
"Page 3. The older the fiddler the sweeter the tune"])
;; the phrases we want to limit the index too
(def custom-phrases ["good things" "lazy dog" "dog" "brown fox" "fox" "fiddler"])
;;;;;; build the tries from the pages
(defn tokenize [s] (seq (.split #" " s)))
(defn index-page [trie count page]
(let [words (tokenize page)
phrases (map (fn [[a b]] (apply str a " " b))
(partition 2 (interleave words (rest words))))
trie* (reduce
(fn [trie word] (bind (vec word) count trie))
trie words)
trie** (reduce
(fn [trie phrase] (bind (vec phrase) count trie))
trie* phrases)]
trie**))
(defn index-book [pages]
(let [[trie _]
(reduce (fn [[trie count] page]
[(index-page trie count page) (inc count)])
[empty-trie 1]
pages)]
trie))
;; test
(= 1 (lookup (vec "lazy dog") (index-book example-book)))
(= 3 (lookup (vec "fiddler") (index-book example-book)))
;; finally we build the custom index
(defn build-custom-index [pages phrases]
(let [full-index (index-book pages)]
(map (fn [item] [item (lookup (vec item) full-index)]) (sort phrases))))
;; test
(= [ ["brown fox" 1] ["dog" 1] ["fiddler" 3]
["fox" 1] ["good things" 2] ["lazy dog" 1]]
(build-custom-index example-book custom-phrases))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment