Skip to content

Instantly share code, notes, and snippets.

@tjennings
Created August 3, 2011 18:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tjennings/1123487 to your computer and use it in GitHub Desktop.
Save tjennings/1123487 to your computer and use it in GitHub Desktop.
(ns fraud.detectors.bktree)
(defn root [distance-fn] {:distance-fn distance-fn :children {} :matches []})
(defn new-node [term] {:term term :children {} :matches []})
(defn empty-node? [node]
(= nil (:term node)))
(defn insert
([element node]
(insert element node (:distance-fn node)))
([element node distance-fn]
(if (empty-node? node)
(assoc node :term element)
(let [dist (distance-fn (:term node) element)
children (:children node)
child (get children dist)]
(if (= 0 dist)
(merge-with conj node {:matches element})
(if child
(assoc-in node [:children dist] (insert element child distance-fn))
(assoc-in node [:children dist] (new-node element)))))
)))
(defn query
([term threshold node]
(flatten (query term threshold node (:distance-fn node) [])))
([term threshold node distance-fn matches]
(let [dist (distance-fn (:term node) term)
query-range (range (- dist threshold) (+ 1 (+ dist threshold)))
to-query (remove nil? (map (fn [score] (get (:children node) score)) query-range))]
(concat (if (<= dist threshold) [node])
matches
(map (fn [n] (query term threshold n distance-fn matches)) to-query)))))
(defn treemap [a-fn root]
(if (:term root)
(map a-fn (tree-seq (fn is-branch? [node] (seq (:children node)))
(fn children [node] (vals (:children node)))
root))
(list)))
;; Specs
(ns fraud.detectors.bktree-spec
(:use
[fraud.spec-helper]
[fraud.detectors.bktree :as bk]
[fraud.detectors.hamming]
; [fraud.detectors.levenshtein]
; [fraud.detectors.lcss]
; [fraud.detectors.n_grams]
[speclj.core]))
(defn should-have-same-contents [expected actual]
(should= (sort expected) (sort actual)))
(def chosen-distance-function hamming-distance)
(describe "building a tree"
(it "uses the first term as the root"
(let [tree (bk/insert "a" (bk/root chosen-distance-function))]
(should= "a" (:term tree))
(should= 0 (count (:children tree)))))
(with tree (->> (bk/root chosen-distance-function)
(bk/insert "a")
(bk/insert "b")
(bk/insert "b")))
(it "collects matches in a matched node"
(should-have-same-contents ["b"] (:matches (get (:children @tree) 1))))
(it "inserts terms as child nodes in a sub-tree based on the node distance from the current node"
(should= "b" (:term (get (:children @tree) 1))))
)
(describe "mapping a tree"
(with tree (bk/insert "b" (bk/insert "b" (bk/insert "a" (bk/root chosen-distance-function)))))
(it "maps the tree into a vector of nodes"
(should-have-same-contents ["a" "b"] (bk/treemap (fn [n] (:term n)) @tree)))
(it "returns an empty list when the tree is empty"
(should= true (empty? (bk/treemap (fn [n] n) (bk/root chosen-distance-function))))))
(describe "querying a tree"
(with tree (->> (bk/root chosen-distance-function)
(bk/insert "a")
(bk/insert "b")
(bk/insert "c")
(bk/insert "bb")))
(it "finds all nodes with a distance of 0 from the query term"
(let [result (bk/query "a" 0 @tree)]
(should-have-same-contents ["a"] (map :term result)))
(let [result (bk/query "c" 0 @tree)]
(should-have-same-contents ["c"] (map :term result))))
(it "finds all nodes with a distance of 1 from the query term"
(let [result (bk/query "a" 1 @tree)]
(should-have-same-contents ["a" "b" "c"] (map :term result))))
(it "works for Bo's experiment"
(let [tree (->> (bk/root chosen-distance-function)
(bk/insert "aa")
(bk/insert "ab")
(bk/insert "ba"))
result (bk/query "aa" 1 tree)]
(should-have-same-contents ["aa" "ab" "ba"] (map :term result))))
(describe "with Kevin's big tree"
(with tree (->> (bk/root chosen-distance-function)
(bk/insert "book")
(bk/insert "rook")
(bk/insert "nook")
(bk/insert "took")
(bk/insert "look")
(bk/insert "shook")
(bk/insert "hand")
(bk/insert "sand")
(bk/insert "handle")
(bk/insert "handler")
(bk/insert "handles")
(bk/insert "handlers")
(bk/insert "bland")))
(it "finds all terms 1 step from the root"
(let [result (bk/query "book" 1 @tree)]
(should-have-same-contents ["book" "rook" "nook" "took" "look"] (map :term result))))
(it "finds all terms including root 1 step from the leaf of a branch"
(let [result (bk/query "look" 1 @tree)]
(should-have-same-contents ["book" "rook" "nook" "took" "look"] (map :term result))))
(it "finds all terms including root 1 step from the query, but on an alternate branch"
(let [result (bk/query "sand" 1 @tree)]
(should-have-same-contents ["sand" "hand"] (map :term result))))
(it "finds all terms including root 1 step from a leaf node on an alternate subtree"
(let [result (bk/query "hand" 1 @tree)]
(should-have-same-contents ["sand" "hand"] (map :term result))))
; FAILS because we're using a padded hamming distance as the query fn, we should use something else
; (it "finds all terms that are two steps away from a node"
; (let [result (bk/query "shook" 2 @tree)]
; (should-have-same-contents ["book" "rook" "nook" "took" "look" "shook"] (map :term result))))
; (it "finds results that have common suffix within threshold"
; (let [result (bk/query "bland" 2 @tree)]
; (should-have-same-contents ["bland" "sand" "hand"] (map :term result))))
; (it "finds results that have common prefix within threshold"
; (let [result (bk/query "handle" 2 @tree)]
; (should-have-same-contents ["hand" "handle" "handler" "handles" "handlers"] (map :term result))))
))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment