Skip to content

Instantly share code, notes, and snippets.

@honzabrecka
Last active February 20, 2017 10:14
Show Gist options
  • Save honzabrecka/1a7168ab6c1d9e40cfcd to your computer and use it in GitHub Desktop.
Save honzabrecka/1a7168ab6c1d9e40cfcd to your computer and use it in GitHub Desktop.
Bayesian text classifier (naive Bayes and Fisher method) implemented in ClojureScript.
;------------------------------------------------
; bayesian
; Starting point for training: a map with no recorded features and no
; recorded categories. `train` fills :features with feature -> {category count}
; entries and :categories with category -> count entries.
(def empty-training-set {:features {} :categories {}})
(defn train
  "Builds a training set for a single document: `category` is recorded once
  under :categories, and every element of `features` is recorded under
  :features with a count of 1 for `category`."
  [features category]
  (let [seed (assoc-in empty-training-set [:categories category] 1)
        add-feature (fn [acc feature]
                      (assoc-in acc [:features feature] {category 1}))]
    (reduce add-feature seed features)))
(defn merge-trainings
  "Combines two training sets `a` and `b` by adding up their counters.
  Values that are maps (per-feature category counts) are merged with +,
  plain numbers (category counts) are added directly."
  [a b]
  (letfn [(combine [x y]
            (if (map? x)
              (merge-with + x y)
              (+ x y)))
          (merge-section [left right]
            (merge-with combine left right))]
    (merge-with merge-section a b)))
(defn- inv-chi2
  "Sums the series exp(-m) * m^i / i! for i from 0 up to (but excluding)
  floor(df / 2), where m = chi / 2, and caps the result at 1."
  [chi df]
  (let [half-chi (* chi 0.5)
        first-term (.exp js/Math (- half-chi))
        upper (.floor js/Math (* df 0.5))]
    ; Terms are kept in a list with the newest term first, so each new term
    ; is derived from (first terms) and the final sum runs newest -> oldest.
    (loop [terms (list first-term)
           i 1]
      (if (< i upper)
        (recur (conj terms (* (first terms) (/ half-chi i)))
               (inc i))
        (min (reduce + 0 terms) 1)))))
(defn- get-categories
  "Returns the category names stored under :categories of `training-set`
  (nil when there are none)."
  [training-set]
  (-> training-set
      :categories
      keys))
(defn count-feature
  "Returns how many times `feature` was recorded for `category` in
  `training-set`.

  Always returns a number: 0 when the category is unknown, when the feature
  is unknown, or when the feature is known but was never seen in that
  category. (Previously the last case returned nil, which only worked
  because JavaScript arithmetic coerces null to 0.)"
  [training-set feature category]
  (if (contains? (:categories training-set) category)
    ; The not-found default covers both a missing feature and a feature
    ; that has no entry for this category.
    (get-in training-set [:features feature category] 0)
    0))
(defn count-category
  "Returns the count stored for `category` under :categories of
  `training-set`, or 0 when the category is not present."
  [training-set category]
  (if-let [entry (find (:categories training-set) category)]
    (val entry)
    0))
(defn count-total
  "Returns the sum of all category counts in `training-set` (0 when there
  are no categories)."
  [training-set]
  (->> training-set
       :categories
       vals
       (reduce + 0)))
(defn feature-probability
  "Returns the count of `feature` in `category` divided by the count of
  `category`, or 0 when the category count is 0."
  [training-set feature category]
  (let [in-category (count-category training-set category)]
    (if (= in-category 0)
      0
      (/ (count-feature training-set feature category)
         in-category))))
(defn category-probability
  "Returns the feature probability of `feature` in `category` divided by the
  sum of its feature probabilities over all categories of `training-set`.
  Returns 0 when the feature probability in `category` is 0."
  [training-set feature category]
  (let [own (feature-probability training-set feature category)]
    (if (= own 0)
      0
      (let [per-category (map #(feature-probability training-set feature %)
                              (get-categories training-set))
            overall (reduce + 0 per-category)]
        (/ own overall)))))
(defn weighted-probability
  "Blends the probability computed by `f` (called as
  (f training-set feature category)) with a starting value of 0.5.
  The blend is weighted by how many times `feature` was counted across all
  categories: (0.5 + totals * probability) / (totals + 1)."
  [training-set feature category f]
  (let [start 0.5
        seen-counts (map #(count-feature training-set feature %)
                         (get-categories training-set))
        totals (reduce + 0 seen-counts)
        probability (f training-set feature category)
        numerator (+ start (* totals probability))
        denominator (+ totals 1)]
    (/ numerator denominator)))
(defn document-probability
  "Returns the product of the weighted feature probabilities of every
  element of `features` for `category` (1 when `features` is empty)."
  [training-set features category]
  (let [weighted (fn [feature]
                   (weighted-probability training-set
                                         feature
                                         category
                                         feature-probability))]
    (reduce * 1 (map weighted features))))
(defn naive-probability
  "Returns the document probability of `features` for `category` multiplied
  by the share of `category` among all category counts."
  [training-set features category]
  (let [likelihood (document-probability training-set features category)
        prior (/ (count-category training-set category)
                 (count-total training-set))]
    (* likelihood prior)))
(defn fisher-probability
  "Scores `features` against `category` using the Fisher method: the
  weighted category probabilities of all features are combined as
  -2 * ln(product) and passed to `inv-chi2` with 2 * (count features)
  degrees of freedom.

  The logarithm of the product is computed as a sum of logarithms. This is
  mathematically the same value, but multiplying many probabilities first
  can underflow to 0, which turns the logarithm into -Infinity and the
  final score into NaN."
  [training-set features category]
  (let [log-of-product (reduce (fn [acc feature]
                                 (+ acc
                                    (js/Math.log
                                     (weighted-probability training-set
                                                           feature
                                                           category
                                                           category-probability))))
                               0
                               features)]
    (inv-chi2 (* -2 log-of-product)
              (* 2 (count features)))))
(defn clasify
  "Returns a map from every category of `training-set` to the score
  computed by `f`, which is called as (f training-set features category)."
  [f training-set features]
  (let [score (fn [category]
                [category (f training-set features category)])]
    (into {} (map score (get-categories training-set)))))
;------------------------------------------------
; use
(defn get-words
  "Splits `sentence` into distinct lower-cased words.

  The sentence is split on any run of whitespace (spaces, tabs, newlines).
  Only tokens longer than 1 and shorter than 20 characters are kept, so
  empty tokens produced by leading whitespace are dropped as well.
  Returns a lazy sequence in order of first occurrence."
  [sentence]
  ; clojure.string/split expects a regular expression as its separator.
  (->> (str/split sentence #"\s+")
       (filter #(< 1 (count %) 20))
       (map str/lower-case)
       (distinct)))
; Sample labelled data: each entry is a [sentence category] pair.
(def raw-data [["hey Honza what are you doing" "good"]
["make quick money now" "bad"]
["watch this quick brown fox" "good"]])
; Training set built by training on every [sentence category] pair of
; raw-data and merging the results, starting from empty-training-set.
(def trained-data
  (reduce (fn [acc [sentence category]]
            (merge-trainings acc (train (get-words sentence) category)))
          empty-training-set
          raw-data))
; Example: score a new sentence against every trained category using the
; Fisher method; the result is a map of category -> score.
(clasify fisher-probability trained-data (get-words "do you want to make some quick money"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment