Last active
February 20, 2017 10:14
-
-
Save honzabrecka/1a7168ab6c1d9e40cfcd to your computer and use it in GitHub Desktop.
Bayesian classifier implemented in Clojure (well, it's actually in ClojureScript).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;------------------------------------------------ | |
; Bayesian classifier
;; Blank training set: a map with an empty :features map and an empty
;; :categories map. Used as the seed value when training and merging.
(def empty-training-set
  {:features   {}
   :categories {}})
(defn train
  "Builds a training set describing one document.

  The result starts from `empty-training-set`, records `category` with a
  count of 1 under :categories, and maps every item of `features` to
  `{category 1}` under :features."
  [features category]
  (let [seeded      (assoc-in empty-training-set [:categories category] 1)
        add-feature (fn [acc feature]
                      (assoc-in acc [:features feature] {category 1}))]
    (reduce add-feature seeded features)))
(defn merge-trainings
  "Combines two training sets `a` and `b` into one.

  Top-level entries present in both are merged key by key: values that are
  maps are merged with `+` on colliding keys, any other values are added
  together with `+`."
  [a b]
  (letfn [(combine [x y]
            ;; nested per-category maps are summed key-wise, plain counts directly
            (if (map? x)
              (merge-with + x y)
              (+ x y)))
          (merge-section [left right]
            (merge-with combine left right))]
    (merge-with merge-section a b)))
(defn- inv-chi2
  "Sums a series whose first term is exp(-chi/2) and whose i-th following
  term is the previous term multiplied by (chi/2)/i, for i from 1 up to
  (but not including) floor(df/2). The sum is capped at 1."
  [chi, df]
  (let [half-chi   (* chi 0.5)
        first-term (js/Math.exp (- half-chi))
        limit      (js/Math.floor (* df 0.5))]
    ;; `terms` is a list with the most recently computed term at the front.
    (loop [i     1
           terms (list first-term)]
      (if (< i limit)
        (recur (inc i)
               (conj terms (* (first terms) (/ half-chi i))))
        (min (reduce + 0 terms) 1)))))
(defn- get-categories
  "Returns the keys of the :categories map of `training-set`
  (nil when that map is empty or missing)."
  [training-set]
  (-> training-set :categories keys))
(defn count-feature
  "Returns how many times `feature` was recorded for `category` in
  `training-set`.

  Returns 0 when the category is not present under :categories, when the
  feature is not present under :features, or when the feature is known but
  has no entry for this category. The last case previously yielded nil,
  which only behaved like 0 through host number coercion."
  [training-set feature category]
  (if (contains? (:categories training-set) category)
    ;; the default of 0 covers both an unknown feature and a known feature
    ;; that has no count for this category
    (get-in training-set [:features feature category] 0)
    0))
(defn count-category
  "Returns the count stored for `category` under :categories in
  `training-set`, or 0 when the category is not present."
  [training-set category]
  (get-in training-set [:categories category] 0))
(defn count-total
  "Returns the sum of all counts stored under :categories in
  `training-set` (0 when there are none)."
  [training-set]
  (apply + (vals (:categories training-set))))
(defn feature-probability
  "Returns the count of `feature` in `category` divided by the count of
  `category`, or 0 when the category count is 0."
  [training-set feature category]
  (let [in-category (count-category training-set category)
        occurrences (count-feature training-set feature category)]
    (if (not= in-category 0)
      (/ occurrences in-category)
      0)))
(defn category-probability
  "Returns the feature probability of `feature` in `category` divided by
  the sum of its feature probabilities over every category of
  `training-set`. Returns 0 when the feature probability for `category`
  is 0."
  [training-set feature category]
  (let [in-category (feature-probability training-set feature category)]
    (if (not= in-category 0)
      (let [per-category (map #(feature-probability training-set feature %)
                              (get-categories training-set))]
        (/ in-category (reduce + 0 per-category)))
      0)))
(defn weighted-probability
  "Blends the probability computed by `f` with a fixed starting value of 0.5.

  `f` is called as (f training-set feature category). With `totals` being
  the count of `feature` summed over all categories, the result is
  (0.5 + totals * probability) / (totals + 1)."
  [training-set feature category f]
  (let [per-category (map #(count-feature training-set feature %)
                          (get-categories training-set))
        totals       (reduce + 0 per-category)
        probability  (f training-set feature category)
        assumed      0.5]
    (/ (+ assumed (* totals probability))
       (+ totals 1))))
(defn document-probability
  "Returns the product, over all `features`, of the weighted
  `feature-probability` of each feature in `category` (1 when `features`
  is empty)."
  [training-set features category]
  (let [per-feature (map #(weighted-probability training-set % category feature-probability)
                         features)]
    (reduce * 1 per-feature)))
(defn naive-probability
  "Returns the document probability of `features` in `category` multiplied
  by the share of `category` among all category counts in `training-set`."
  [training-set features category]
  (let [likelihood (document-probability training-set features category)
        prior      (/ (count-category training-set category)
                      (count-total training-set))]
    (* likelihood prior)))
(defn fisher-probability
  "Scores `features` against `category` by feeding -2 times the summed
  natural logs of the weighted `category-probability` of each feature into
  `inv-chi2`, with 2 * (count features) as the second argument.

  The logs are summed rather than taking the log of the product: a product
  of many probabilities can underflow to 0, whose log is -Infinity, which
  makes `inv-chi2` produce NaN and ultimately report 1."
  [training-set features category]
  (let [log-sum (reduce (fn [acc feature]
                          (+ acc
                             (js/Math.log
                              (weighted-probability training-set
                                                    feature
                                                    category
                                                    category-probability))))
                        0
                        features)]
    (inv-chi2 (* -2 log-sum)
              (* 2 (count features)))))
(defn clasify
  "Returns a map from every category of `training-set` to the value of
  (f training-set features category)."
  [f training-set features]
  (into {}
        (map (fn [category]
               [category (f training-set features category)])
             (get-categories training-set))))
;------------------------------------------------ | |
; usage example
(defn get-words
  "Splits `sentence` on runs of whitespace and returns the distinct,
  lower-cased words whose length is greater than 1 and less than 20.

  The separator is a regular expression, as `str/split` expects, so tabs
  and newlines also separate words and the call does not rely on the host
  accepting a plain string separator."
  [sentence]
  (->> (str/split sentence #"\s+")
       ;; drops empty strings, single characters and over-long tokens
       (filter #(let [l (count %)]
                  (and (> l 1) (< l 20))))
       (map str/lower-case)
       (distinct)))
;; Sample labelled sentences: each entry is a [sentence category] pair.
(def raw-data
  [["hey Honza what are you doing" "good"]
   ["make quick money now"         "bad"]
   ["watch this quick brown fox"   "good"]])
;; Training set built by training on every [sentence category] pair of
;; `raw-data` and merging the results, starting from `empty-training-set`.
(def trained-data
  (reduce (fn [acc [sentence category]]
            (merge-trainings acc (train (get-words sentence) category)))
          empty-training-set
          raw-data))
;; Example: score a new sentence against every trained category using
;; `fisher-probability`.
(->> "do you want to make some quick money"
     get-words
     (clasify fisher-probability trained-data))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment