@jackrusher · Created February 12, 2013
A worked example for a friend showing how decision tree learning works.
;; Decision tree learning example taken from:
;; http://www.doc.ic.ac.uk/~sgc/teaching/pre2012/v231/lecture11.html
(defn log2
  "log2(x), log2(0) = 0"
  [x]
  (if (= x 0) 0 (/ (Math/log x) (Math/log 2))))
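;; A couple of sanity checks on log2, added here for illustration (not
;; part of the original gist). The 0 case matters because the entropy
;; sum below treats 0 * log2(0) as 0.
(log2 8) ;; ≈ 3.0
(log2 0) ;; => 0, by the special case above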
;;;; Entropy
(defn entropy
  "Entropy(S) = -p+ log2(p+) - p- log2(p-)"
  [p-pos p-neg]
  (+ (* (- p-pos) (log2 p-pos))
     (* (- p-neg) (log2 p-neg))))
(entropy 1/4 3/4) ;; 1 positive of four, three negatives of four
;; => 0.8112781244591328
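;; Two boundary cases, added here for illustration (not in the original
;; gist): an even split is maximally uncertain, while a pure set has no
;; uncertainty at all (using the log2(0) = 0 convention).
(entropy 1/2 1/2) ;; => 1.0
(entropy 1 0)     ;; => 0.0 (all positives, no uncertainty)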
;;;; Information gain
;;;;
;;;; S holds the examples S1..S4 as [attribute-value classification] pairs,
;;;; so Sv1 = {S4}, Sv2 = {S1, S2}, Sv3 = {S3}
(def S
  [[:v2 :positive]
   [:v2 :negative]
   [:v3 :negative]
   [:v1 :negative]])
;;;; (|Sv1|/|S|) * Entropy(Sv1)
;; Sv1 = subset of S with v1
(def Sv1 (filter #(= :v1 (first %)) S))
;; |Sv1| = how many in Sv1?
(def |Sv1| (count Sv1))
;; => 1
;; |S| = how many total in S?
(def |S| (count S))
;; => 4
;; Entropy(Sv1) = (entropy positive/|Sv1| negative/|Sv1|)
(def Sv1-pos (count (filter #(= :positive (second %)) Sv1)))
;; => 0
(def Sv1-neg (count (filter #(= :negative (second %)) Sv1)))
;; => 1
;; (|Sv1|/|S|) * Entropy(Sv1)
(* 1/4 (entropy 0/1 1/1))
;; => 0.0
;; This is "scaled entropy", programmatically:
(defn scaled-entropy [subset-size set-size n-pos n-neg]
  (* (/ subset-size set-size)
     (entropy
      (if (= 0 n-pos) 0 (/ n-pos subset-size))
      (if (= 0 n-neg) 0 (/ n-neg subset-size)))))
;; repeating the example:
(scaled-entropy |Sv1| |S| Sv1-pos Sv1-neg)
;; => 0.0
;; (|Sv2|/|S|) * Entropy(Sv2)
;; (* 2/4 (entropy 1/2 1/2))
(scaled-entropy 2 4 1 1)
;; => 0.5
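;; The same numbers can be derived from S instead of being written by
;; hand; Sv2, Sv2-pos, and Sv2-neg are names introduced here for
;; illustration, mirroring the Sv1 definitions above.
(def Sv2 (filter #(= :v2 (first %)) S))
(def Sv2-pos (count (filter #(= :positive (second %)) Sv2)))
(def Sv2-neg (count (filter #(= :negative (second %)) Sv2)))
(scaled-entropy (count Sv2) |S| Sv2-pos Sv2-neg)
;; => 0.5, matching the hand-computed value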
;; (|Sv3|/|S|) * Entropy(Sv3)
(scaled-entropy 1 4 0 1)
;; => 0
;;;; Gain(S,A) = Entropy(S) - sum over v of (|Sv|/|S|) * Entropy(Sv)
;;;; or: 0.811 - (0 + 0.5 + 0) = 0.311
(let [pos-in-s  (count (filter #(= :positive (second %)) S))
      neg-in-s  (count (filter #(= :negative (second %)) S))
      entropy-s (entropy (/ pos-in-s |S|) (/ neg-in-s |S|))
      values-s  (reduce (fn [a [v p]] (assoc-in a [v p] (inc (get-in a [v p] 0)))) {} S)
      entropies (map #(let [n-pos (get-in values-s [% :positive] 0)
                            n-neg (get-in values-s [% :negative] 0)
                            subset-size (+ n-pos n-neg)]
                        (scaled-entropy subset-size |S| n-pos n-neg))
                     (keys values-s))]
  (- entropy-s (reduce + entropies)))
;; => 0.31127812445913283
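;; The whole computation can be packaged as a reusable function over any
;; collection of [attribute-value classification] pairs shaped like S.
;; This is a sketch; `information-gain` is a name introduced here, not
;; something from the original lecture notes.
(defn information-gain
  "Gain(S,A) = Entropy(S) - sum of (|Sv|/|S|) * Entropy(Sv) over values v."
  [s]
  (let [total     (count s)
        n-pos     (count (filter #(= :positive (second %)) s))
        n-neg     (count (filter #(= :negative (second %)) s))
        entropy-s (entropy (/ n-pos total) (/ n-neg total))
        by-value  (group-by first s)
        entropies (map (fn [[_ subset]]
                         (scaled-entropy (count subset) total
                                         (count (filter #(= :positive (second %)) subset))
                                         (count (filter #(= :negative (second %)) subset))))
                       by-value)]
    (- entropy-s (reduce + entropies))))

(information-gain S)
;; => 0.31127812445913283, the same value as derived step by step above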