jackrusher/decision_tree1.clj

## decision_tree1.clj
;; Decision tree learning example taken from:
;; http://www.doc.ic.ac.uk/~sgc/teaching/pre2012/v231/lecture11.html

(defn log2
  "log2(x), log2(0) = 0"
  [x]
  (if (= x 0) 0 (/ (Math/log x) (Math/log 2))))

;;;; Entropy
(defn entropy
  "Entropy(S) = —p+log2(p+) —p- log2(p-)"
  [p-pos p-neg]
  (+ (* (- p-pos) (log2 p-pos))
     (* (- p-neg) (log2 p-neg))))

(entropy 1/4 3/4) ;; 1 positive of four, three negatives of four
;; => 0.8112781244591328

;;;; Information gain
;;;;
;;;; Sv1 = {S4}, Sv2 = {S1, S2}, Sv3 = {S3}
(def S
  [[:v2 :positive]
   [:v2 :negative]
   [:v3 :negative]
   [:v1 :negative]])

;;;; (|Sv1|/|S|) * Entropy(Sv1)
;; Sv1 = subset of S with v1
(def Sv1 (filter #(= :v1 (first %)) S))

;; |Sv1| = how many in Sv1?
(def |Sv1| (count Sv1))
;; => 1

;;|S| = how many total in S?
(def |S| (count S))
;; => 4

;; Entropy(Sv1) = (entropy positive/|S| negative/|S|)
(def Sv1-pos (count (filter #(= :positive (second %)) Sv1)))
;; => 0

(def Sv1-neg (count (filter #(= :negative (second %)) Sv1)))
;; => 1

;; (|Sv1|/|S|) * Entropy(Sv1)
(* 1/4 (entropy 0/1 1/1))
;; => 0.0

;; This is "scaled entropy", programmatically:
(defn scaled-entropy [subset-size set-size n-pos n-neg]
  (* (/ subset-size set-size)
     (entropy
      (if (= 0 n-pos) 0 (/ n-pos subset-size))
      (if (= 0 n-neg) 0 (/ n-neg subset-size)))))

;; repeating the example:
(scaled-entropy |Sv1| |S| Sv1-pos Sv1-neg)
;; => 0.0

;; (|Sv2|/|S|) * Entropy(Sv2)
;; (* 2/4 (entropy 1/2 1/2))
(scaled-entropy 2 4 1 1)
;; = 0.5

;; (|Sv3|/|S|) * Entropy(Sv3)
(scaled-entropy 1 4 0 1)
;; => 0

;;;; Gain(S,A) = Entropy(S) - (Entropy(Sv1) + Entropy(Sv2) + Entropy(Sv3)
;;;; or: 0.811 - (0 + 0.5 + 0) = 0.311
(let [pos-in-s   (count (filter #(= :positive (second %)) S))
      neg-in-s   (count (filter #(= :negative (second %)) S))
      entropy-s  (entropy (/ pos-in-s |S|) (/ neg-in-s |S|))
      values-s   (reduce (fn [a [v p]] (assoc-in a [v p] (inc (get-in a [v p] 0)))) {} S)
      entropies  (map #(let [n-pos (get-in values-s [% :positive] 0)
                             n-neg (get-in values-s [% :negative] 0)
                             subset-size (+ n-pos n-neg)]
                         (scaled-entropy subset-size |S| n-pos n-neg)) (keys values-s))]
  (- entropy-s (reduce + entropies)))
;; => 0.31127812445913283
	;; Decision tree learning example taken from:
	;; http://www.doc.ic.ac.uk/~sgc/teaching/pre2012/v231/lecture11.html

	(defn log2
	"log2(x), log2(0) = 0"
	[x]
	(if (= x 0) 0 (/ (Math/log x) (Math/log 2))))

	;;;; Entropy
	(defn entropy
	"Entropy(S) = —p+log2(p+) —p- log2(p-)"
	[p-pos p-neg]
	(+ (* (- p-pos) (log2 p-pos))
	(* (- p-neg) (log2 p-neg))))

	(entropy 1/4 3/4) ;; 1 positive of four, three negatives of four
	;; => 0.8112781244591328

	;;;; Information gain
	;;;;
	;;;; Sv1 = {S4}, Sv2 = {S1, S2}, Sv3 = {S3}
	(def S
	[[:v2 :positive]
	[:v2 :negative]
	[:v3 :negative]
	[:v1 :negative]])

	;;;; (\|Sv1\|/\|S\|) * Entropy(Sv1)
	;; Sv1 = subset of S with v1
	(def Sv1 (filter #(= :v1 (first %)) S))

	;; \|Sv1\| = how many in Sv1?
	(def \|Sv1\| (count Sv1))
	;; => 1

	;;\|S\| = how many total in S?
	(def \|S\| (count S))
	;; => 4

	;; Entropy(Sv1) = (entropy positive/\|S\| negative/\|S\|)
	(def Sv1-pos (count (filter #(= :positive (second %)) Sv1)))
	;; => 0

	(def Sv1-neg (count (filter #(= :negative (second %)) Sv1)))
	;; => 1

	;; (\|Sv1\|/\|S\|) * Entropy(Sv1)
	(* 1/4 (entropy 0/1 1/1))
	;; => 0.0

	;; This is "scaled entropy", programmatically:
	(defn scaled-entropy [subset-size set-size n-pos n-neg]
	(* (/ subset-size set-size)
	(entropy
	(if (= 0 n-pos) 0 (/ n-pos subset-size))
	(if (= 0 n-neg) 0 (/ n-neg subset-size)))))

	;; repeating the example:
	(scaled-entropy \|Sv1\| \|S\| Sv1-pos Sv1-neg)
	;; => 0.0

	;; (\|Sv2\|/\|S\|) * Entropy(Sv2)
	;; (* 2/4 (entropy 1/2 1/2))
	(scaled-entropy 2 4 1 1)
	;; = 0.5

	;; (\|Sv3\|/\|S\|) * Entropy(Sv3)
	(scaled-entropy 1 4 0 1)
	;; => 0

	;;;; Gain(S,A) = Entropy(S) - (Entropy(Sv1) + Entropy(Sv2) + Entropy(Sv3)
	;;;; or: 0.811 - (0 + 0.5 + 0) = 0.311
	(let [pos-in-s (count (filter #(= :positive (second %)) S))
	neg-in-s (count (filter #(= :negative (second %)) S))
	entropy-s (entropy (/ pos-in-s \|S\|) (/ neg-in-s \|S\|))
	values-s (reduce (fn [a [v p]] (assoc-in a [v p] (inc (get-in a [v p] 0)))) {} S)
	entropies (map #(let [n-pos (get-in values-s [% :positive] 0)
	n-neg (get-in values-s [% :negative] 0)
	subset-size (+ n-pos n-neg)]
	(scaled-entropy subset-size \|S\| n-pos n-neg)) (keys values-s))]
	(- entropy-s (reduce + entropies)))
	;; => 0.31127812445913283