Skip to content

Instantly share code, notes, and snippets.

@evanlh
Created February 24, 2013 21:38
Show Gist options
  • Save evanlh/5025768 to your computer and use it in GitHub Desktop.
Save evanlh/5025768 to your computer and use it in GitHub Desktop.
Decision Tree Homework, first pass
;; Begin Evan's attempt at section 11.3
;; from http://www.doc.ic.ac.uk/~sgc/teaching/pre2012/v231/lecture11.html
(def examples
[{:weather :sunny, :parents :yes, :money :rich, :decision :cinema}
{:weather :sunny, :parents :no, :money :rich, :decision :tennis}
{:weather :windy, :parents :yes, :money :rich, :decision :cinema}
{:weather :rainy, :parents :yes, :money :poor, :decision :cinema}
{:weather :rainy, :parents :no, :money :rich, :decision :stayin}
{:weather :rainy, :parents :yes, :money :poor, :decision :cinema}
{:weather :windy, :parents :no, :money :poor, :decision :cinema}
{:weather :windy, :parents :no, :money :rich, :decision :shopping}
{:weather :windy, :parents :yes, :money :rich, :decision :cinema}
{:weather :sunny, :parents :no, :money :rich, :decision :tennis}
])
;; c&p from jackrusher's worked example
(defn log2
"log2(x), log2(0) = 0"
[x]
(if (= x 0) 0 (/ (Math/log x) (Math/log 2))))
(defn proportion-by-attr
"slice S on attr, return seq of [:k v] pairs where v = count(k) / total"
[S attr]
(let [s (group-by attr S)
total (count S)
proportion #(vector (first %) (/ (count (second %)) total))]
(map #(proportion %) s)))
(defn entropy-seq [S]
(let [p (proportion-by-attr S :decision)
k (map first p)
v (map second p)
entropy_i #(* (- %) (log2 %))]
(map entropy_i v)))
(defn entropy
"Entropy(S) = SUMi=1..n -p_i* log2(p_i) where p_i is the proportion at i"
[S]
(reduce + (entropy-seq S)))
(entropy examples)
;; (0.44217935649972373 0.46438561897747244 0.33219280948873625
;; 0.33219280948873625)
;; this is probably already a native clojure function but I can't find it
(defn S_v
"return a subset s of S where k = v for all k in S"
[S k v]
(filter #(and (contains? % k) (= (get % k) v)) S))
(def |s_sun| (count (S_v examples :weather :sunny)))
;; 3
(def |s_rain| (count (S_v examples :weather :rainy)))
;; 3
(def s_sun (S_v examples :weather :sunny))
(entropy s_sun)
(defn gain_sum_v
[Sv count_S]
(* (/ (count Sv) count_S) (entropy Sv)))
(defn gain_sum
[S attr count_S]
(let [values (keys (group-by attr S))]
(for [v values] (gain_sum_v (S_v S attr v) count_S))))
(def count_ex (count examples))
(gain_sum examples :weather count_ex)
;; (0.27548875021634683 0.32451124978365314 0.27548875021634683)
(defn gain
"return the information gain of S at attribute attr
Gain(S, attr) = Entropy(S) - Sum[v over Values(attr)] ((Count(Sv) / Count(S)) * Entropy(Sv))"
[S attr]
(let [entropy_S (entropy S)
count_S (count S)]
(- entropy_S (reduce + (gain_sum S attr count_S)))))
(gain examples :weather)
;; 0.6954618442383218
(gain examples :parents)
;; 0.6099865470109875
(gain examples :money)
;; 0.2812908992306926
;; wheeee!!!! the sweet taste of success!!
;; now on to building the tree...
;; collecting misc phrases before trying to codify the algorithm...
(defn get-decisions [S] (get S :decision))
(map get-decisions (S_v examples :weather :sunny))
;; (:cinema :tennis :tennis)
(defn put-leaf?
"Returns true if the results of s (decisions) are all in the same category or empty"
[s]
(< (count (distinct (map get-decisions s))) 2))
(put-leaf? (S_v examples :weather :sunny))
;; false
(put-leaf? (S_v examples :weather :windy))
;; false
(put-leaf? (S_v examples :weather :rainy))
;; false
(def S_sunny (S_v examples :weather :sunny))
(gain S_sunny :parents)
;; 0.9182958340544896
(gain S_sunny :money)
;; 0.0
;; parents wins
(put-leaf? (S_v S_sunny :parents :yes))
;; true
(put-leaf? (S_v S_sunny :parents :no))
;; true
(def S_windy (S_v examples :weather :windy))
(gain S_windy :parents)
;; 0.31127812445913283
(gain S_windy :money)
;; 0.12255624891826566
;; parents wins...
(put-leaf? (S_v S_windy :parents :yes))
;; true
(put-leaf? (S_v S_windy :parents :no))
;; false
;; if put-leaf? == false ... defer to recurse into subtree?
(def S_rainy (S_v examples :weather :rainy))
(gain S_rainy :parents)
;; 0.9182958340544896
(gain S_rainy :money)
;; 0.9182958340544896
;; hmm.. what to do here? nothing?
;; Algorithm attempt:
;; 1. Starting at the root node, select a from max Gain (S, A), where
;; a (- A
;; 2. Add branches from the root node for all values v of a
;; 3. For each branch, select slice S_v of S:
;; 3a. If the possible results of S_v are NOT empty or unique, put
;; an empty attribute node for later recursion
;; 3b. If the possible results of S_v are empty, ... delete the branch?
;; 3c. If the possible results of S_v are unique, add a (terminal) category
;; node for this unique result
;; 4. Recurse into each branch with an empty attribute node,
;; evaluating step 1 with S = S_x where S_x excludes all parent
;; attributes of the current node.
;; 5. Stop when no nodes are terminated by an empty attribute node ([])
(defn put-leaf
"Handles case 3b and 3c above"
[S]
(let [decisions (distinct (map get-decisions S))
c (count decisions)]
(if (= 0 c)
:delete
(first decisions))))
(defn S_A
"Returns a list of attributes A in S"
[S] (filter #(not= % :decision) (keys (first S))))
(defn gain-seq
[S A]
(map #(vector % (gain S %)) A))
(gain-seq examples (S_A examples))
;; ([:money 0.2812908992306926] [:parents 0.6099865470109875]
;; [:weather 0.6954618442383218])
;; still fighting with this....
(defn max-gain
"Given S, return a (- A for which max Gain (S, A)"
[S]
(let [A (S_A S)]
(map #(max ))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment