Created
February 24, 2013 21:38
-
-
Save evanlh/5025768 to your computer and use it in GitHub Desktop.
Decision Tree Homework, first pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Begin Evan's attempt at section 11.3 | |
;; from http://www.doc.ic.ac.uk/~sgc/teaching/pre2012/v231/lecture11.html | |
(def examples | |
[{:weather :sunny, :parents :yes, :money :rich, :decision :cinema} | |
{:weather :sunny, :parents :no, :money :rich, :decision :tennis} | |
{:weather :windy, :parents :yes, :money :rich, :decision :cinema} | |
{:weather :rainy, :parents :yes, :money :poor, :decision :cinema} | |
{:weather :rainy, :parents :no, :money :rich, :decision :stayin} | |
{:weather :rainy, :parents :yes, :money :poor, :decision :cinema} | |
{:weather :windy, :parents :no, :money :poor, :decision :cinema} | |
{:weather :windy, :parents :no, :money :rich, :decision :shopping} | |
{:weather :windy, :parents :yes, :money :rich, :decision :cinema} | |
{:weather :sunny, :parents :no, :money :rich, :decision :tennis} | |
]) | |
;; c&p from jackrusher's worked example | |
(defn log2 | |
"log2(x), log2(0) = 0" | |
[x] | |
(if (= x 0) 0 (/ (Math/log x) (Math/log 2)))) | |
(defn proportion-by-attr | |
"slice S on attr, return seq of [:k v] pairs where v = count(k) / total" | |
[S attr] | |
(let [s (group-by attr S) | |
total (count S) | |
proportion #(vector (first %) (/ (count (second %)) total))] | |
(map #(proportion %) s))) | |
(defn entropy-seq [S] | |
(let [p (proportion-by-attr S :decision) | |
k (map first p) | |
v (map second p) | |
entropy_i #(* (- %) (log2 %))] | |
(map entropy_i v))) | |
(defn entropy | |
"Entropy(S) = SUMi=1..n -p_i* log2(p_i) where p_i is the proportion at i" | |
[S] | |
(reduce + (entropy-seq S))) | |
(entropy examples) | |
;; (0.44217935649972373 0.46438561897747244 0.33219280948873625 | |
;; 0.33219280948873625) | |
;; this is probably already a native clojure function but I can't find it | |
(defn S_v | |
"return a subset s of S where k = v for all k in S" | |
[S k v] | |
(filter #(and (contains? % k) (= (get % k) v)) S)) | |
(def |s_sun| (count (S_v examples :weather :sunny))) | |
;; 3 | |
(def |s_rain| (count (S_v examples :weather :rainy))) | |
;; 3 | |
(def s_sun (S_v examples :weather :sunny)) | |
(entropy s_sun) | |
(defn gain_sum_v | |
[Sv count_S] | |
(* (/ (count Sv) count_S) (entropy Sv))) | |
(defn gain_sum | |
[S attr count_S] | |
(let [values (keys (group-by attr S))] | |
(for [v values] (gain_sum_v (S_v S attr v) count_S)))) | |
(def count_ex (count examples)) | |
(gain_sum examples :weather count_ex) | |
;; (0.27548875021634683 0.32451124978365314 0.27548875021634683) | |
(defn gain | |
"return the information gain of S at attribute attr | |
Gain(S, attr) = Entropy(S) - Sum[v over Values(attr)] ((Count(Sv) / Count(S)) * Entropy(Sv))" | |
[S attr] | |
(let [entropy_S (entropy S) | |
count_S (count S)] | |
(- entropy_S (reduce + (gain_sum S attr count_S))))) | |
(gain examples :weather) | |
;; 0.6954618442383218 | |
(gain examples :parents) | |
;; 0.6099865470109875 | |
(gain examples :money) | |
;; 0.2812908992306926 | |
;; wheeee!!!! the sweet taste of success!! | |
;; now on to building the tree... | |
;; collecting misc phrases before trying to codify the algorithm... | |
(defn get-decisions [S] (get S :decision)) | |
(map get-decisions (S_v examples :weather :sunny)) | |
;; (:cinema :tennis :tennis) | |
(defn put-leaf? | |
"Returns true if the results of s (decisions) are all in the same category or empty" | |
[s] | |
(< (count (distinct (map get-decisions s))) 2)) | |
(put-leaf? (S_v examples :weather :sunny)) | |
;; false | |
(put-leaf? (S_v examples :weather :windy)) | |
;; false | |
(put-leaf? (S_v examples :weather :rainy)) | |
;; false | |
(def S_sunny (S_v examples :weather :sunny)) | |
(gain S_sunny :parents) | |
;; 0.9182958340544896 | |
(gain S_sunny :money) | |
;; 0.0 | |
;; parents wins | |
(put-leaf? (S_v S_sunny :parents :yes)) | |
;; true | |
(put-leaf? (S_v S_sunny :parents :no)) | |
;; true | |
(def S_windy (S_v examples :weather :windy)) | |
(gain S_windy :parents) | |
;; 0.31127812445913283 | |
(gain S_windy :money) | |
;; 0.12255624891826566 | |
;; parents wins... | |
(put-leaf? (S_v S_windy :parents :yes)) | |
;; true | |
(put-leaf? (S_v S_windy :parents :no)) | |
;; false | |
;; if put-leaf? == false ... defer to recurse into subtree? | |
(def S_rainy (S_v examples :weather :rainy)) | |
(gain S_rainy :parents) | |
;; 0.9182958340544896 | |
(gain S_rainy :money) | |
;; 0.9182958340544896 | |
;; hmm.. what to do here? nothing? | |
;; Algorithm attempt: | |
;; 1. Starting at the root node, select a from max Gain (S, A), where | |
;; a (- A | |
;; 2. Add branches from the root node for all values v of a | |
;; 3. For each branch, select slice S_v of S: | |
;; 3a. If the possible results of S_v are NOT empty or unique, put | |
;; an empty attribute node for later recursion | |
;; 3b. If the possible results of S_v are empty, ... delete the branch? | |
;; 3c. If the possible results of S_v are unique, add a (terminal) category | |
;; node for this unique result | |
;; 4. Recurse into each branch with an empty attribute node, | |
;; evaluating step 1 with S = S_x where S_x excludes all parent | |
;; attributes of the current node. | |
;; 5. Stop when no nodes are terminated by an empty attribute node ([]) | |
(defn put-leaf | |
"Handles case 3b and 3c above" | |
[S] | |
(let [decisions (distinct (map get-decisions S)) | |
c (count decisions)] | |
(if (= 0 c) | |
:delete | |
(first decisions)))) | |
(defn S_A | |
"Returns a list of attributes A in S" | |
[S] (filter #(not= % :decision) (keys (first S)))) | |
(defn gain-seq | |
[S A] | |
(map #(vector % (gain S %)) A)) | |
(gain-seq examples (S_A examples)) | |
;; ([:money 0.2812908992306926] [:parents 0.6099865470109875] | |
;; [:weather 0.6954618442383218]) | |
;; still fighting with this.... | |
(defn max-gain | |
"Given S, return a (- A for which max Gain (S, A)" | |
[S] | |
(let [A (S_A S)] | |
(map #(max )) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment