-
-
Save liebke/afc8a7e35b3ed6286ee9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
;; LDA and QDA classifiers from chapter 4 of Elements of Statistical Learning | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
;; LDA | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
(use '(incanter core stats charts io)) | |
(def training (to-matrix | |
(read-dataset "http://bit.ly/464h4h" | |
:header true))) | |
(def testing (to-matrix | |
(read-dataset "http://bit.ly/1btCei" | |
:header true))) | |
(def K 11) | |
(def p 10) | |
(def N (nrow training)) | |
(def group-counts (map nrow (group-by training 1))) | |
;; estimate the prior probabilities for each cluster | |
(def prior-probs (div group-counts N)) | |
;; estimate the centroids for each cluster | |
(def cluster-centroids | |
(matrix | |
(for [x_k (group-by training 1 :cols (range 2 12))] | |
(map mean (trans x_k))))) | |
;; estimate the covariance matrix to be used for all clusters | |
(def cluster-cov-mat | |
(let [groups (group-by training 1 :cols (range 2 12))] | |
(reduce plus | |
(map (fn [group centroid n] | |
(reduce plus | |
(map #(div | |
(mmult (trans (minus % centroid)) | |
(minus % centroid)) | |
(- N K)) | |
group))) | |
groups cluster-centroids group-counts)))) | |
;; calculate the inverses of the cluster covariance matrices | |
(def inv-cluster-cov-mat (solve cluster-cov-mat)) | |
;; define the linear discriminant function (ldf) | |
(defn ldf [x Sigma-inv mu_k pi_k] | |
(+ (mmult x Sigma-inv (trans mu_k)) | |
(- (mult 1/2 (mmult mu_k Sigma-inv (trans mu_k)))) | |
(log pi_k))) | |
;; define a function to calculate the linear quadratic scores. | |
(defn calculate-scores | |
([data inv-cov-mat centroids priors] | |
(matrix | |
(pmap (fn [row] | |
(pmap (partial ldf row inv-cov-mat) | |
centroids | |
priors)) | |
(sel data :cols (range 2 12)))))) | |
;; calculate the scores for the training data | |
(def training-lda-scores | |
(calculate-scores training | |
inv-cluster-cov-mat | |
cluster-centroids | |
prior-probs)) | |
;; calculate the scores for the testing data | |
(def testing-lda-scores | |
(calculate-scores testing | |
inv-cluster-cov-mat | |
cluster-centroids | |
prior-probs)) | |
;;(bind-columns (sel training :cols 1) (plus 1 (map max-index training-lda-scores))) | |
;;(bind-columns (sel testing :cols 1) (plus 1 (map max-index testing-lda-scores))) | |
(defn max-index | |
"Returns the index of the maximum value in the given sequence." | |
([x] | |
(let [max-x (reduce max x) | |
n (length x)] | |
(loop [i 0] | |
(if (= (nth x i) max-x) | |
i | |
(recur (inc i))))))) | |
;; define a function to calculate the error rate | |
(defn error-rate [data scores] | |
(/ (sum (map #(if (= %1 %2) 0 1) | |
(sel data :cols 1) | |
(plus 1 (map max-index scores)))) | |
(nrow data))) | |
;; calculate the error rate for the training data (0.316) | |
(error-rate training training-lda-scores) | |
;; calculate the error rate for the testing data (0.56) | |
(error-rate testing testing-lda-scores) | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
;; QDA | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
(use '(incanter core stats charts io)) | |
;(def training (to-matrix (read-dataset "/Users/dliebke/Desktop/esl/vowel.train.txt" :header true))) | |
(def training (to-matrix | |
(read-dataset "http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/vowel.train" | |
:header true))) | |
;(def testing (to-matrix (read-dataset "/Users/dliebke/Desktop/esl/vowel.test.txt" :header true))) | |
(def testing (to-matrix | |
(read-dataset "http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/vowel.test" | |
:header true))) | |
(def K 11) | |
(def p 10) | |
(def N (nrow training)) | |
(def group-counts (map nrow (group-by training 1))) | |
;; estimate the prior probabilities for each cluster | |
(def prior-probs (div group-counts N)) | |
;; estimate the centroids for each cluster | |
(def cluster-centroids (matrix | |
(for [x_k (group-by training 1 :cols (range 2 12))] | |
(map mean (trans x_k))))) | |
;;------------------------------------------------------------------------------ | |
;; CALCULATE THE K COVARIANCE MATRICES NECESSARY FOR QDA | |
;;------------------------------------------------------------------------------ | |
;; estimate the covariance matrices for each cluster | |
(def cluster-cov-mats | |
(let [groups (group-by training 1 :cols (range 2 12))] | |
(map (fn [group centroid n] | |
(reduce plus | |
(map #(div | |
(mmult (trans (minus % centroid)) | |
(minus % centroid)) | |
(dec n)) | |
group))) | |
groups cluster-centroids group-counts))) | |
;; calculate the inverses of the cluster covariance matrices | |
(def inv-cluster-cov-mats (map solve cluster-cov-mats)) | |
;; define the quadratic discriminant function | |
(defn qdf [x Sigma_k Sigma-inv_k mu_k pi_k] | |
(+ (- (mult 1/2 (log (det Sigma_k)))) | |
(- (mult 1/2 (mmult (minus x mu_k) | |
Sigma-inv_k | |
(trans (minus x mu_k))))) | |
(log pi_k))) | |
(defn calculate-scores | |
([data cov-mats inv-cov-mats centroids priors] | |
(matrix | |
(pmap (fn [row] | |
(pmap (partial qdf row) | |
cov-mats | |
inv-cov-mats | |
centroids | |
priors)) | |
(sel data :cols (range 2 12)))))) | |
;; calculate the scores for the training data | |
(def training-qda-scores | |
(calculate-scores training | |
cluster-cov-mats | |
inv-cluster-cov-mats | |
cluster-centroids | |
prior-probs)) | |
;; calculate the scores for the testing data | |
(def testing-qda-scores | |
(calculate-scores testing | |
cluster-cov-mats | |
inv-cluster-cov-mats | |
cluster-centroids | |
prior-probs)) | |
;;(bind-columns (sel training :cols 1) (plus 1 (map max-index training-qda-scores))) | |
;;(bind-columns (sel testing :cols 1) (plus 1 (map max-index testing-qda-scores))) | |
(defn max-index | |
"Returns the index of the maximum value in the given sequence." | |
([x] | |
(let [max-x (reduce max x) | |
n (length x)] | |
(loop [i 0] | |
(if (= (nth x i) max-x) | |
i | |
(recur (inc i))))))) | |
;; define a function to calculate the error rate | |
(defn error-rate [data scores] | |
(/ (sum (map #(if (= %1 %2) 0 1) | |
(sel data :cols 1) | |
(plus 1 (map max-index scores)))) | |
(nrow data))) | |
;; calculate the error rate for the training data | |
(error-rate training training-qda-scores) | |
;; calculate the error rate for the testing data | |
(error-rate testing testing-qda-scores) | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
;; QDA w/ Eigenvalue Decomposition | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
(use '(incanter core stats charts io)) | |
;(def training (to-matrix (read-dataset "/Users/dliebke/Desktop/esl/vowel.train.txt" :header true))) | |
(def training (to-matrix | |
(read-dataset "http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/vowel.train" | |
:header true))) | |
;(def testing (to-matrix (read-dataset "/Users/dliebke/Desktop/esl/vowel.test.txt" :header true))) | |
(def testing (to-matrix | |
(read-dataset "http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/vowel.test" | |
:header true))) | |
(def K 11) | |
(def p 10) | |
(def N (nrow training)) | |
(def group-counts (map nrow (group-by training 1))) | |
;; estimate the prior probabilities for each cluster | |
(def prior-probs (div group-counts N)) | |
;; estimate the centroids for each cluster | |
(def cluster-centroids (matrix | |
(for [x_k (group-by training 1 :cols (range 2 12))] | |
(map mean (trans x_k))))) | |
;; estimate the covariance matrices for each cluster | |
(def cluster-cov-matrices | |
(let [groups (group-by training 1 :cols (range 2 12))] | |
(map (fn [group centroid n] | |
(reduce plus | |
(map #(div | |
(mmult (trans (minus % centroid)) | |
(minus % centroid)) | |
(dec n)) | |
group))) | |
groups cluster-centroids group-counts))) | |
;;------------------------------------------------------------------------------ | |
;; ADDED TO QDA EXAMPLE TO IMPROVE PERFORMANCE | |
;;------------------------------------------------------------------------------ | |
;; extract the eigenvalues and eigenvectors from the covariance matrices | |
;; for each cluster, to improve performance | |
(def Sigma-decomp | |
(map decomp-eigenvalue cluster-cov-matrices )) | |
(def D (map #(diag (:values %)) Sigma-decomp)) | |
(def U (map #(:vectors %) Sigma-decomp)) | |
;;------------------------------------------------------------------------------ | |
;; CHANGED FROM QDA EXAMPLE | |
;;------------------------------------------------------------------------------ | |
;; define the quadratic discriminant function using the eigenvalues and eigenvectors | |
(defn qdf [x D_k U_k mu_k pi_k] | |
(+ (- (mult 1/2 (sum (map log (diag D_k))))) | |
(- (mult 1/2 | |
(mmult (trans (mmult (trans U_k) | |
(trans (minus x mu_k)))) | |
(solve D_k) | |
(mmult (trans U_k) | |
(trans (minus x mu_k)))))) | |
(log pi_k))) | |
;;------------------------------------------------------------------------------ | |
;; END OF CHANGES | |
;;------------------------------------------------------------------------------ | |
;; define a function to calculate the quadratic discriminant scores | |
(defn calculate-scores | |
([data D U centroids priors] | |
(matrix | |
(pmap (fn [row] | |
(pmap (partial qdf row) D U centroids priors)) | |
(sel data :cols (range 2 12)))))) | |
;; calculate the scores for each row of the training data set across all 11 clusters | |
(def training-qda-scores | |
(calculate-scores training | |
D U | |
cluster-centroids | |
prior-probs)) | |
;; calculate the scores for each row of the testing data set across all 11 clusters | |
(def testing-qda-scores | |
(calculate-scores testing | |
D U | |
cluster-centroids | |
prior-probs)) | |
;;(bind-columns (sel training :cols 1) (plus 1 (map max-index training-qda-scores))) | |
;;(bind-columns (sel testing :cols 1) (plus 1 (map max-index testing-qda-scores))) | |
(defn max-index | |
"Returns the index of the maximum value in the given sequence." | |
([x] | |
(let [max-x (reduce max x) | |
n (length x)] | |
(loop [i 0] | |
(if (= (nth x i) max-x) | |
i | |
(recur (inc i))))))) | |
;; define a function to calculate the error rate | |
(defn error-rate [data scores] | |
(/ (sum (map #(if (= %1 %2) 0 1) | |
(sel data :cols 1) | |
(plus 1 (map max-index scores)))) | |
(nrow data))) | |
;; calculate the error rate for the training data | |
(error-rate training training-qda-scores) | |
;; calculate the error rate for the testing data | |
(error-rate testing testing-qda-scores) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment