Demonstrate a simple machine learning and dimension reduction problem with Principal Component Analysis (PCA)
#!/usr/bin/python
# -*- coding: utf-8 -*-
# SEE http://blog.drskippy.net/ for writeup
# Demonstrate simple problem of machine learning and dimension
# reduction with Principal Component Analysis (PCA). Use Singular Value
# Decomposition (SVD) to make this as understandable as possible.
#
# Scott Hendrickson 2012-05-11
#
# There are many good references on the mathematics behind SVD and PCA. There are also
# efficient algorithms for calculating the principal components without
# explicitly solving the SVD problem.
#
# See, for example:
# http://en.wikipedia.org/wiki/Singular_value_decomposition
# http://en.wikipedia.org/wiki/Principal_component_analysis
#
# Use the scipy library for linear algebra; mat and dot now live in numpy
from matplotlib.pyplot import *
from scipy import linalg
from numpy import mat, dot
import numpy as np
########################
# Demo Part 1
########################
# SVD: decompose a np.matrix X into the product of a unitary np.matrix W
# (basis vectors), a diagonal np.matrix S (scaling) and an orthogonal
# np.matrix V_t (a rotation) such that X = W * S * V_t.
#
# In machine learning, X_t is a np.matrix of samples (4 rows) and
# features (3 columns); we work with its transpose X, which has features in rows.
X_t = mat([[1, 3, -10], [0, 2, 1], [-1, -3, 9], [0, -2, 0]])
X = X_t.T
# The scipy library makes this step easy
(W, s, V_t) = linalg.svd(X)
S = linalg.diagsvd(s, len(X), len(V_t))
recon = dot(dot(W, S), V_t)
# Are these equal (to within rounding)?
abs(X - recon)
# maximum error
np.max(abs(X - recon))
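# (Aside, not in the original write-up: a quick check that W and V_t are
# orthogonal -- W.T * W and V_t * V_t.T are identity matrices -- which is
# what lets us read them as basis vectors and a rotation.)
np.max(abs(dot(W.T, W) - np.eye(W.shape[0])))
np.max(abs(dot(V_t, V_t.T) - np.eye(V_t.shape[0])))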
# One key to understanding and moving to PCA is
# understanding the diagonal np.matrix S
S
# Given that the features have zero-mean:
[np.mean(i) for i in X]
# s is an ordered vector that tells us the "significance" of each dimension
# in the rotated space. (Yes, I arranged it that way. To be clear,
# you can do SVD on a matrix without zero-mean features, but the dimension
# reduction part we are about to do requires it.)
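# (Aside, not in the original: if the features were *not* already zero-mean,
# we would center them first. A minimal sketch -- each row of X is a feature,
# so subtract the row means before the SVD/PCA step.)
X_centered = X - np.mean(X, axis=1)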
#
# We can now selectively set lower values of s to
# zero to reduce dimension.
s_red = s.copy()  # copy so the original s is not modified
s_red[2] = 0
# Approximately reconstruct our original matrix, but with
# a reduced-dimension representation
S_red = linalg.diagsvd(s_red, len(X), len(V_t))
S_red
recon_red = dot(dot(W, S_red), V_t)
abs(X - recon_red)
# maximum error
np.max(abs(X - recon_red))
# ratio of errors
np.max(abs(X - recon)) / np.max(abs(X - recon_red))
# We "lost" about 14 orders of magnitude in precision of the reconstruction,
# but this turns out to be okay for some machine learning problems.
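# (Aside, not in the original: another way to see the reduction -- the
# original X has rank 3, while the reduced reconstruction has rank 2.)
np.linalg.matrix_rank(X)
np.linalg.matrix_rank(recon_red)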
########################
# Demo Part 2
########################
from sklearn import linear_model
from copy import copy
# Classification problem where the points are linearly separable
# in 2 dimensions.
N = 200
x = np.random.normal(0, 4, N)
y1 = 3 * x + np.random.normal(3, 1, N)
y2 = 3 * x + np.random.normal(-3, 1, N)
y_avg = np.mean(np.append(y1, y2))
figure()
plot(x, y1 - y_avg, 'bo')
plot(x, y2 - y_avg, 'ro')
title('Original data sets (0-average)')
# features x and y are rows in this array
X = np.append([x, y1 - y_avg], [x, y2 - y_avg], 1)
X_t = X.T
# y1 is group 1; y2 is group 0
truth = np.append(np.ones([1, N]), np.zeros([1, N]), 1)
# 2d model works very well
lr = linear_model.LogisticRegression()
lr.fit(np.asarray(X_t), truth[0])
lr.score(np.asarray(X_t), truth[0])
lr.predict(np.asarray(X_t))
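# (Aside, not in the original: count the misclassified points; with this much
# separation the 2-D model should get essentially all 2*N points right.)
np.sum(lr.predict(np.asarray(X_t)) != truth[0])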
# Now try dimension reduction with SVD
(W, s, V_t) = linalg.svd(X)
s
# both transformed representations are the same before trimming:
S = linalg.diagsvd(s, len(X), len(V_t))
np.max(abs(X.T * np.matrix(W) - np.matrix(V_t).T * np.matrix(S).T))
# Now work with the transformed coordinates. It might not have been clear
# from above what the transformed coordinate system was. We can get there
# either by projecting the data onto W (X.T * W) or, equivalently, via V_t.T * S.T:
X_prime = np.matrix(V_t).T * np.matrix(S).T
x_prime = np.asarray(X_prime.T[0])
y_prime = np.asarray(X_prime.T[1])
figure()
plot(x_prime, y_prime, 'go')
title('Features after SVD Transformation')
# Linearly separable in 1-D? Try both new basis directions (extremes of variation)
# Min variation - training along the y-dim is nearly perfect
ypt = np.asarray(y_prime.T)
lr.fit(ypt, truth[0])
lr.score(ypt, truth[0])
lr.predict(ypt)
# Max variation - nothing here
xpt = np.asarray(x_prime.T)
lr.fit(xpt, truth[0])
lr.score(xpt, truth[0])
lr.predict(xpt)
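# (Aside, not in the original: the usual PCA "explained variance" view of s.
# Since the features here are approximately zero-mean, s[i]**2 / sum(s**2) is
# roughly the fraction of total variance along each new basis direction --
# it shows how dominant the max-variation direction is, even though, as the
# fit above showed, it carries essentially none of the class information.)
s ** 2 / np.sum(s ** 2)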
########################
# Demo Part 3
########################
# Use PCA idea to reduce to 1-D
s_red = copy(s)
s_red[1] = 0
S_red = linalg.diagsvd(s_red, len(X), len(V_t))
X_prime = np.matrix(V_t).T * np.matrix(S_red).T
x_prime = np.asarray(X_prime.T[0])
y_prime = np.asarray(X_prime.T[1])
figure()
plot(x_prime, y_prime, 'yo')
title('Reduce S by removing s[1] = %2.5f' % s[1])
# Try both new basis directions (extremes of variation)
# 1-D: Max variation - training along the x-dim performs poorly
xpt = np.asarray(x_prime.T)
lr.fit(xpt, truth[0])
lr.score(xpt, truth[0])
lr.predict(xpt)
# This is the other extreme of the "principal" components
s_red = copy(s)
s_red[0] = 0
S_red = linalg.diagsvd(s_red, len(X), len(V_t))
X_prime = np.matrix(V_t).T * np.matrix(S_red).T
x_prime = np.asarray(X_prime.T[0])
y_prime = np.asarray(X_prime.T[1])
figure()
plot(x_prime, y_prime, 'mo')
title('Reduce S by removing value s[0] = %2.5f' % s[0])
# 1-D: Min variation - training along the y-dim is nearly perfect
ypt = np.asarray(y_prime.T)
lr.fit(ypt, truth[0])
lr.score(ypt, truth[0])
lr.predict(ypt)
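# (Aside, not in the original write-up: scikit-learn's PCA wraps the
# center / SVD / truncate steps done by hand above. A minimal sketch, assuming
# the sklearn.decomposition.PCA API; it keeps the max-variation direction, so,
# as shown above, a 1-D classifier built on it does poorly.)
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
X_1d = pca.fit_transform(np.asarray(X_t))  # shape (2 * N, 1)
lr.fit(X_1d, truth[0])
lr.score(X_1d, truth[0])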