Demonstrate a simple machine learning and dimension reduction problem with Principal Component Analysis (PCA)
#!/usr/bin/python
# -*- coding: utf-8 -*-
# SEE http://blog.drskippy.net/ for writeup
# Demonstrate simple problem of machine learning and dimension
# reduction with Principal Component Analysis (PCA). Use Singular Value
# Decomposition (SVD) to make this as understandable as possible.
#
# Scott Hendrickson 2012-05-11
#
# There are many good references on the mathematics behind SVD and PCA. There are also
# efficient algorithms for calculating the principal components without
# explicitly solving the SVD problem.
#
# See, for example:
# http://en.wikipedia.org/wiki/Singular_value_decomposition
# http://en.wikipedia.org/wiki/Principal_component_analysis
#
# Use the scipy library for linear algebra; mat and dot now live in numpy
from matplotlib.pyplot import *
from scipy import linalg
from numpy import mat, dot
import numpy as np
########################
# Demo Part 1
########################
# SVD: decompose a np.matrix X into the product of a unitary np.matrix W
# (basis vectors), a diagonal np.matrix S (scaling) and an orthogonal
# np.matrix V_t (a rotation) such that X = W * S * V_t.
#
# In machine learning, X_t is a np.matrix of samples (4 rows) and
# features (3 columns); we work with its transpose X, which has features in rows.
X_t = mat([[1, 3, -10], [0, 2, 1], [-1, -3, 9], [0, -2, 0]])
X = X_t.T
# The scipy library makes this step easy
(W, s, V_t) = linalg.svd(X)
S = linalg.diagsvd(s, len(X), len(V_t))
recon = dot(dot(W, S), V_t)
# Are these equal (to within rounding)?
abs(X - recon)
# maximum error
np.max(abs(X - recon))
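# (Aside, not in the original write-up: a quick check that W and V_t are
# orthogonal -- W.T * W and V_t * V_t.T are identity matrices -- which is
# what lets us read them as basis vectors and a rotation.)
np.max(abs(dot(W.T, W) - np.eye(W.shape[0])))
np.max(abs(dot(V_t, V_t.T) - np.eye(V_t.shape[0])))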
# One key to understanding and moving to PCA is
# understanding the diagonal np.matrix S
S
# Given that the features have zero-mean:
[np.mean(i) for i in X]
# s is an ordered vector that tells us the "significance" of each dimension
# in the rotated space. (Yes, I arranged it that way. To be clear,
# you can do SVD on a matrix without zero-mean features, but the dimension
# reduction part we are about to do requires it.)
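# (Aside, not in the original: if the features were *not* already zero-mean,
# we would center them first. A minimal sketch -- each row of X is a feature,
# so subtract the row means before the SVD/PCA step.)
X_centered = X - np.mean(X, axis=1)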
#
# We can now selectively set lower values of s to
# zero to reduce dimension.
s_red = s.copy()  # copy so the original s is not modified
s_red[2] = 0
# Approximately reconstruct our original matrix, but with
# a reduced-dimension representation
S_red = linalg.diagsvd(s_red, len(X), len(V_t))
S_red
recon_red = dot(dot(W, S_red), V_t)
abs(X - recon_red)
# maximum error
np.max(abs(X - recon_red))
# ratio of errors
np.max(abs(X - recon)) / np.max(abs(X - recon_red))
# We "lost" about 14 orders of magnitude in precision of the reconstruction,
# but this turns out to be okay for some machine learning problems.
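# (Aside, not in the original: another way to see the reduction -- the
# original X has rank 3, while the reduced reconstruction has rank 2.)
np.linalg.matrix_rank(X)
np.linalg.matrix_rank(recon_red)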
########################
# Demo Part 2
########################
from sklearn import linear_model
from copy import copy
# Classification problem where the points are linearly separable
# in 2 dimensions.
N = 200
x = np.random.normal(0, 4, N)
y1 = 3 * x + np.random.normal(3, 1, N)
y2 = 3 * x + np.random.normal(-3, 1, N)
y_avg = np.mean(np.append(y1, y2))
figure()
plot(x, y1 - y_avg, 'bo')
plot(x, y2 - y_avg, 'ro')
title('Original data sets (0-average)')
# features x and y are rows in this array
X = np.append([x, y1 - y_avg], [x, y2 - y_avg], 1)
X_t = X.T
# y1 is group 1; y2 is group 0
truth = np.append(np.ones([1, N]), np.zeros([1, N]), 1)
# 2d model works very well
lr = linear_model.LogisticRegression()
lr.fit(np.asarray(X_t), truth[0])
lr.score(np.asarray(X_t), truth[0])
lr.predict(np.asarray(X_t))
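# (Aside, not in the original: count the misclassified points; with this much
# separation the 2-D model should get essentially all 2*N points right.)
np.sum(lr.predict(np.asarray(X_t)) != truth[0])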
# Now try dimension reduction with SVD
(W, s, V_t) = linalg.svd(X)
s
# both transformed representations are the same before trimming:
S = linalg.diagsvd(s, len(X), len(V_t))
np.max(abs(X.T * np.matrix(W) - np.matrix(V_t).T * np.matrix(S).T))
# Now work with the transformed coordinates. It might not have been clear
# from above what the transformed coordinate system was. We can get there
# either by projecting the data onto W (X.T * W) or, equivalently, via V_t.T * S.T:
X_prime = np.matrix(V_t).T * np.matrix(S).T
x_prime = np.asarray(X_prime.T[0])
y_prime = np.asarray(X_prime.T[1])
figure()
plot(x_prime, y_prime, 'go')
title('Features after SVD Transformation')
# Linearly separable in 1-D? Try both new basis directions (extremes of variation)
# Min variation - training along the y-dim is nearly perfect
ypt = np.asarray(y_prime.T)
lr.fit(ypt, truth[0])
lr.score(ypt, truth[0])
lr.predict(ypt)
# Max variation - nothing here
xpt = np.asarray(x_prime.T)
lr.fit(xpt, truth[0])
lr.score(xpt, truth[0])
lr.predict(xpt)
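# (Aside, not in the original: the usual PCA "explained variance" view of s.
# Since the features here are approximately zero-mean, s[i]**2 / sum(s**2) is
# roughly the fraction of total variance along each new basis direction --
# it shows how dominant the max-variation direction is, even though, as the
# fit above showed, it carries essentially none of the class information.)
s ** 2 / np.sum(s ** 2)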
########################
# Demo Part 3
########################
# Use PCA idea to reduce to 1-D
s_red = copy(s)
s_red[1] = 0
S_red = linalg.diagsvd(s_red, len(X), len(V_t))
X_prime = np.matrix(V_t).T * np.matrix(S_red).T
x_prime = np.asarray(X_prime.T[0])
y_prime = np.asarray(X_prime.T[1])
figure()
plot(x_prime, y_prime, 'yo')
title('Reduce S by removing s[1] = %2.5f' % s[1])
# Try both new basis directions (extremes of variation)
# 1-D: Max variation - training along the x-dim performs poorly
xpt = np.asarray(x_prime.T)
lr.fit(xpt, truth[0])
lr.score(xpt, truth[0])
lr.predict(xpt)
# This is the other extreme of the "principal" components
s_red = copy(s)
s_red[0] = 0
S_red = linalg.diagsvd(s_red, len(X), len(V_t))
X_prime = np.matrix(V_t).T * np.matrix(S_red).T
x_prime = np.asarray(X_prime.T[0])
y_prime = np.asarray(X_prime.T[1])
figure()
plot(x_prime, y_prime, 'mo')
title('Reduce S by removing value s[0] = %2.5f' % s[0])
# 1-D: Min variation - training along the y-dim is nearly perfect
ypt = np.asarray(y_prime.T)
lr.fit(ypt, truth[0])
lr.score(ypt, truth[0])
lr.predict(ypt)
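# (Aside, not in the original write-up: scikit-learn's PCA wraps the
# center / SVD / truncate steps done by hand above. A minimal sketch, assuming
# the sklearn.decomposition.PCA API; it keeps the max-variation direction, so,
# as shown above, a 1-D classifier built on it does poorly.)
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
X_1d = pca.fit_transform(np.asarray(X_t))  # shape (2 * N, 1)
lr.fit(X_1d, truth[0])
lr.score(X_1d, truth[0])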