@Mashimo
Last active July 21, 2020 16:57
PCA - Principal Component Analysis
Principal Component Analysis

Python examples of Principal Component Analysis

  • PCA_armadillo: From 3D rendering to 2D plot
  • PCA_kidney: reduce the dense kidney clinic study feature set to its two main components
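Both scripts follow the same basic scikit-learn pattern: fit a PCA estimator on the feature matrix, then project the data onto its first two principal components. Here is a minimal sketch of that pattern on a small made-up DataFrame (the toy data is for illustration only, not one of the datasets below):

import pandas as pd
from sklearn.decomposition import PCA

# toy data, purely illustrative
df = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0],
                   'b': [2.1, 3.9, 6.2, 8.1],
                   'c': [0.5, 0.4, 0.6, 0.5]})

pca = PCA(n_components=2)               # keep only the two strongest components
pca.fit(df)                             # learn the component directions
T = pca.transform(df)                   # NumPy array of shape (n_samples, 2)
print(pca.explained_variance_ratio_)    # variance captured by each component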
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import datetime
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection used below (needed on older matplotlib)
from plyfile import PlyData
from sklearn.decomposition import PCA
"""
The goal is to reduce the dimensionality of a 3D scan from three to two using PCA to cast a shadow of the data onto its two most
important principal components. Then render the resulting 2D scatter plot.
The scan is a real life armadillo sculpture scanned using a Cyberware 3030 MS 3D scanner at Stanford University.
The sculpture is available as part of their 3D Scanning Repository, and is a very dense 3D mesh consisting of 172974 vertices!
The PLY file is available at https://graphics.stanford.edu/data/3Dscanrep/
"""
# Every 100 data samples, we save 1. If things run too
# slow, try increasing this number. If things run too fast,
# try decreasing it... =)
reduce_factor = 100
# Look pretty...
matplotlib.style.use('ggplot')
# Load up the scanned armadillo
plyfile = PlyData.read('Datasets/stanford_armadillo.ply')
armadillo = pd.DataFrame({
    'x': plyfile['vertex']['z'][::reduce_factor],
    'y': plyfile['vertex']['x'][::reduce_factor],
    'z': plyfile['vertex']['y'][::reduce_factor]
})
def do_PCA(armadillo):
    #
    # Train PCA on the armadillo dataframe, then drop one dimension
    # (reduce it down to 2D) and project the armadillo down to the
    # 2D principal component feature space.
    #
    pca = PCA(n_components=2)
    pca.fit(armadillo)
    reducedArmadillo = pca.transform(armadillo)
    return reducedArmadillo
def do_RandomizedPCA(armadillo):
    #
    # Train PCA with the randomized (truncated) SVD solver on the
    # armadillo dataframe, then drop one dimension (reduce it down to 2D)
    # and project the armadillo down to the 2D principal component
    # feature space.
    #
    # NOTE: SKLearn deprecated the standalone RandomizedPCA class, but the
    # PCA class still supports the randomized (truncated) method via its
    # svd_solver parameter. Full docs:
    # http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
    #
    pca = PCA(n_components=2, svd_solver='randomized')
    pca.fit(armadillo)
    reducedArmadillo = pca.transform(armadillo)
    return reducedArmadillo
# Render the Original Armadillo
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.set_title('Armadillo 3D')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
ax.scatter(armadillo.x, armadillo.y, armadillo.z, c='green', marker='.', alpha=0.75)
# Time the execution of PCA 5000x.
# PCA is run 5000 times to reduce the chance that other (rogue)
# processes skew the measured execution speed.
t1 = datetime.datetime.now()
for i in range(5000): pca = do_PCA(armadillo)
time_delta = datetime.datetime.now() - t1
# Render the newly transformed PCA armadillo!
if pca is not None:
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_title('PCA, build time: ' + str(time_delta))
    ax.scatter(pca[:,0], pca[:,1], c='blue', marker='.', alpha=0.75)
# Time the execution of rPCA 5000x
t1 = datetime.datetime.now()
for i in range(5000): rpca = do_RandomizedPCA(armadillo)
time_delta = datetime.datetime.now() - t1
# Render the newly transformed RandomizedPCA armadillo!
if rpca is not None:
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_title('RandomizedPCA, build time: ' + str(time_delta))
    ax.scatter(rpca[:,0], rpca[:,1], c='red', marker='.', alpha=0.75)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import PCA_kidney_helper as helper
from sklearn.decomposition import PCA
"""
The UCI's Chronic Kidney Disease data set is a collection of samples taken from patients in India over a two month period,
some of whom were in the early stages of the disease. Available here: https://archive.ics.uci.edu/ml/datasets/Chronic_Kidney_Disease
Reduce the dataset to two principal components by running it through PCA.
"""
# Look pretty...
plt.style.use('ggplot')
scaleFeatures = True
# Load up the dataset and remove any and all rows that have a nan
df = pd.read_csv("Datasets/kidney_disease.csv", index_col='id')
df.dropna(inplace=True)
# Create some color coded labels; the actual label feature
# will be removed prior to executing PCA, since it's unsupervised.
# You're only labeling by color so you can see the effects of PCA
labelsCol = ['red' if i=='ckd' else 'green' for i in df.classification]
# Drop the (nominal) label column, then one-hot encode the remaining
# nominal columns
df.drop(labels=['classification'], axis=1, inplace=True)
df = pd.get_dummies(df, columns=['rbc', 'pc', 'pcc', 'ba', 'htn',
                                 'dm', 'cad', 'appet', 'pe', 'ane'])
print(df.dtypes)
"""
After adding in all the other features and properly encoding those that
need to be, you should notice that your separation between CKD and Non-CKD
patients increases significantly: by taking many secondary features and combining them, machine
learning is usually able to come up with more informative descriptions of
your data.
"""
# Check your dataframe's dtypes: a few columns that should be numeric
# (wc, rc, pcv) were read in as objects. You can verify this against the
# attribute info section of the dataset webpage:
# https://archive.ics.uci.edu/ml/datasets/Chronic_Kidney_Disease
# or by peeking at a few rows of the dataframe. Coerce them to numeric:
#
df.wc = pd.to_numeric(df.wc, errors='coerce')
df.rc = pd.to_numeric(df.rc, errors='coerce')
df.pcv = pd.to_numeric(df.pcv, errors='coerce')
print(df.dtypes)
# PCA operates based on variance. The variable with the greatest
# variance will dominate. Peek into your data by checking the variance
# of every feature, and also look at the output of .describe():
#
print(df.var())
print(df.describe())

if scaleFeatures: df = helper.scaleFeatures(df)
# Run PCA on your dataset and reduce it to 2 components
#
pca = PCA(n_components=2)
pca.fit(df)
T = pca.transform(df)
# Plot the transformed data as a scatter plot. Recall that transforming
# the data results in a NumPy NDArray. You can either use MatPlotLib
# to graph it directly, or you can convert it to a DataFrame and have
# pandas do it for you.
#
# Since we transformed via PCA, we no longer have column names. We know we
# are in P.C. space, so we'll just define the coordinates accordingly:
ax = helper.drawVectors(T, pca.components_, df.columns.values, plt, scaleFeatures)
T = pd.DataFrame(T)
T.columns = ['component1', 'component2']
T.plot.scatter(x='component1', y='component2', marker='o', c=labelsCol, alpha=0.75, ax=ax)
plt.show()
import math
import pandas as pd
from sklearn import preprocessing
# A Note on SKLearn .transform() calls:
#
# Any time you transform your data, you lose the column header names.
# This actually makes complete sense. There are essentially two types
# of transformations: those that change the scale of your features,
# and those that change your features entirely. Changing the scale would
# be like changing centimeters to inches. Changing the features would
# be like using PCA to reduce 300 columns to 30. In either case, the
# original columns' units have been altered or no longer exist, so it's
# up to you to rename your columns after ANY transformation. Due to
# this, SKLearn returns an NDArray from *transform() calls.
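# For example (a hypothetical sketch, not part of this helper): after a PCA
# transform the original columns no longer exist, so you choose new names
# for the returned NDArray yourself, e.g.
#
#   T = pca.transform(df)                                       # plain ndarray
#   T = pd.DataFrame(T, columns=['component1', 'component2'])   # named again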
def scaleFeatures(df):
    # SKLearn has many different methods for transforming your features
    # by scaling them (this is a type of pre-processing):
    # RobustScaler, Normalizer, MinMaxScaler, MaxAbsScaler, StandardScaler...
    # http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
    #
    # However, in order for PCA to be effective, there are a few requirements
    # that must be met, and these drive the selection of your scaler.
    # PCA requires your data to be standardized -- in other words, its mean
    # is equal to 0, and it has ~unit variance.
    #
    # SKLearn's regular Normalizer doesn't zero out the mean of your data,
    # it only clamps it, so it's inappropriate to use here (depending on
    # your data). MinMaxScaler and MaxAbsScaler both fail to set a unit
    # variance, so you won't be using them either. RobustScaler can work,
    # again depending on your data (watch for outliers). For these reasons
    # we're going to use the StandardScaler. Get familiar with it by visiting
    # these two websites:
    #
    # http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-scaler
    # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler
    #
    # ---------
    # Feature scaling is the type of transformation that only changes the
    # scale and not the number of features, so we'll reuse the original
    # dataset's column names. However, keep in mind that the _units_ have
    # been altered:
    scaled = preprocessing.StandardScaler().fit_transform(df)
    scaled = pd.DataFrame(scaled, columns=df.columns)

    print("New Variances:\n", scaled.var())
    print("New Describe:\n", scaled.describe())
    return scaled
def drawVectors(transformed_features, components_, columns, plt, scaled):
    if not scaled:
        return plt.axes()  # No cheating ;-)

    num_columns = len(columns)

    # This function projects your *original* features (columns) onto your
    # principal component feature-space, so that you can visualize how
    # "important" each one was in the multi-dimensional scaling.

    # Scale the principal components by the max value in
    # the transformed set belonging to that component
    xvector = components_[0] * max(transformed_features[:, 0])
    yvector = components_[1] * max(transformed_features[:, 1])

    ## Visualize projections
    # Sort each column by its length. These are your *original*
    # columns, not the principal components.
    important_features = {columns[i]: math.sqrt(xvector[i]**2 + yvector[i]**2) for i in range(num_columns)}
    important_features = sorted(zip(important_features.values(), important_features.keys()), reverse=True)
    print("Features by importance:\n", important_features)

    ax = plt.axes()
    for i in range(num_columns):
        # Use an arrow to project each original feature as a
        # labeled vector on your principal component axes
        plt.arrow(0, 0, xvector[i], yvector[i], color='b', width=0.0005, head_width=0.02, alpha=0.75)
        plt.text(xvector[i]*1.2, yvector[i]*1.2, list(columns)[i], color='b', alpha=0.75)

    return ax