@Mashimo
Last active July 21, 2020 16:57
PCA - Principal Component Analysis
Principal Component Analysis

Python examples of Principal Component Analysis

  • PCA_armadillo: From 3D rendering to 2D plot
  • PCA_kidney: reduce the dense kidney clinic study feature set to its two main components
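Both scripts follow the same basic scikit-learn pattern: fit a PCA estimator on the feature matrix, then project the data onto its first two principal components. Here is a minimal sketch of that pattern on a small made-up DataFrame (the toy data is for illustration only, not one of the datasets below):

import pandas as pd
from sklearn.decomposition import PCA

# toy data, purely illustrative
df = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0],
                   'b': [2.1, 3.9, 6.2, 8.1],
                   'c': [0.5, 0.4, 0.6, 0.5]})

pca = PCA(n_components=2)               # keep only the two strongest components
pca.fit(df)                             # learn the component directions
T = pca.transform(df)                   # NumPy array of shape (n_samples, 2)
print(pca.explained_variance_ratio_)    # variance captured by each component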
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import datetime
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection used below (needed on older matplotlib)
from plyfile import PlyData
from sklearn.decomposition import PCA
"""
The goal is to reduce the dimensionality of a 3D scan from three to two using PCA to cast a shadow of the data onto its two most
important principal components. Then render the resulting 2D scatter plot.
The scan is a real life armadillo sculpture scanned using a Cyberware 3030 MS 3D scanner at Stanford University.
The sculpture is available as part of their 3D Scanning Repository, and is a very dense 3D mesh consisting of 172974 vertices!
The PLY file is available at https://graphics.stanford.edu/data/3Dscanrep/
"""
# Every 100 data samples, we save 1. If things run too
# slow, try increasing this number. If things run too fast,
# try decreasing it... =)
reduce_factor = 100
# Look pretty...
matplotlib.style.use('ggplot')
# Load up the scanned armadillo
plyfile = PlyData.read('Datasets/stanford_armadillo.ply')
armadillo = pd.DataFrame({
    'x': plyfile['vertex']['z'][::reduce_factor],
    'y': plyfile['vertex']['x'][::reduce_factor],
    'z': plyfile['vertex']['y'][::reduce_factor]
})
def do_PCA(armadillo):
    #
    # Train PCA on the armadillo dataframe, then drop one dimension
    # (reduce it down to 2D) and project the armadillo down to the
    # 2D principal component feature space.
    #
    pca = PCA(n_components=2)
    pca.fit(armadillo)
    reducedArmadillo = pca.transform(armadillo)
    return reducedArmadillo
def do_RandomizedPCA(armadillo):
    #
    # Train PCA with the randomized (truncated) SVD solver on the
    # armadillo dataframe, then drop one dimension (reduce it down to 2D)
    # and project the armadillo down to the 2D principal component
    # feature space.
    #
    # NOTE: SKLearn deprecated the standalone RandomizedPCA class, but the
    # PCA class still supports the randomized (truncated) method via its
    # svd_solver parameter. Full docs:
    # http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
    #
    pca = PCA(n_components=2, svd_solver='randomized')
    pca.fit(armadillo)
    reducedArmadillo = pca.transform(armadillo)
    return reducedArmadillo
# Render the Original Armadillo
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.set_title('Armadillo 3D')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
ax.scatter(armadillo.x, armadillo.y, armadillo.z, c='green', marker='.', alpha=0.75)
# Time the execution of PCA 5000x.
# PCA is run 5000 times to reduce the chance that other (rogue)
# processes skew the measured execution speed.
t1 = datetime.datetime.now()
for i in range(5000): pca = do_PCA(armadillo)
time_delta = datetime.datetime.now() - t1
# Render the newly transformed PCA armadillo!
if pca is not None:
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_title('PCA, build time: ' + str(time_delta))
    ax.scatter(pca[:,0], pca[:,1], c='blue', marker='.', alpha=0.75)
# Time the execution of rPCA 5000x
t1 = datetime.datetime.now()
for i in range(5000): rpca = do_RandomizedPCA(armadillo)
time_delta = datetime.datetime.now() - t1
# Render the newly transformed RandomizedPCA armadillo!
if rpca is not None:
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_title('RandomizedPCA, build time: ' + str(time_delta))
    ax.scatter(rpca[:,0], rpca[:,1], c='red', marker='.', alpha=0.75)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import PCA_kidney_helper as helper
from sklearn.decomposition import PCA
"""
The UCI's Chronic Kidney Disease data set is a collection of samples taken from patients in India over a two month period,
some of whom were in the early stages of the disease. Available here: https://archive.ics.uci.edu/ml/datasets/Chronic_Kidney_Disease
Reduce the dataset to two principal components by running it through PCA.
"""
# Look pretty...
plt.style.use('ggplot')
scaleFeatures = True
# Load up the dataset and remove any and all rows that have a nan
df = pd.read_csv("Datasets/kidney_disease.csv", index_col='id')
df.dropna(inplace=True)
# Create some color coded labels; the actual label feature
# will be removed prior to executing PCA, since it's unsupervised.
# You're only labeling by color so you can see the effects of PCA
labelsCol = ['red' if i=='ckd' else 'green' for i in df.classification]
# Drop the (nominal) label column, then one-hot encode the remaining
# nominal columns
df.drop(labels=['classification'], axis=1, inplace=True)
df = pd.get_dummies(df, columns=['rbc', 'pc', 'pcc', 'ba', 'htn',
                                 'dm', 'cad', 'appet', 'pe', 'ane'])
print(df.dtypes)
"""
After adding in all the other features and properly encoding those that
need to be, you should notice that your separation between CKD and Non-CKD
patients increases significantly: by taking many secondary features and combining them, machine
learning is usually able to come up with more informative descriptions of
your data.
"""
# Check your dataframe's dtypes: a few columns that should be numeric
# (wc, rc, pcv) were read in as objects. You can verify this against the
# attribute info section of the dataset webpage:
# https://archive.ics.uci.edu/ml/datasets/Chronic_Kidney_Disease
# or by peeking at a few rows of the dataframe. Coerce them to numeric:
#
df.wc = pd.to_numeric(df.wc, errors='coerce')
df.rc = pd.to_numeric(df.rc, errors='coerce')
df.pcv = pd.to_numeric(df.pcv, errors='coerce')
print(df.dtypes)
# PCA operates based on variance. The variable with the greatest
# variance will dominate. Peek into your data by checking the variance
# of every feature, and also look at the output of .describe():
#
print(df.var())
print(df.describe())

if scaleFeatures: df = helper.scaleFeatures(df)
# Run PCA on your dataset and reduce it to 2 components
#
pca = PCA(n_components=2)
pca.fit(df)
T = pca.transform(df)
# Plot the transformed data as a scatter plot. Recall that transforming
# the data results in a NumPy NDArray. You can either use MatPlotLib
# to graph it directly, or you can convert it to a DataFrame and have
# pandas do it for you.
#
# Since we transformed via PCA, we no longer have column names. We know we
# are in P.C. space, so we'll just define the coordinates accordingly:
ax = helper.drawVectors(T, pca.components_, df.columns.values, plt, scaleFeatures)
T = pd.DataFrame(T)
T.columns = ['component1', 'component2']
T.plot.scatter(x='component1', y='component2', marker='o', c=labelsCol, alpha=0.75, ax=ax)
plt.show()
import math
import pandas as pd
from sklearn import preprocessing
# A Note on SKLearn .transform() calls:
#
# Any time you transform your data, you lose the column header names.
# This actually makes complete sense. There are essentially two types
# of transformations: those that change the scale of your features,
# and those that change your features entirely. Changing the scale would
# be like changing centimeters to inches. Changing the features would
# be like using PCA to reduce 300 columns to 30. In either case, the
# original columns' units have been altered or no longer exist, so it's
# up to you to rename your columns after ANY transformation. Due to
# this, SKLearn returns an NDArray from *transform() calls.
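# For example (a hypothetical sketch, not part of this helper): after a PCA
# transform the original columns no longer exist, so you choose new names
# for the returned NDArray yourself, e.g.
#
#   T = pca.transform(df)                                       # plain ndarray
#   T = pd.DataFrame(T, columns=['component1', 'component2'])   # named again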
def scaleFeatures(df):
    # SKLearn has many different methods for transforming your features
    # by scaling them (this is a type of pre-processing):
    # RobustScaler, Normalizer, MinMaxScaler, MaxAbsScaler, StandardScaler...
    # http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
    #
    # However, in order for PCA to be effective, there are a few requirements
    # that must be met, and these drive the selection of your scaler.
    # PCA requires your data to be standardized -- in other words, its mean
    # is equal to 0, and it has ~unit variance.
    #
    # SKLearn's regular Normalizer doesn't zero out the mean of your data,
    # it only clamps it, so it's inappropriate to use here (depending on
    # your data). MinMaxScaler and MaxAbsScaler both fail to set a unit
    # variance, so you won't be using them either. RobustScaler can work,
    # again depending on your data (watch for outliers). For these reasons
    # we're going to use the StandardScaler. Get familiar with it by visiting
    # these two websites:
    #
    # http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-scaler
    # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler
    #
    # ---------
    # Feature scaling is the type of transformation that only changes the
    # scale and not the number of features, so we'll reuse the original
    # dataset's column names. However, keep in mind that the _units_ have
    # been altered:
    scaled = preprocessing.StandardScaler().fit_transform(df)
    scaled = pd.DataFrame(scaled, columns=df.columns)

    print("New Variances:\n", scaled.var())
    print("New Describe:\n", scaled.describe())
    return scaled
def drawVectors(transformed_features, components_, columns, plt, scaled):
    if not scaled:
        return plt.axes()  # No cheating ;-)

    num_columns = len(columns)

    # This function projects your *original* features (columns) onto your
    # principal component feature-space, so that you can visualize how
    # "important" each one was in the multi-dimensional scaling.

    # Scale the principal components by the max value in
    # the transformed set belonging to that component
    xvector = components_[0] * max(transformed_features[:, 0])
    yvector = components_[1] * max(transformed_features[:, 1])

    ## Visualize projections
    # Sort each column by its length. These are your *original*
    # columns, not the principal components.
    important_features = {columns[i]: math.sqrt(xvector[i]**2 + yvector[i]**2) for i in range(num_columns)}
    important_features = sorted(zip(important_features.values(), important_features.keys()), reverse=True)
    print("Features by importance:\n", important_features)

    ax = plt.axes()
    for i in range(num_columns):
        # Use an arrow to project each original feature as a
        # labeled vector on your principal component axes
        plt.arrow(0, 0, xvector[i], yvector[i], color='b', width=0.0005, head_width=0.02, alpha=0.75)
        plt.text(xvector[i]*1.2, yvector[i]*1.2, list(columns)[i], color='b', alpha=0.75)

    return ax