Class: EDA & feature engineering using PCA

Class: Dimension Reduction Plot Components (DRPC)

I created this class to quickly implement versions of PCA and develop intuition through plotting and examining the principal components. The syntax follows scikit-learn's philosophy, with a few modifications to improve the workflow for the specific uses of this class.

Getting started is very simple:

from reducedimensions import DRPC
X = df.values
names = df.columns
drpc = DRPC()
drpc.fit_transform(X, names=names)
drpc.plot_embedding()
print drpc.components()
  1. Instantiate the class using DRPC() (Dimension Reduction Plot Components).
  2. fit_transform(X, names=names) takes a two-dimensional numpy array and runs it through a pipeline, which scales, clusters columns (optionally) and applies PCA with 3 components by default.
  • Customize PCA by passing a model like so: DRPC(model=PCA(n_components=5)).
  • scikit-learn's SparsePCA, TruncatedSVD, and NMF can also be passed as the model (see the sketch after this list).
  3. Use plot_embedding() to plot the observations using the components created in fit_transform().
  4. Use components() to return a dataframe containing the principal components as columns and the column names as rows. This is useful for interpreting the meaning of each component, especially when using SparsePCA.
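
For example, here is a minimal sketch of swapping in a sparse model (reusing X and names from the quick-start above; n_components=5 is an arbitrary choice):

from sklearn.decomposition import SparsePCA
from reducedimensions import DRPC

# Any scikit-learn decomposition model exposing components_ should work here.
drpc = DRPC(model=SparsePCA(n_components=5))
drpc.fit_transform(X, names=names)
print drpc.components()  # sparse loadings make each component easier to interpret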

Please look through the comments in the code to see all the functionality of the various methods. I will have an example IPython Notebook up soon, showing how I used this class for EDA and feature engineering. This class uses another class that I built to do column-wise hierarchical clustering. Click here for information on how it works.
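
And a rough sketch of the optional column clustering (I am assuming here that thresh tunes how aggressively CWHC merges columns; its exact meaning is defined in that class):

from reducedimensions import DRPC

# cluster_model=True prepends CWHC to the pipeline, so similar columns are
# grouped before scaling and PCA; thresh is forwarded to CWHC (assumed to
# control how aggressively columns are merged).
drpc = DRPC(cluster_model=True, thresh=0.5)
drpc.fit_transform(X, names=names)
print drpc.components()  # index holds the merged column names from CWHC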

Namaste :)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from mpl_toolkits.mplot3d import Axes3D
from colwisecluster import CWHC
import psycopg2 as pg2


class DRPC(BaseEstimator, TransformerMixin):
    """Make 2D or 3D plots of principal components from scikit-learn's PCA or
    SparsePCA models. Option to use scikit-learn's KMeans model to color observations.

    Parameters
    ----------
    model : object, optional
        A scikit-learn decomposition model such as PCA, SparsePCA, TruncatedSVD
        or NMF. Defaults to PCA(n_components=3).
    scaler : object, optional
        A scikit-learn scaler. Defaults to StandardScaler().
    cluster_model : bool, optional
        If True, prepend column-wise hierarchical clustering (CWHC) to the pipeline.
    thresh : float, optional
        Threshold passed to CWHC when cluster_model is True.
    """
    def __init__(self, model=None, scaler=None, cluster_model=False, thresh=0.5):
        '''
        Set the model, scaler and pipeline used for dimension reduction.
        '''
        if not model:
            self.model = PCA(n_components=3)
        else:
            self.model = model
        if not scaler:
            self.scaler = StandardScaler()
        else:
            self.scaler = scaler
        self.cluster_model = cluster_model
        self.thresh = thresh
        if cluster_model:
            self.pipeline = Pipeline([('cluster_model', CWHC(thresh=self.thresh)), ('scaler', self.scaler), ('model', self.model)])
        else:
            self.pipeline = Pipeline([('scaler', self.scaler), ('model', self.model)])
    def fit(self, X, names=None):
        """Fit the model with X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        if isinstance(X, pd.DataFrame):
            X = X.values
        if self.cluster_model and names is not None:
            self.pipeline.named_steps['cluster_model'].names = np.array(names)
        elif names is not None:
            self.names = np.array(names)
        self._fit(X)
        return self
    def _fit(self, X):
        self.X_new = self.pipeline.fit_transform(X)
        if self.cluster_model:
            self.names = self.pipeline.named_steps['cluster_model'].new_names
        return self.X_new
    def fit_transform(self, X, names=None):
        """Fit the model with X and apply the dimensionality reduction on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)
        """
        if isinstance(X, pd.DataFrame):
            X = X.values
        if self.cluster_model and names is not None:
            self.pipeline.named_steps['cluster_model'].names = np.array(names)
        elif names is not None:
            self.names = np.array(names)
        self.X_new = self._fit(X)
        return self.X_new
    def transform(self, X):
        """Apply the dimensionality reduction on X.
        X columns are grouped into the clusters previously extracted
        from a training set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            New data, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)
        """
        if isinstance(X, pd.DataFrame):
            X = X.values
        self.X_new = self.pipeline.transform(X)
        return self.X_new
    def components(self):
        """
        Extract the principal components in a dataframe with the column names as the index.

        Returns
        -------
        df_c : dataframe, shape (n_names, n_components)
        """
        self.df_c = pd.DataFrame(self.pipeline.named_steps['model'].components_.T, index=self.names, columns=range(1, self.pipeline.named_steps['model'].n_components + 1))
        return self.df_c
    def _cluster(self, n_cluster_list):
        """
        Apply k-means clustering to the rows of X. Uses silhouette scores to determine the best
        clustering from n_cluster_list. Used in plot_embedding for color encoding.

        Parameters
        ----------
        n_cluster_list : array, shape (len(n_cluster_list),)
            List of integers, which will be used as the n_clusters parameter in sklearn's KMeans algorithm.

        Returns
        -------
        self.best : tuple, (n_clusters, silhouette_avg, cluster_labels)
            A tuple of information on the best clustering.
        """
        best = (0, 0, 0)
        for i in n_cluster_list:
            clusterer = KMeans(n_clusters=i)
            cluster_labels = clusterer.fit_predict(self.X_new)
            silhouette_avg = silhouette_score(self.X_new, cluster_labels)
            if abs(silhouette_avg) > best[1]:
                best = i, silhouette_avg, cluster_labels
            print "For n_clusters =", i, "the average silhouette_score is:", silhouette_avg
        self.best = best
        return self.best
    def plot_embedding(self, dimensions=2, row_names=None, figsize=(12, 12), name_lim=15, cluster_list=False, fontsize=8):
        """
        Plot 2D or 3D embeddings of the transformed data based on the principal components.

        Parameters
        ----------
        dimensions : int, 2 or 3
        row_names : array, shape (self.X_new.shape[0],)
            Labels to be plotted for each observation.
        figsize : tuple, shape (2,)
        name_lim : int
            Limit the length of each name in row_names.
        cluster_list : list of ints, optional
            Apply sklearn's KMeans clustering to cluster and then color observations.
        fontsize : int
        """
        if cluster_list:
            best = self._cluster(cluster_list)
            y = best[2]
        else:
            y = np.zeros(self.X_new.shape[0])
        if row_names is None:
            row_names = np.chararray(self.X_new.shape[0])
            row_names[:] = '*'
        X = self.X_new
        if dimensions == 3:
            fig = plt.figure(figsize=figsize, dpi=250)
            ax = fig.add_subplot(111, projection='3d')
            for i in range(X.shape[0]):
                ax.text(X[i, 0], X[i, 1], X[i, 2], str(row_names[i][0:name_lim]), color=plt.cm.Set1(y[i] / 10.), fontsize=fontsize)
            ax.set_xlim3d(X[:, 0].min(), X[:, 0].max())
            ax.set_ylim3d(X[:, 1].min(), X[:, 1].max())
            ax.set_zlim3d(X[:, 2].min(), X[:, 2].max())
            ax.set_xlabel('X Label')
            ax.set_ylabel('Y Label')
            ax.set_zlabel('Z Label')
        elif dimensions == 2:
            plt.figure(figsize=figsize, dpi=250)
            ax = plt.subplot(111)
            for i in range(X.shape[0]):
                ax.text(X[i, 0], X[i, 1], str(row_names[i][0:name_lim]), color=plt.cm.Set1(y[i] / 10.), fontsize=fontsize)
            ax.set_xlim(X[:, 0].min(), X[:, 0].max())
            ax.set_ylim(X[:, 1].min(), X[:, 1].max())
            ax.set_xlabel('X Label')
            ax.set_ylabel('Y Label')
        plt.show()

if __name__ == '__main__':
    # Connect to psql database
    conn = pg2.connect(dbname='lastfm', user='evansadler', host='/tmp')
    c = conn.cursor()
    query = 'SELECT * FROM sample;'
    df_t = pd.read_sql_query(query, conn)
    df_piv = df_t.groupby(['userid', 'artist'])['plays'].mean().reset_index().pivot(index='userid', columns='artist', values='plays')
    df_piv = df_piv[df_piv < 1000]
    summary = df_piv.dropna(thresh=70, axis=1)
    summary = summary.fillna(0)
    names = list(summary.columns)
    X = summary.values
    # ss = StandardScaler()
    # X = ss.fit_transform(X)
    clf = DRPC(cluster_model=True)
    print clf.fit_transform(X, names=names)
    print clf.components()
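    # Illustrative extension (an assumption, not in the original run): color a
    # 2D embedding by the best KMeans clustering found among 2-5 clusters.
    clf.plot_embedding(dimensions=2, cluster_list=[2, 3, 4, 5])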