#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 21 08:29:57 2017
@author: tp53
"""
import pandas as pd
import numpy as np
#read the data and do some wrangling
#a row for the response was added to the CSV beforehand
#naming: metf = metformin, n = normalized, r = raw data (straight from the CSV,
#before the transpose (.T) and the other manipulations below)
metf_n_r_data = pd.read_csv("./data/metf_n_resp.csv")
thiog_n_data = pd.read_csv("./data/THIOG_normalized.csv")  #loaded but not used below
res_csv = pd.read_csv('./data/response.csv')
res_data = res_csv['METFORMIN']
#build the target
target = res_data
#transpose metf_n_r_data so features are in columns and samples (patient data) are rows
metf_n_r_t_data = metf_n_r_data.T
#drop the response column and keep the rest of the sample data as the predictors
#(work on a copy, df0, since metf_n_r_t_data is too long a name to keep typing)
df0 = metf_n_r_t_data.copy(deep=True)
df_no_r = df0.drop(df0.columns[[0]], axis=1)
#.as_matrix() was removed in pandas 1.0; .values returns the same ndarray
predictors = df_no_r.values
'''
the first row of predictors is still a label row rather than numeric data,
so numeric operations will fail until it is stripped off below
'''
preds_wo_labels = predictors[1:]
'''
preds_wo_labels.head won't work here:
preds_wo_labels is a numpy.ndarray, not a DataFrame,
so wrap it in a DataFrame below for convenient inspection
'''
preds_wo_labels_df = pd.DataFrame(data=preds_wo_labels)
#cast to a float array (np.matrix is deprecated; a plain ndarray behaves better)
preds_wo_labels = np.asarray(preds_wo_labels, dtype=float)
preds_wo_labels
#NOTE: np.cov treats each row as a variable, so this is the sample-by-sample
#covariance; pass rowvar=False (or transpose) for the feature-by-feature version
cov_matrix = np.cov(preds_wo_labels)
'''
let's compute the eigenvalues and eigenvectors of the covariance matrix
'''
eig_vals, eig_vecs = np.linalg.eig(cov_matrix)
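# Sanity check (optional, my addition, not in the original gist): every
# eigenpair should satisfy cov_matrix @ v ~= lambda * v; a cheap way to
# catch a transposed or mangled input matrix.
for i in range(len(eig_vals)):
    v = eig_vecs[:, i]
    assert np.allclose(cov_matrix @ v, eig_vals[i] * v), 'eigenpair %d is off' % i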
#quick REPL-style inspection (the script was run interactively in Spyder)
eig_vals
#eig_vecs
preds_wo_labels[0]
target
# pair each eigenvalue (absolute value) with its eigenvector,
# then sort by decreasing eigenvalue so the leading component comes first
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]
eig_pairs.sort(key=lambda x: x[0], reverse=True)
eig_pairs[0][0]
a = eig_pairs[0][1]
resp_np= np.array(target)
np.corrcoef(a, resp_np)
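# A small extension (my sketch, not in the original gist): correlate each of
# the leading eigenvectors with the response to see which one tracks it best.
# The cutoff of 5 is an arbitrary choice for illustration.
for i in range(min(5, len(eig_pairs))):
    r = np.corrcoef(np.real(eig_pairs[i][1]), resp_np)[0, 1]
    print('eigenvector %d vs response: r = %.3f' % (i, r))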
c1 = preds_wo_labels[:,0].T           #first feature column
c0 = np.mean(preds_wo_labels[:,0].T)  #its mean
c0_std = np.std(c1)                   #and its standard deviation
preds_wo_labels_df.head()
print('Eigenvalues in descending order:')
for i in eig_pairs:
    print(i[0])
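# From the same eigenvalues we can compute the explained variance by hand;
# a minimal sketch (my addition) mirroring what sklearn's PCA reports below.
tot = sum(p[0] for p in eig_pairs)
var_exp = [p[0] / tot for p in eig_pairs]
cum_var_exp = np.cumsum(var_exp)
print('first 5 explained-variance ratios:', var_exp[:5])
print('cumulative:', cum_var_exp[:5])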
'''
http://blog.districtdatalabs.com/principal-component-analysis-with-python
To get a better idea of how the principal components describe the variance
in the data, we will look at the explained variance ratio of the first two
principal components.
'''
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca
pca.fit_transform(preds_wo_labels_df)
print(pca.explained_variance_ratio_)
#explained variance on standardized data
X_std = StandardScaler().fit_transform(preds_wo_labels_df)
pca = PCA().fit(X_std)
import matplotlib.pyplot as plt
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()
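# A common follow-up (my sketch, not in the original gist): pick the smallest
# number of components that captures, say, 95% of the variance; the 0.95
# threshold is an arbitrary choice here.
cum = np.cumsum(pca.explained_variance_ratio_)
n_components_95 = int(np.argmax(cum >= 0.95)) + 1
print('components needed for 95%% of the variance: %d' % n_components_95)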
#re-fit keeping 21 components (presumably chosen from the cumulative-variance curve above)
pca = PCA(n_components=21)
pca.fit_transform(preds_wo_labels_df)
print(pca.explained_variance_ratio_)
#t-SNE next (TSNE, StandardScaler and PCA were already imported above)
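# TSNE is imported above but never actually used in the gist; a minimal,
# hedged sketch of how it could be applied to the standardized matrix X_std.
# Perplexity 30 is sklearn's default (it must stay below the sample count).
tsne = TSNE(n_components=2, perplexity=30, random_state=0)
X_tsne = tsne.fit_transform(X_std)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=np.array(target), cmap='coolwarm')
plt.title('t-SNE embedding colored by METFORMIN response')
plt.show()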
'''
Summarizing the PCA approach
Listed below are the 6 general steps for performing a principal component
analysis (a short code sketch of these steps follows after this list):
1. Take the whole dataset consisting of d-dimensional samples, ignoring the class labels
2. Compute the d-dimensional mean vector (i.e., the mean of every dimension of the whole dataset)
3. Compute the scatter matrix (alternatively, the covariance matrix) of the whole data set
4. Compute the eigenvectors (e_1, e_2, ..., e_d) and corresponding eigenvalues (lambda_1, lambda_2, ..., lambda_d)
5. Sort the eigenvectors by decreasing eigenvalue and choose the k eigenvectors with the largest eigenvalues to form a d x k matrix W (where every column represents an eigenvector)
6. Use this d x k eigenvector matrix to transform the samples onto the new subspace, via y = W^T x (where x is a d x 1 vector representing one sample, and y is the transformed k x 1 sample in the new subspace)
'''
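# A compact sketch (my addition) of those six steps on the standardized matrix
# X_std, samples in rows. Note: with many more features than samples the d x d
# covariance matrix gets large and np.linalg.eig gets slow; sklearn's PCA uses
# an SVD instead, which is why the fits above run much faster.
mean_vec = np.mean(X_std, axis=0)        # step 2: mean vector
cov_mat = np.cov(X_std, rowvar=False)    # step 3: feature covariance
vals, vecs = np.linalg.eig(cov_mat)      # step 4: eigen-decomposition
order = np.argsort(np.real(vals))[::-1]  # step 5: sort by eigenvalue
k = 2
W = np.real(vecs[:, order[:k]])          # d x k projection matrix W
Y = X_std @ W                            # step 6: y = W^T x, per sample
print('projected shape:', Y.shape)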
'''
let's plot the data in 3-D
'''
#plt was already imported above; pull in the 3-D plotting pieces
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
from matplotlib.patches import FancyArrowPatch
class Arrow3D(FancyArrowPatch):
    def __init__(self, xs, ys, zs, *args, **kwargs):
        FancyArrowPatch.__init__(self, (0, 0), (0, 0), *args, **kwargs)
        self._verts3d = xs, ys, zs

    def draw(self, renderer):
        xs3d, ys3d, zs3d = self._verts3d
        xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M)
        self.set_positions((xs[0], ys[0]), (xs[1], ys[1]))
        FancyArrowPatch.draw(self, renderer)
fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111, projection='3d')
plt.rcParams['legend.fontsize'] = 10
'''
let's separate the samples into class 1 (response = 0)
and class 2 (response = 1)
'''
# the gist never defines class1_sample / class2_sample; a plausible
# reconstruction (my assumption): project onto the first 3 principal
# components and split by response, with samples as columns
Y3 = PCA(n_components=3).fit_transform(X_std)
class1_sample = Y3[np.array(target) == 0].T
class2_sample = Y3[np.array(target) == 1].T
ax.plot(class1_sample[0,:], class1_sample[1,:], class1_sample[2,:], 'o', markersize=8, color='blue', alpha=0.5, label='class1')
ax.plot(class2_sample[0,:], class2_sample[1,:], class2_sample[2,:], '^', markersize=8, alpha=0.5, color='red', label='class2')
plt.title('Samples for class 1 and class 2')
ax.legend(loc='upper right')
plt.show()
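# Arrow3D is defined above but never instantiated in the gist. A minimal usage
# sketch (my addition): draw an arrow along a hypothetical direction on a fresh
# 3-D axis, the way PCA tutorials overlay eigenvector arrows on a scatter.
fig2 = plt.figure(figsize=(8, 8))
ax2 = fig2.add_subplot(111, projection='3d')
v = np.array([1.0, 2.0, 3.0])  # hypothetical direction, e.g. a leading eigenvector
arrow = Arrow3D([0, v[0]], [0, v[1]], [0, v[2]],
                mutation_scale=20, lw=2, arrowstyle='-|>', color='k')
ax2.add_artist(arrow)
ax2.set_xlim(0, 3)
ax2.set_ylim(0, 3)
ax2.set_zlim(0, 3)
plt.show()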