# Missing Value Ratio: If the dataset has too many missing values, we use this approach to reduce the number of variables: drop the variables whose share of missing values exceeds a chosen threshold
# Low Variance Filter: We apply this approach to identify and drop near-constant variables from the dataset. Variables with very low variance carry little information about the target, so they can be safely dropped
# High Correlation Filter: A pair of highly correlated variables increases multicollinearity in the dataset, so we can use this technique to find highly correlated features and drop one of each pair
# Random Forest: One of the most commonly used techniques; it tells us the importance of each feature in the dataset, so we can keep only the most important features, reducing dimensionality
# Backward Feature Elimination and Forward Feature Selection both take a lot of computational time and are thus generally used on smaller datasets
# Factor Analysis: Best suited for situations where we have a set of highly correlated variables. It groups the variables by their correlations and represents each group with a single factor
# Principal Component Analysis: One of the most widely used techniques for linear data. It transforms the data into a set of components that each try to explain as much of the variance as possible
# Independent Component Analysis: We can use ICA to transform the data into statistically independent components that describe the data using fewer components
# ISOMAP: We use this technique when the data is strongly non-linear
# t-SNE: Also works well when the data is strongly non-linear, and is especially well suited to visualization
# UMAP: Works well for high-dimensional data, with a shorter runtime than t-SNE
# 1. Missing Value Ratio
# import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# read the data
train = pd.read_csv("Train_UWu5bXk.csv")
# checking the percentage of missing values in each variable
train.isnull().sum()/len(train)*100
# saving the missing-value percentages in a variable
a = train.isnull().sum()/len(train)*100
# saving column names in a variable
variables = train.columns
variable = []
for col in variables:
    if a[col] <= 20:    # setting the threshold as 20%
        variable.append(col)
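# To actually apply the filter, keep only the columns that passed the
# threshold. A minimal sketch (train_reduced is a hypothetical name), done on
# a copy so the later steps, which impute rather than drop these columns,
# still see the full DataFrame:
train_reduced = train[variable]
train_reduced.shape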
# 2. Low Variance Filter
# impute the missing values before computing variances
train['Item_Weight'] = train['Item_Weight'].fillna(train['Item_Weight'].median())
train['Outlet_Size'] = train['Outlet_Size'].fillna(train['Outlet_Size'].mode()[0])
train.isnull().sum()/len(train)*100
train.var(numeric_only=True)
numeric = train[['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year']]
var = numeric.var()
numeric = numeric.columns
variable = []
for i in range(len(var)):
    if var.iloc[i] >= 10:    # setting the variance threshold at 10
        variable.append(numeric[i])
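# The same filter in a single vectorized step (a sketch equivalent to the
# loop above): keep the labels of the columns whose variance clears the threshold
variable = var[var >= 10].index.tolist()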
# 3. High Correlation filter
df = train.drop('Item_Outlet_Sales', axis=1)
df.corr(numeric_only=True)    # numeric_only needs pandas >= 1.5; older versions skip non-numeric columns automatically
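# The original only inspects the correlation matrix; a minimal sketch of
# actually dropping one column from each highly correlated pair (the 0.8
# threshold and the df_uncorr name are assumptions for illustration):
corr = df.corr(numeric_only=True).abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))    # upper triangle, diagonal excluded
to_drop = [col for col in upper.columns if (upper[col] > 0.8).any()]
df_uncorr = df.drop(columns=to_drop)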
# 4. Random Forest
from sklearn.ensemble import RandomForestRegressor
df = df.drop(['Item_Identifier', 'Outlet_Identifier'], axis=1)
model = RandomForestRegressor(random_state=1, max_depth=10)
df = pd.get_dummies(df)
model.fit(df, train.Item_Outlet_Sales)
features = df.columns
importances = model.feature_importances_
indices = np.argsort(importances)[-10:]    # top 10 features
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
from sklearn.feature_selection import SelectFromModel
feature = SelectFromModel(model)
fit = feature.fit_transform(df, train.Item_Outlet_Sales)
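# To see which columns SelectFromModel kept, read its boolean support mask
# (a quick sketch using the selector fitted above):
selected = df.columns[feature.get_support()]
print(selected)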
# 5. Backward Feature Elimination
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
lreg = LinearRegression()
rfe = RFE(lreg, n_features_to_select=10)
transformed = rfe.fit_transform(df, train.Item_Outlet_Sales)    # keep the fitted selector in rfe so it can be inspected
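# The retained columns can be read off the selector's boolean mask
# (a sketch; support_ is set by RFE after fitting):
print(df.columns[rfe.support_])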
# 6. Forward Feature Selection
from sklearn.feature_selection import f_regression
ffs = f_regression(df, train.Item_Outlet_Sales)
variable = []
for i in range(len(df.columns)):
    if ffs[0][i] >= 10:    # keep features whose F-value clears an (arbitrary) threshold of 10
        variable.append(df.columns[i])
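# sklearn wraps this filter pattern directly; a sketch using SelectKBest
# with the same f_regression score function to keep the 10 best-scoring features:
from sklearn.feature_selection import SelectKBest
skb = SelectKBest(score_func=f_regression, k=10)
X_best = skb.fit_transform(df, train.Item_Outlet_Sales)
print(df.columns[skb.get_support()])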
# 7. Factor Analysis
import pandas as pd
import numpy as np
from glob import glob
import cv2
# read the training images from the train folder
images = [cv2.imread(file) for file in glob('train/*.png')]
images = np.array(images)
images.shape
image = []
for i in range(images.shape[0]):    # flatten each image into a 1-D pixel vector
    img = images[i].flatten()
    image.append(img)
image = np.array(image)
train = pd.read_csv("train.csv")    # give the complete path of your train.csv file
feat_cols = ['pixel'+str(i) for i in range(image.shape[1])]
df = pd.DataFrame(image, columns=feat_cols)
df['label'] = train['label']
from sklearn.decomposition import FactorAnalysis
FA = FactorAnalysis(n_components=3).fit_transform(df[feat_cols].values)
%matplotlib inline
import matplotlib.pyplot as plt
plt.figure(figsize=(12,8))
plt.title('Factor Analysis Components')
plt.scatter(FA[:,0], FA[:,1])
plt.scatter(FA[:,1], FA[:,2])
plt.scatter(FA[:,2], FA[:,0])
# 8. Principal Component Analysis (PCA)
rndperm = np.random.permutation(df.shape[0])
plt.gray()
fig = plt.figure(figsize=(20,10))
# show 15 randomly chosen images
for i in range(15):
    ax = fig.add_subplot(3, 5, i+1)
    ax.matshow(df.loc[rndperm[i], feat_cols].values.reshape((28, 28*3)).astype(float))
from sklearn.decomposition import PCA
pca = PCA(n_components=4)
pca_result = pca.fit_transform(df[feat_cols].values)
plt.plot(range(4), pca.explained_variance_ratio_)
plt.plot(range(4), np.cumsum(pca.explained_variance_ratio_))
plt.title("Component-wise and Cumulative Explained Variance")
import seaborn as sns
plt.style.use('fivethirtyeight')
fig, axarr = plt.subplots(2, 2, figsize=(12, 8))
# plot each principal component as a 28x84 image, titled with its share of explained variance
for i, ax in enumerate(axarr.flat):
    sns.heatmap(pca.components_[i, :].reshape(28, 84), ax=ax, cmap='gray_r')
    ax.set_title("{0:.2f}% Explained Variance".format(pca.explained_variance_ratio_[i]*100), fontsize=12)
    ax.set_aspect('equal')
plt.suptitle('4-Component PCA')
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=3, random_state=42).fit_transform(df[feat_cols].values)
plt.figure(figsize=(12,8))
plt.title('SVD Components')
plt.scatter(svd[:,0], svd[:,1])
plt.scatter(svd[:,1], svd[:,2])
plt.scatter(svd[:,2], svd[:,0])
# 9. Independent Component Analysis
from sklearn.decomposition import FastICA
ICA = FastICA(n_components=3, random_state=12)
X = ICA.fit_transform(df[feat_cols].values)
plt.figure(figsize=(12,8))
plt.title('ICA Components')
plt.scatter(X[:,0], X[:,1])
plt.scatter(X[:,1], X[:,2])
plt.scatter(X[:,2], X[:,0])
# 10. Methods Based on Projections (ISOMAP)
from sklearn import manifold
trans_data = manifold.Isomap(n_neighbors=5, n_components=3, n_jobs=-1).fit_transform(df[feat_cols][:6000].values)
plt.figure(figsize=(12,8))
plt.title('Decomposition using ISOMAP')
plt.scatter(trans_data[:,0], trans_data[:,1])
plt.scatter(trans_data[:,1], trans_data[:,2])
plt.scatter(trans_data[:,2], trans_data[:,0])
# 11. t-Distributed Stochastic Neighbor Embedding (t-SNE)
from sklearn.manifold import TSNE
tsne = TSNE(n_components=3, n_iter=300).fit_transform(df[feat_cols][:6000].values)
plt.figure(figsize=(12,8))
plt.title('t-SNE components')
plt.scatter(tsne[:,0], tsne[:,1])
plt.scatter(tsne[:,1], tsne[:,2])
plt.scatter(tsne[:,2], tsne[:,0])
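# Since t-SNE shines as a visualization tool, a 2-D embedding coloured by the
# class label is often more readable (a sketch on the same 6000-row subset;
# the tab10 colormap choice is an assumption):
tsne_2d = TSNE(n_components=2, n_iter=300).fit_transform(df[feat_cols][:6000].values)
plt.figure(figsize=(12,8))
plt.title('2-D t-SNE coloured by label')
plt.scatter(tsne_2d[:,0], tsne_2d[:,1], c=df['label'][:6000], s=5, cmap='tab10')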
# 12. UMAP
import umap
umap_data = umap.UMAP(n_neighbors=5, min_dist=0.3, n_components=3).fit_transform(df[feat_cols][:6000].values)
plt.figure(figsize=(12,8))
plt.title('Decomposition using UMAP')
plt.scatter(umap_data[:,0], umap_data[:,1])
plt.scatter(umap_data[:,1], umap_data[:,2])
plt.scatter(umap_data[:,2], umap_data[:,0])