# Missing Value Ratio: If the dataset has too many missing values, we use this approach to reduce the number of variables: drop the variables whose share of missing values exceeds a chosen threshold
# Low Variance Filter: We apply this approach to identify and drop near-constant variables from the dataset. Variables with very low variance carry little information about the target, so they can be safely dropped
# High Correlation Filter: A pair of highly correlated variables increases multicollinearity in the dataset, so we can use this technique to find highly correlated features and drop one of each pair
# Random Forest: One of the most commonly used techniques; it tells us the importance of each feature in the dataset, so we can keep only the most important features, reducing dimensionality
# Backward Feature Elimination and Forward Feature Selection both take a lot of computational time and are thus generally used on smaller datasets
# Factor Analysis: Best suited for situations where we have a set of highly correlated variables. It groups the variables by their correlations and represents each group with a single factor
# Principal Component Analysis: One of the most widely used techniques for linear data. It transforms the data into a set of components that each try to explain as much of the variance as possible
# Independent Component Analysis: We can use ICA to transform the data into statistically independent components that describe the data using fewer components
# ISOMAP: We use this technique when the data is strongly non-linear
# t-SNE: Also works well when the data is strongly non-linear, and is especially well suited to visualization
# UMAP: Works well for high-dimensional data, with a shorter runtime than t-SNE
# 1. Missing Value Ratio
# import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# read the data
train = pd.read_csv("Train_UWu5bXk.csv")
# checking the percentage of missing values in each variable
train.isnull().sum()/len(train)*100
# saving the missing-value percentages in a variable
a = train.isnull().sum()/len(train)*100
# saving column names in a variable
variables = train.columns
variable = []
for col in variables:
    if a[col] <= 20:    # setting the threshold as 20%
        variable.append(col)
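# To actually apply the filter, keep only the columns that passed the
# threshold. A minimal sketch (train_reduced is a hypothetical name), done on
# a copy so the later steps, which impute rather than drop these columns,
# still see the full DataFrame:
train_reduced = train[variable]
train_reduced.shape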
# 2. Low Variance Filter
# impute the missing values before computing variances
train['Item_Weight'] = train['Item_Weight'].fillna(train['Item_Weight'].median())
train['Outlet_Size'] = train['Outlet_Size'].fillna(train['Outlet_Size'].mode()[0])
train.isnull().sum()/len(train)*100
train.var(numeric_only=True)
numeric = train[['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year']]
var = numeric.var()
numeric = numeric.columns
variable = []
for i in range(len(var)):
    if var.iloc[i] >= 10:    # setting the variance threshold at 10
        variable.append(numeric[i])
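# The same filter in a single vectorized step (a sketch equivalent to the
# loop above): keep the labels of the columns whose variance clears the threshold
variable = var[var >= 10].index.tolist()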
# 3. High Correlation filter
df = train.drop('Item_Outlet_Sales', axis=1)
df.corr(numeric_only=True)    # numeric_only needs pandas >= 1.5; older versions skip non-numeric columns automatically
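# The original only inspects the correlation matrix; a minimal sketch of
# actually dropping one column from each highly correlated pair (the 0.8
# threshold and the df_uncorr name are assumptions for illustration):
corr = df.corr(numeric_only=True).abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))    # upper triangle, diagonal excluded
to_drop = [col for col in upper.columns if (upper[col] > 0.8).any()]
df_uncorr = df.drop(columns=to_drop)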
# 4. Random Forest
from sklearn.ensemble import RandomForestRegressor
df = df.drop(['Item_Identifier', 'Outlet_Identifier'], axis=1)
model = RandomForestRegressor(random_state=1, max_depth=10)
df = pd.get_dummies(df)
model.fit(df, train.Item_Outlet_Sales)
features = df.columns
importances = model.feature_importances_
indices = np.argsort(importances)[-10:]    # top 10 features
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
from sklearn.feature_selection import SelectFromModel
feature = SelectFromModel(model)
fit = feature.fit_transform(df, train.Item_Outlet_Sales)
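# To see which columns SelectFromModel kept, read its boolean support mask
# (a quick sketch using the selector fitted above):
selected = df.columns[feature.get_support()]
print(selected)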
# 5. Backward Feature Elimination
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
lreg = LinearRegression()
rfe = RFE(lreg, n_features_to_select=10)
transformed = rfe.fit_transform(df, train.Item_Outlet_Sales)    # keep the fitted selector in rfe so it can be inspected
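# The retained columns can be read off the selector's boolean mask
# (a sketch; support_ is set by RFE after fitting):
print(df.columns[rfe.support_])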
# 6. Forward Feature Selection
from sklearn.feature_selection import f_regression
ffs = f_regression(df, train.Item_Outlet_Sales)
variable = []
for i in range(len(df.columns)):
    if ffs[0][i] >= 10:    # keep features whose F-value clears an (arbitrary) threshold of 10
        variable.append(df.columns[i])
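# sklearn wraps this filter pattern directly; a sketch using SelectKBest
# with the same f_regression score function to keep the 10 best-scoring features:
from sklearn.feature_selection import SelectKBest
skb = SelectKBest(score_func=f_regression, k=10)
X_best = skb.fit_transform(df, train.Item_Outlet_Sales)
print(df.columns[skb.get_support()])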
# 7. Factor Analysis
import pandas as pd
import numpy as np
from glob import glob
import cv2
# read the training images from the train folder
images = [cv2.imread(file) for file in glob('train/*.png')]
images = np.array(images)
images.shape
image = []
for i in range(images.shape[0]):    # flatten each image into a 1-D pixel vector
    img = images[i].flatten()
    image.append(img)
image = np.array(image)
train = pd.read_csv("train.csv")    # give the complete path of your train.csv file
feat_cols = ['pixel'+str(i) for i in range(image.shape[1])]
df = pd.DataFrame(image, columns=feat_cols)
df['label'] = train['label']
from sklearn.decomposition import FactorAnalysis
FA = FactorAnalysis(n_components=3).fit_transform(df[feat_cols].values)
%matplotlib inline
import matplotlib.pyplot as plt
plt.figure(figsize=(12,8))
plt.title('Factor Analysis Components')
plt.scatter(FA[:,0], FA[:,1])
plt.scatter(FA[:,1], FA[:,2])
plt.scatter(FA[:,2], FA[:,0])
# 8. Principal Component Analysis (PCA)
rndperm = np.random.permutation(df.shape[0])
plt.gray()
fig = plt.figure(figsize=(20,10))
# show 15 randomly chosen images
for i in range(15):
    ax = fig.add_subplot(3, 5, i+1)
    ax.matshow(df.loc[rndperm[i], feat_cols].values.reshape((28, 28*3)).astype(float))
from sklearn.decomposition import PCA
pca = PCA(n_components=4)
pca_result = pca.fit_transform(df[feat_cols].values)
plt.plot(range(4), pca.explained_variance_ratio_)
plt.plot(range(4), np.cumsum(pca.explained_variance_ratio_))
plt.title("Component-wise and Cumulative Explained Variance")
import seaborn as sns
plt.style.use('fivethirtyeight')
fig, axarr = plt.subplots(2, 2, figsize=(12, 8))
# plot each principal component as a 28x84 image, titled with its share of explained variance
for i, ax in enumerate(axarr.flat):
    sns.heatmap(pca.components_[i, :].reshape(28, 84), ax=ax, cmap='gray_r')
    ax.set_title("{0:.2f}% Explained Variance".format(pca.explained_variance_ratio_[i]*100), fontsize=12)
    ax.set_aspect('equal')
plt.suptitle('4-Component PCA')
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=3, random_state=42).fit_transform(df[feat_cols].values)
plt.figure(figsize=(12,8))
plt.title('SVD Components')
plt.scatter(svd[:,0], svd[:,1])
plt.scatter(svd[:,1], svd[:,2])
plt.scatter(svd[:,2], svd[:,0])
# 9. Independent Component Analysis
from sklearn.decomposition import FastICA
ICA = FastICA(n_components=3, random_state=12)
X = ICA.fit_transform(df[feat_cols].values)
plt.figure(figsize=(12,8))
plt.title('ICA Components')
plt.scatter(X[:,0], X[:,1])
plt.scatter(X[:,1], X[:,2])
plt.scatter(X[:,2], X[:,0])
# 10. Methods Based on Projections (ISOMAP)
from sklearn import manifold
trans_data = manifold.Isomap(n_neighbors=5, n_components=3, n_jobs=-1).fit_transform(df[feat_cols][:6000].values)
plt.figure(figsize=(12,8))
plt.title('Decomposition using ISOMAP')
plt.scatter(trans_data[:,0], trans_data[:,1])
plt.scatter(trans_data[:,1], trans_data[:,2])
plt.scatter(trans_data[:,2], trans_data[:,0])
# 11. t-Distributed Stochastic Neighbor Embedding (t-SNE)
from sklearn.manifold import TSNE
tsne = TSNE(n_components=3, n_iter=300).fit_transform(df[feat_cols][:6000].values)
plt.figure(figsize=(12,8))
plt.title('t-SNE components')
plt.scatter(tsne[:,0], tsne[:,1])
plt.scatter(tsne[:,1], tsne[:,2])
plt.scatter(tsne[:,2], tsne[:,0])
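# Since t-SNE shines as a visualization tool, a 2-D embedding coloured by the
# class label is often more readable (a sketch on the same 6000-row subset;
# the tab10 colormap choice is an assumption):
tsne_2d = TSNE(n_components=2, n_iter=300).fit_transform(df[feat_cols][:6000].values)
plt.figure(figsize=(12,8))
plt.title('2-D t-SNE coloured by label')
plt.scatter(tsne_2d[:,0], tsne_2d[:,1], c=df['label'][:6000], s=5, cmap='tab10')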
# 12. UMAP
import umap
umap_data = umap.UMAP(n_neighbors=5, min_dist=0.3, n_components=3).fit_transform(df[feat_cols][:6000].values)
plt.figure(figsize=(12,8))
plt.title('Decomposition using UMAP')
plt.scatter(umap_data[:,0], umap_data[:,1])
plt.scatter(umap_data[:,1], umap_data[:,2])
plt.scatter(umap_data[:,2], umap_data[:,0])