# Gist by @SherazKhan, created April 25, 2022
import matplotlib
matplotlib.use('Qt5Agg')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.preprocessing import MinMaxScaler
from itertools import cycle
from numpy import interp
import itertools
from sklearn.ensemble import ExtraTreesClassifier
from pca import pca
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
plt.ion()
plt.rcParams.update({'font.size': 22})
def autolabel(rects, ax, format_str='{:.3f}'):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate(format_str.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')
def pca_diagnostics(X, n_components=2, n_feat=2):
    """Fit the pca package's model and show its explained-variance, scatter and biplot figures."""
    model = pca(n_components=n_components)
    results = model.fit_transform(X)
    fig, ax = model.plot()
    fig, ax = model.scatter()
    fig, ax = model.biplot(n_feat=n_feat)
    return model, results
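# Minimal usage sketch for pca_diagnostics. Assumptions: demo_pca_diagnostics is a hypothetical
# helper not in the original gist, and random data stands in for the feature matrix that the
# gist would normally obtain from read_data below.
def demo_pca_diagnostics():
    rng = np.random.RandomState(0)
    X_demo = pd.DataFrame(rng.rand(200, 6), columns=['feat{}'.format(i) for i in range(6)])
    pca_diagnostics(X_demo, n_components=3, n_feat=3)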
def read_data(X_csv, y_csv, drop_columns=['startSample', 'endSample', 'channel'], scale=True, so_nan_avoid=False):
    """Read feature and label CSVs, drop bookkeeping columns, remove NaN rows and optionally min-max scale."""
    data = pd.read_csv(X_csv)
    data.drop(drop_columns, axis=1, inplace=True)
    features = np.array(data.keys())
    labels = pd.read_csv(y_csv)
    y = labels.to_numpy().ravel()
    X = data.to_numpy()
    if so_nan_avoid:
        # Only the first 14 columns are checked for NaNs
        not_nan_ind = ~np.isnan(X[:, :14].mean(1))
    else:
        not_nan_ind = ~np.isnan(X.mean(1))
    X = X[not_nan_ind, :]
    y = y[not_nan_ind]
    if scale:
        scaler = MinMaxScaler().fit(X)
        X = scaler.transform(X)
    else:
        scaler = None
    return X, y, features, scaler
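# Minimal usage sketch for read_data. Assumptions: demo_read_data is a hypothetical helper,
# and 'features.csv' / 'labels.csv' are placeholder file names for the gist's real inputs,
# which must contain the startSample/endSample/channel columns dropped above.
def demo_read_data():
    X, y, features, scaler = read_data('features.csv', 'labels.csv', scale=True)
    print(X.shape, y.shape, features[:5])
    return X, y, features, scaler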
def plot_confusion_matrix(cm, classes, ax,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        # Normalize each row (true class) so entries are fractions rather than counts
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print(cm)
    print('')
    ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.set_title(title)
    tick_marks = np.arange(len(classes))
    plt.sca(ax)
    plt.xticks(tick_marks, classes, rotation=-45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        ax.text(j, i, format(cm[i, j], fmt),
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
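# Minimal usage sketch for plot_confusion_matrix. Assumptions: demo_confusion_matrix is a
# hypothetical helper, and random labels stand in for a fitted classifier's held-out predictions.
def demo_confusion_matrix():
    from sklearn.metrics import confusion_matrix
    rng = np.random.RandomState(0)
    y_true = rng.randint(1, 5, size=200)
    y_pred = rng.randint(1, 5, size=200)
    cm = confusion_matrix(y_true, y_pred, labels=[1, 2, 3, 4])
    fig, ax = plt.subplots(figsize=(8, 8))
    plot_confusion_matrix(cm, classes=[1, 2, 3, 4], ax=ax, normalize=True,
                          title='Confusion matrix (normalized)')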
def plot_feature_corr(X, features, stem='train', save=False):
    """Plot a heatmap of feature correlations, masking the diagonal and weak (|r| <= 0.5) entries."""
    df = pd.DataFrame(X)
    fig, ax = plt.subplots(figsize=(25, 25))
    corr = df.corr()
    corr[corr == 1] = 0  # zero out the diagonal (self-correlations)
    corr[(corr <= 0.5) & (corr >= -0.5)] = 0  # suppress weak correlations
    hmap = sns.heatmap(corr, ax=ax, vmin=-1, vmax=1,
                       cmap=sns.diverging_palette(0, 230, 90, 60, as_cmap=True),
                       annot=True, xticklabels=features, yticklabels=features)
    hmap.set_xticklabels(features, rotation=30)
    hmap.set_yticklabels(features, rotation=0)
    mng = plt.get_current_fig_manager()
    # mng.window.state('zoomed')  # maximize the window (TkAgg backend only)
    if save:
        fig.savefig(stem + '_corr_plot.png')
def plot_pca(X, save=False):
    """Plot individual and cumulative explained-variance ratios from the covariance eigendecomposition."""
    cov = np.cov(X.T)
    # eigh is appropriate for a symmetric covariance matrix and guarantees real eigenvalues
    eigen_values, eigen_vectors = np.linalg.eigh(cov)
    tot = sum(eigen_values)
    var_exp = [(i / tot) for i in sorted(eigen_values, reverse=True)]
    cum_var_exp = np.cumsum(var_exp)
    fig, ax = plt.subplots(figsize=(25, 25))
    ax.bar(range(X.shape[1]), var_exp, alpha=0.5, align='center', label='Individual variance explained')
    ax.step(range(X.shape[1]), cum_var_exp, where='mid', label='Cumulative variance explained')
    ax.legend(loc='best')
    ax.axis('tight')
    ax.set_xlabel('n_components')
    ax.set_ylabel('explained variance ratio')
    mng = plt.get_current_fig_manager()
    # mng.window.state('zoomed')  # maximize the window (TkAgg backend only)
    if save:
        fig.savefig('pca_plot.png')
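# Minimal usage sketch for plot_feature_corr and plot_pca. Assumptions: demo_feature_diagnostics
# is a hypothetical helper, and random data stands in for the scaled matrix and feature names
# that read_data returns.
def demo_feature_diagnostics():
    rng = np.random.RandomState(0)
    X_demo = rng.rand(300, 6)
    features_demo = np.array(['feat{}'.format(i) for i in range(6)])
    plot_feature_corr(X_demo, features_demo, stem='demo', save=False)
    plot_pca(X_demo, save=False)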
def plot_feature_importance(X, y, features, save=False):
    """Rank features by tree-based, F-test, mutual-information and combined importance, plotting each ranking."""
    # Tree-based (non-parametric) importance
    forest = ExtraTreesClassifier(n_estimators=1000, random_state=0)
    forest.fit(X, y)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]
    fig, ax = plt.subplots(figsize=(25, 25))
    ax.set_title("Feature importance - Trees (Non-Parametric)")
    ax.bar(x=features[indices], height=importances[indices],
           color="r", yerr=std[indices], align="center")
    ax.set_xticklabels(features[indices], rotation=45)
    ax.set_xlabel('Features')
    ax.set_ylabel('Importance')
    mng = plt.get_current_fig_manager()
    # mng.window.state('zoomed')  # maximize the window (TkAgg backend only)
    if save:
        fig.savefig('importance-trees.png')
    # ANOVA F-test (linear) importance
    f_test, _ = f_classif(X, y)
    f_test /= np.max(f_test)
    indices_ft = np.argsort(f_test)[::-1]
    # Mutual information (non-linear) importance
    mi = mutual_info_classif(X, y)
    mi /= np.max(mi)
    indices_mi = np.argsort(mi)[::-1]
    fig, ax = plt.subplots(figsize=(25, 25))
    ax.set_title("Feature importance - Linear")
    ax.bar(x=features[indices_ft], height=f_test[indices_ft],
           color="r", align="center")
    ax.set_xticklabels(features[indices_ft], rotation=45)
    ax.set_xlabel('Features')
    ax.set_ylabel('Importance')
    mng = plt.get_current_fig_manager()
    # mng.window.state('zoomed')
    if save:
        fig.savefig('importance-linear.png')
    fig, ax = plt.subplots(figsize=(25, 25))
    ax.set_title("Feature importance - Mutual Information (Non-Linear)")
    ax.bar(x=features[indices_mi], height=mi[indices_mi],
           color="r", align="center")
    ax.set_xticklabels(features[indices_mi], rotation=45)
    ax.set_xlabel('Features')
    ax.set_ylabel('Importance')
    mng = plt.get_current_fig_manager()
    # mng.window.state('zoomed')
    if save:
        fig.savefig('importance-mi.png')
    # Combined importance: sum of the three normalized rankings, rescaled to a 0-1 range
    importances /= np.max(importances)
    comb_imp = importances + mi + f_test
    comb_imp /= np.max(comb_imp)
    indices_ci = np.argsort(comb_imp)[::-1]
    fig, ax = plt.subplots(figsize=(25, 25))
    ax.set_title("Feature importance - Combined")
    ax.bar(x=features[indices_ci], height=comb_imp[indices_ci],
           color="r", align="center")
    ax.set_xticklabels(features[indices_ci], rotation=45)
    ax.set_xlabel('Features')
    ax.set_ylabel('Importance')
    mng = plt.get_current_fig_manager()
    # mng.window.state('zoomed')
    if save:
        fig.savefig('importance-ci.png')
    return indices_ci
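# Minimal usage sketch for plot_feature_importance. Assumptions: demo_feature_importance is a
# hypothetical helper, and make_classification stands in for the real feature matrix; with
# n_estimators=1000 the tree fit can take a little while.
def demo_feature_importance():
    from sklearn.datasets import make_classification
    X_demo, y_demo = make_classification(n_samples=300, n_features=8, n_informative=4,
                                         n_classes=4, random_state=0)
    features_demo = np.array(['feat{}'.format(i) for i in range(8)])
    ranked = plot_feature_importance(X_demo, y_demo, features_demo, save=False)
    print('Features ranked by combined importance:', features_demo[ranked])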
def get_class_weights(y):
    """Return a dict mapping each class label to its 'balanced' class weight."""
    from sklearn.utils import class_weight
    y = np.asarray(y).ravel()
    classes = np.unique(y)
    weights = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y)
    return {cls: float(weight) for cls, weight in zip(classes, weights)}
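# Minimal usage sketch for get_class_weights. Assumptions: demo_class_weights is a hypothetical
# helper; the returned dict would typically be passed to a classifier's class_weight argument.
def demo_class_weights():
    y_demo = np.array([1] * 50 + [2] * 30 + [3] * 15 + [4] * 5)  # imbalanced labels
    weights = get_class_weights(y_demo)
    print(weights)  # rarer classes receive larger weights
    return weights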
def roc_analysis_full(Y_test, Y_scores, classes=(1, 2, 3, 4)):
    # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
    # Compute ROC curve and ROC area for each class
    Y_test = label_binarize(Y_test, classes=classes)  # one-hot encode the labels
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(len(classes)):
        fpr[i], tpr[i], _ = roc_curve(Y_test[:, i], Y_scores[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(Y_test.ravel(), Y_scores.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(len(classes))]))
    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(len(classes)):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])
    # Finally average it and compute AUC
    mean_tpr /= len(classes)
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    # Plot all ROC curves
    lw = 2
    plt.figure(figsize=(20, 15))
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.4f})'.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)
    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.4f})'.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green', 'red', 'blue', 'black'])
    for i, color in zip(range(len(classes)), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label='ROC curve of class {0} (area = {1:0.4f})'.format(classes[i], roc_auc[i]))
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Extension of Receiver operating characteristic to multi-class')
    plt.legend(loc="lower right")
    plt.show()
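# Minimal usage sketch for roc_analysis_full. Assumptions: demo_roc_analysis is a hypothetical
# helper, and a RandomForestClassifier stands in for whatever model the gist evaluates; Y_scores
# must be per-class probabilities or scores with one column per entry in `classes`.
def demo_roc_analysis():
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    X_demo, y_demo = make_classification(n_samples=600, n_features=10, n_informative=6,
                                         n_classes=4, random_state=0)
    y_demo = y_demo + 1  # shift labels to 1..4 to match the default `classes` argument
    X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=0, stratify=y_demo)
    clf = RandomForestClassifier(n_estimators=200, random_state=0).fit(X_tr, y_tr)
    Y_scores = clf.predict_proba(X_te)  # columns follow clf.classes_, i.e. 1, 2, 3, 4
    roc_analysis_full(y_te, Y_scores, classes=(1, 2, 3, 4))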