Created
April 25, 2022 02:05
-
-
Save SherazKhan/b40cd1fc5509882ebacdee95d49c8ce7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib | |
matplotlib.use('Qt5Agg') | |
import numpy as np | |
import pandas as pd | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
from sklearn.feature_selection import f_classif, mutual_info_classif | |
from sklearn.preprocessing import MinMaxScaler | |
from itertools import cycle | |
from numpy import interp | |
import itertools | |
from sklearn.ensemble import ExtraTreesClassifier | |
from pca import pca | |
from sklearn.metrics import roc_curve, auc | |
from sklearn.preprocessing import label_binarize | |
# Session-wide matplotlib setup: enlarge the default font and switch pyplot
# to interactive mode.
plt.rcParams['font.size'] = 22
plt.ion()
def autolabel(rects, ax, format_str='{:.3f}'):
    """Annotate every bar in *rects* with its height, drawn just above the bar.

    The label text is the bar height rendered through *format_str*; it is
    centred horizontally on the bar and anchored at the bar's height.
    """
    label_style = dict(xytext=(0, 3),  # shift the text 3 points upward
                       textcoords="offset points",
                       ha='center', va='bottom')
    for bar in rects:
        bar_height = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2
        ax.annotate(format_str.format(bar_height),
                    xy=(bar_centre, bar_height),
                    **label_style)
def pca_diagnostics(X, n_components=2, n_feat=2):
    """Fit a ``pca`` model on *X* and draw its diagnostic figures.

    The model is built with *n_components* components, fitted on *X*, and
    then asked for its ``plot``, ``scatter`` and ``biplot`` figures in that
    order (``biplot`` receives *n_feat*).  Nothing is returned.
    """
    decomposition = pca(n_components=n_components)
    decomposition.fit_transform(X)
    # The plotting calls are made for their side effect only; the returned
    # figure/axes handles are not kept.
    decomposition.plot()
    decomposition.scatter()
    decomposition.biplot(n_feat=n_feat)
def read_data(X_csv, y_csv, drop_columns=('startSample', 'endSample', 'channel'), scale=True, so_nan_avoid=False):
    """Load features and labels from CSV, drop rows with NaN, optionally scale.

    Parameters
    ----------
    X_csv : anything ``pandas.read_csv`` accepts
        Feature table, one column per feature.
    y_csv : anything ``pandas.read_csv`` accepts
        Label table; its values are flattened to a 1-D array.
    drop_columns : str or iterable of str
        Column name(s) removed from the feature table before use.  The
        default is an immutable tuple so it cannot be altered between calls.
    scale : bool
        When True, a ``MinMaxScaler`` is fitted on the retained rows and
        applied to them.
    so_nan_avoid : bool
        When True, only the first 14 feature columns (hard-coded) are
        inspected for NaN; otherwise every column is inspected.

    Returns
    -------
    X : numpy.ndarray
        Retained (and optionally scaled) feature rows.
    y : numpy.ndarray
        Labels for the retained rows.
    features : numpy.ndarray
        Names of the feature columns that remain after dropping.
    scaler : MinMaxScaler or None
        The fitted scaler, or None when ``scale`` is False.
    """
    # A bare string is one column label, not an iterable of characters.
    if isinstance(drop_columns, str):
        to_drop = [drop_columns]
    else:
        to_drop = list(drop_columns)

    data = pd.read_csv(X_csv).drop(columns=to_drop)
    features = np.array(data.keys())
    y = pd.read_csv(y_csv).to_numpy().ravel()
    X = data.to_numpy()

    # A row mean is NaN as soon as one inspected entry of that row is NaN.
    inspected = X[:, :14] if so_nan_avoid else X
    keep = ~np.isnan(inspected.mean(1))
    X = X[keep, :]
    y = y[keep]

    scaler = None
    if scale:
        scaler = MinMaxScaler().fit(X)
        X = scaler.transform(X)
    return X, y, features, scaler
def plot_confusion_matrix(cm, classes, ax,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print the confusion matrix *cm* and draw it as an annotated image on *ax*.

    Parameters
    ----------
    cm : 2-D array
        Confusion matrix to display; rows are labelled as true classes and
        columns as predicted classes.
    classes : sequence
        Tick labels, used for both axes.
    ax : matplotlib axes
        Axes to draw on.
    normalize : bool
        Selects the cell number format only: ``'.2f'`` when True, ``'d'``
        when False.  The matrix itself is NOT normalised here; pass an
        already normalised matrix together with ``normalize=True``.
    title : str
        Axes title.
    cmap : matplotlib colormap
        Colormap for the image.
    """
    print(cm)
    print('')

    ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.set_title(title)

    # Set the ticks on `ax` itself.  The pyplot-level calls used previously
    # acted on the *current* axes, and the x ticks were set before `ax` was
    # made current, so they could land on a different subplot.
    tick_marks = np.arange(len(classes))
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(classes, rotation=-45)
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(classes)
    # Still leave `ax` as pyplot's current axes, as before, for callers that
    # continue with pyplot-level calls.
    plt.sca(ax)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        ax.text(j, i, format(cm[i, j], fmt),
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")

    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
def plot_feature_corr(X, features, stem='train', save=False):
    """Draw a heatmap of the strong pairwise correlations between the columns of *X*.

    Parameters
    ----------
    X : 2-D array-like
        Data matrix; correlations are computed between its columns.
    features : sequence of str
        Tick labels for the heatmap, one per column of *X*.
    stem : str
        Filename prefix; the figure is saved as ``<stem>_corr_plot.png``.
    save : bool
        When True, write the figure to disk.

    Notes
    -----
    Self-correlations (the diagonal) and every coefficient with absolute
    value <= 0.5 are displayed as 0 so that only strong relations stand out.
    """
    corr = pd.DataFrame(X).corr()
    # Blank the diagonal by position.  Testing `corr == 1` is a float equality
    # that may miss a diagonal entry and would also hide off-diagonal pairs
    # that really are perfectly correlated.
    corr = corr.mask(np.eye(len(corr), dtype=bool), 0.0)
    # Suppress weak correlations; NaN entries fail the comparison and are kept.
    corr = corr.mask(corr.abs() <= 0.5, 0.0)

    fig, ax = plt.subplots(figsize=(25, 25))
    hmap = sns.heatmap(corr, ax=ax, vmin=-1, vmax=1,
                       cmap=sns.diverging_palette(0, 230, 90, 60, as_cmap=True),
                       annot=True, xticklabels=features, yticklabels=features)
    hmap.set_xticklabels(features, rotation=30)
    hmap.set_yticklabels(features, rotation=0)
    if save:
        fig.savefig(stem + '_corr_plot.png')
def plot_pca(X, save=False):
    """Plot individual and cumulative explained-variance ratios of *X*.

    Parameters
    ----------
    X : 2-D numpy array
        Data matrix; the covariance is taken between its columns.
    save : bool
        When True, write the figure to ``pca_plot.png``.

    The ratios are the eigenvalues of the covariance matrix of *X*, sorted in
    descending order and divided by their sum.
    """
    # atleast_2d: np.cov returns a 0-d array when X has a single column.
    cov = np.atleast_2d(np.cov(X.T))
    # A covariance matrix is symmetric, so use the symmetric solver: it
    # returns real eigenvalues in ascending order (the general `eig` may
    # return complex values from round-off, in no particular order).
    eigen_values = np.linalg.eigvalsh(cov)[::-1]
    var_exp = eigen_values / eigen_values.sum()
    cum_var_exp = np.cumsum(var_exp)

    component_index = range(X.shape[1])
    fig, ax = plt.subplots(figsize=(25, 25))
    ax.bar(component_index, var_exp, alpha=0.5, align='center',
           label='Individual variance explained')
    ax.step(component_index, cum_var_exp, where='mid',
            label='Cumulative variance explained')
    ax.legend(loc='best')
    ax.axis('tight')
    ax.set_xlabel('n_components')
    ax.set_ylabel('explained variance ratio')
    if save:
        fig.savefig('pca_plot.png')
def _unit_max(scores):
    """Scale *scores* so the largest entry is 1; return them unchanged if the peak is not positive."""
    peak = np.max(scores)
    return scores / peak if peak > 0 else scores


def _importance_bar_figure(features, scores, title, filename, save, yerr=None):
    """Draw *scores* as a descending bar chart, optionally save it, and return the sort order."""
    order = np.argsort(scores)[::-1]
    fig, ax = plt.subplots(figsize=(25, 25))
    ax.set_title(title)
    ax.bar(x=features[order], height=scores[order],
           yerr=None if yerr is None else yerr[order],
           color="r", align="center")
    ax.set_xticklabels(features[order], rotation=45)
    ax.set_xlabel('Features')
    ax.set_ylabel('Importance')
    if save:
        fig.savefig(filename)
    return order


def plot_feature_importance(X, y, features, save=False):
    """Rank features by three importance measures and plot each ranking.

    Four bar charts are produced, each sorted from most to least important:

    1. impurity-based importances of an ``ExtraTreesClassifier``
       (1000 trees, ``random_state=0``), with the per-tree standard
       deviation as error bars;
    2. ``f_classif`` F-scores, scaled to a maximum of 1;
    3. ``mutual_info_classif`` scores, scaled to a maximum of 1;
    4. the sum of the three max-scaled measures, itself scaled to a
       maximum of 1.

    Parameters
    ----------
    X : 2-D array
        Feature matrix.
    y : 1-D array
        Class labels.
    features : array-like of str
        Feature names, one per column of *X*.
    save : bool
        When True, each figure is written to its own PNG file in the
        current directory.

    Returns
    -------
    numpy.ndarray
        Feature indices sorted by descending combined importance.
    """
    # Fancy indexing below needs an ndarray, not a plain list.
    features = np.asarray(features)

    forest = ExtraTreesClassifier(n_estimators=1000, random_state=0)
    forest.fit(X, y)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    _importance_bar_figure(features, importances,
                           "Feature importance - Trees (Non-Parametric)",
                           'iportance-trees.png', save, yerr=std)

    f_test, _ = f_classif(X, y)
    f_test = _unit_max(f_test)
    mi = _unit_max(mutual_info_classif(X, y))

    _importance_bar_figure(features, f_test,
                           "Feature importance - Linear",
                           'iportance-linear.png', save)
    _importance_bar_figure(features, mi,
                           "Feature importance - Mutual Information (Non-Linear)",
                           'iportance-mi.png', save)

    # Put all three measures on the same 0..1 scale before summing them.
    comb_imp = _unit_max(_unit_max(importances) + mi + f_test)
    return _importance_bar_figure(features, comb_imp,
                                  "Feature importance - Combined",
                                  'iportance-ci.png', save)
def get_class_weights(y):
    """Return ``{class_label: weight}`` using scikit-learn's 'balanced' weighting.

    Parameters
    ----------
    y : array-like
        Class labels, one per sample; reshaped to a 1-D array of length
        ``y.shape[0]``.

    Returns
    -------
    dict
        Maps each distinct label in *y* to its weight as a Python float.
    """
    # Aliased so the module is not shadowed by a local variable.
    from sklearn.utils import class_weight as sk_class_weight

    labels = np.asarray(y)
    labels = labels.reshape(labels.shape[0])
    classes = np.unique(labels)
    # `classes` and `y` are keyword-only in current scikit-learn releases;
    # passing them positionally raises TypeError there.
    weights = sk_class_weight.compute_class_weight(class_weight='balanced',
                                                   classes=classes,
                                                   y=labels)
    return {cls: float(weight) for cls, weight in zip(classes, weights)}
def roc_analysis_full(Y_test, Y_scores, classes=(1, 2, 3, 4)):
    """Plot one-vs-rest ROC curves with micro- and macro-averages.

    Parameters
    ----------
    Y_test : array-like
        True class labels, one per sample.
    Y_scores : 2-D array-like
        Per-class scores; column ``i`` is used as the score for
        ``classes[i]``.
    classes : sequence
        Class labels, in the column order of *Y_scores*.

    A figure is created and shown containing one ROC curve per class, the
    micro-average (all class/sample pairs pooled) and the macro-average
    (mean of the per-class curves), each labelled with its AUC.
    """
    # Adapted from
    # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
    classes = list(classes)
    n_classes = len(classes)
    Y_scores = np.asarray(Y_scores)

    # One indicator column per class.
    Y_test = label_binarize(Y_test, classes=classes)
    if n_classes == 2 and Y_test.shape[1] == 1:
        # label_binarize collapses the two-class case to a single column for
        # classes[1]; rebuild the classes[0] column so per-class indexing works.
        Y_test = np.hstack([1 - Y_test, Y_test])

    # Per-class ROC curve and area.
    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(Y_test[:, i], Y_scores[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro-average: pool every (sample, class) decision.
    fpr["micro"], tpr["micro"], _ = roc_curve(Y_test.ravel(), Y_scores.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro-average: interpolate each class curve on the union of all
    # false-positive rates, then average.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves.
    lw = 2
    plt.figure(figsize=(20, 15))
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.4f})'
                   ''.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)
    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.4f})'
                   ''.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)

    colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green', 'red', 'blue', 'black'])
    for (i, class_label), color in zip(enumerate(classes), colors):
        # Label with the real class value; `i + 1` was only right for 1..n.
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label='ROC curve of class {0} (area = {1:0.4f})'
                       ''.format(class_label, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--', lw=2)  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Extension of Receiver operating characteristic to multi-class')
    plt.legend(loc="lower right")
    plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment