# Imports
import pandas as pd
import numpy as np
from scipy.stats import entropy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from matplotlib import pyplot as plt
import seaborn as sns
# Read the dataset into a Pandas dataframe - edit the path to where you downloaded it
df = pd.read_csv('diabetes/diabetes_012_health_indicators_BRFSS2015.csv')
# Check counts by class - it's very imbalanced, but that won't matter for our purpose
df['Diabetes_012'].value_counts()
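# Added aside (not in the original gist): normalize=True gives class
# proportions instead of raw counts, which makes the imbalance easier
# to read at a glance.
df['Diabetes_012'].value_counts(normalize=True)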
# Initialise base classifier - we aren't aiming for a good model, so we won't tune it
clf = RandomForestClassifier()
# Separate train and test sets, fit model
X_train, X_test, y_train, y_test = train_test_split(df.loc[:, df.columns != 'Diabetes_012'], df['Diabetes_012'])
clf.fit(X_train, y_train)
# Check classification quality - you'll see that the minority class performs poorly
print(classification_report(y_test, clf.predict(X_test)))
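# Optional added illustration (a sketch, not part of the original gist):
# a confusion matrix shows directly where the minority class gets
# misclassified, complementing the per-class report above.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, clf.predict(X_test)))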
# Store the probability of the predicted class (the max one) and the entropy
# of the probability vector for each instance in the test set
max_prob, s = [], []
for item in clf.predict_proba(X_test):
    max_prob.append(max(item))
    s.append(entropy(item))
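# Equivalent vectorized alternative (a sketch): predict_proba returns an
# (n_samples, n_classes) array, so row-wise max and entropy avoid the loop.
# probs = clf.predict_proba(X_test)
# max_prob = probs.max(axis=1)
# s = entropy(probs.T)  # scipy's entropy reduces along axis 0 by default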
# Plot the histogram of the max probabilities
sns.set_style('darkgrid')
sns.histplot(max_prob)
plt.xlabel('Prob of classified class')
plt.savefig('hist_probs.jpg')
# ... and the one for the entropy (open a new figure so the two plots don't overlap)
plt.figure()
sns.histplot(s)
plt.xlabel('Entropy of classification probs')
plt.savefig('hist_entropy.jpg')
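# A possible follow-up (a sketch, not part of the original gist): treat the
# entropy as an uncertainty score and flag the test instances the model is
# least sure about. The 0.5 threshold is an arbitrary illustrative choice.
s = np.asarray(s)
uncertain = X_test[s > 0.5]
print(f'{len(uncertain)} of {len(X_test)} test instances exceed the entropy threshold')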