# martinapugliese/classification_entropy_demonstration.py

## classification_entropy_demonstration.py
# Imports
import pandas as pd
import numpy as np
from scipy.stats import entropy

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from matplotlib import pyplot as plt
import seaborn as sns

# Demonstration script: fit an untuned random forest on the diabetes health
# indicators dataset, then look at how confident its predictions are by
# plotting (a) the probability assigned to the predicted class and (b) the
# entropy of the predicted class-probability distribution, per test instance.

# Path to the dataset - edit it to where you downloaded the CSV
DATA_PATH = 'diabetes/diabetes_012_health_indicators_BRFSS2015.csv'
# Name of the target (class label) column
TARGET = 'Diabetes_012'


def _save_histogram(values, xlabel, path):
    """Draw a histogram of *values* on its own figure and save it to *path*.

    A fresh figure is created for every call: drawing on the implicit
    "current" axes would stack successive histograms on top of each other,
    so a later image would also contain the earlier plot.
    """
    # The style must be set before the axes are created to take effect.
    sns.set_style('darkgrid')
    fig, ax = plt.subplots()
    sns.histplot(values, ax=ax)
    ax.set_xlabel(xlabel)
    fig.savefig(path)
    # Release the figure so it neither leaks memory nor stays "current".
    plt.close(fig)


# Read the dataset into a Pandas dataframe
df = pd.read_csv(DATA_PATH)

# Check counts by class - it's very imbalanced but won't matter for our purpose.
# Printed explicitly: a bare expression shows nothing when run as a script.
print(df.groupby(TARGET).size())

# Initialise base classifier - we don't aim at getting a good model so we won't tune it
clf = RandomForestClassifier()

# Separate train and test sets, fit model
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, df.columns != TARGET],
    df[TARGET],
)
clf.fit(X_train, y_train)

# Check classification quality - you'll see that the minority class performs awfully
print(classification_report(y_test, clf.predict(X_test)))

# Store the probability of the classified class (the max one) and the entropy
# of the class probabilities for each instance in the test set. Both are
# computed row-wise on the (n_samples, n_classes) array from predict_proba.
class_probs = clf.predict_proba(X_test)
max_prob = class_probs.max(axis=1)
s = entropy(class_probs, axis=1)

# Plot the histogram of the max probs
_save_histogram(max_prob, 'Prob of classified class', 'hist_probs.jpg')

# and the one of entropy
_save_histogram(s, 'Entropy of classification probs', 'hist_entropy.jpg')