Skip to content

Instantly share code, notes, and snippets.

@rlangone
Created April 16, 2019 06:44
Show Gist options
  • Save rlangone/71d93b68d38a89c6722f414fc96f4792 to your computer and use it in GitHub Desktop.
Save rlangone/71d93b68d38a89c6722f414fc96f4792 to your computer and use it in GitHub Desktop.
Code for post "Entropy in machine learning" on https://amethix.com/blog/
# Import libraries
import pandas as pd
import numpy as np
from scipy.stats import iqr
from numpy import histogram2d
from sklearn.metrics import mutual_info_score
# Load the breast cancer detection dataset from the UCI repository
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00451/dataR2.csv"
df = pd.read_csv(DATA_URL)
# Split off the target column; pop() returns it and removes it from df in place,
# so df holds only the input features afterwards
target = df.pop('Classification')
# Define mutual information function
def _fd_bin_count(values):
    """Return the Freedman-Diaconis histogram bin count for ``values``.

    The rule prescribes a bin *width* of ``2 * IQR * n ** (-1/3)``; the bin
    count is the data range divided by that width, rounded up. Falls back to
    2 bins when the width or the range is not positive (e.g. constant data or
    zero IQR), and never returns fewer than 2.
    """
    values = np.asarray(values, dtype=float)
    width = 2 * iqr(values) * len(values) ** (-1 / 3)
    span = values.max() - values.min()
    # Written as a negated conjunction so NaN width/span also takes the fallback.
    if not (width > 0 and span > 0):
        return 2
    return max(2, int(np.ceil(span / width)))


def minfo(x, y):
    """Compute the mutual information between two numeric samples.

    Both samples are discretised with a 2-D histogram whose per-axis bin
    counts come from the Freedman-Diaconis rule, and the resulting count
    table is passed to ``mutual_info_score`` as a contingency matrix.

    Args:
        x: 1-D array-like of numeric values.
        y: 1-D array-like of numeric values, same length as ``x``.

    Returns:
        The value returned by ``mutual_info_score`` for the binned data.
    """
    bins_x = _fd_bin_count(x)
    bins_y = _fd_bin_count(y)
    # histogram2d returns (counts, x_edges, y_edges); only the counts are needed.
    c_xy = histogram2d(x, y, [bins_x, bins_y])[0]
    # Labels are passed as None because the contingency table is supplied directly.
    return mutual_info_score(None, None, contingency=c_xy)
# Build the pairwise mutual-information matrix over all feature columns
feature_names = df.columns
num_features = len(feature_names)
# Row-major order: the outer loop picks the row feature, the inner the column feature
pairwise_mi = [
    minfo(df[row_name], df[col_name])
    for row_name in feature_names
    for col_name in feature_names
]
MI_matrix = np.array(pairwise_mi, dtype=float).reshape(num_features, num_features)
# Label rows and columns with the feature names for readable output
MI_df = pd.DataFrame(MI_matrix, index=df.columns, columns=df.columns)
print(MI_df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment