-
-
Save rlangone/71d93b68d38a89c6722f414fc96f4792 to your computer and use it in GitHub Desktop.
Code for post "Entropy in machine learning" on https://amethix.com/blog/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import libraries | |
import pandas as pd | |
import numpy as np | |
from scipy.stats import iqr | |
from numpy import histogram2d | |
from sklearn.metrics import mutual_info_score | |
# Load the UCI breast-cancer (Coimbra) detection dataset straight from the archive
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00451/dataR2.csv"
df = pd.read_csv(DATA_URL)

# Split the class label off from the feature columns:
# `pop` returns the 'Classification' series and removes it from `df` in place.
target = df.pop('Classification')
# Define mutual information function
def minfo(x, y):
    """Estimate the mutual information (in nats) between two 1-D samples.

    The joint distribution of ``x`` and ``y`` is approximated by a 2-D
    histogram whose per-axis bin count follows the Freedman-Diaconis rule:
    bin width h = 2*IQR*n**(-1/3), hence bin count = data range / h.

    Parameters
    ----------
    x, y : array-like
        Numeric samples of equal length.

    Returns
    -------
    float
        Non-negative mutual-information estimate of the binned joint
        distribution, as computed by sklearn's ``mutual_info_score``.
    """
    def _fd_bin_count(v):
        # BUG FIX: Freedman-Diaconis gives the bin WIDTH h = 2*IQR*n**(-1/3);
        # the bin COUNT is range/h. The original code used the width
        # expression itself as the count, mis-binning the histogram.
        v = np.asarray(v, dtype=float)
        h = 2 * iqr(v) * len(v) ** (-1 / 3)
        if h <= 0:
            # Degenerate sample (zero IQR): mirror the original's floor of 2 bins.
            return 2
        return max(2, int(np.ceil((v.max() - v.min()) / h)))

    # Contingency table of the binned joint sample
    c_xy = histogram2d(x, y, [_fd_bin_count(x), _fd_bin_count(y)])[0]
    # MI (nats) of the empirical joint distribution in the contingency table
    return mutual_info_score(None, None, contingency=c_xy)
# Build the pairwise mutual-information matrix over all feature columns
num_features = df.shape[1]
cols = df.columns
MI_matrix = np.array(
    [[minfo(df[a], df[b]) for b in cols] for a in cols]
)
MI_df = pd.DataFrame(MI_matrix, index=cols, columns=cols)
print(MI_df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment