Skip to content

Instantly share code, notes, and snippets.

@cwharland
Created April 27, 2015 03:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cwharland/7af4009baf55d59201c6 to your computer and use it in GitHub Desktop.
Save cwharland/7af4009baf55d59201c6 to your computer and use it in GitHub Desktop.
Create Fake Confusion Matrix Arrays
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
# Class labels shared by the rows and the columns of the matrix.
labels = ['N', 'L', 'R', 'A', 'P', 'V']

# Six-class confusion matrix of counts: the row axis is named 'Actual'
# and the column axis 'Predicted'.
df = pd.DataFrame(
    [
        [1971, 19, 1, 8, 0, 1],
        [16, 1940, 2, 23, 9, 10],
        [8, 3, 181, 87, 0, 11],
        [2, 25, 159, 1786, 16, 12],
        [0, 24, 4, 8, 1958, 6],
        [11, 12, 29, 11, 11, 1926],
    ],
    index=pd.Index(labels, name='Actual'),
    columns=pd.Index(labels, name='Predicted'),
)
def create_arrays(df):
    """Expand a confusion-matrix DataFrame into per-sample label lists.

    A cell holding the count ``n`` contributes ``n`` (actual, predicted)
    pairs, so the two returned lists describe one sample per position.

    df: DataFrame of integer counts. Rows are read as the actual labels
        and columns as the predicted labels. Axis names are not required;
        only when the index is named 'Predicted' and the columns are named
        'Actual' is the orientation swapped.

    Returns (y_true, y_predicted): two equal-length lists, ordered column
    by column and, inside a column, row by row.
    """
    # Honour explicit axis names when the matrix is stored transposed.
    transposed = df.index.name == 'Predicted' and df.columns.name == 'Actual'
    counts = df.values
    y_true = []
    y_predicted = []
    for col_pos, col_label in enumerate(df.columns):
        for row_pos, row_label in enumerate(df.index):
            n = counts[row_pos, col_pos]
            if transposed:
                actual, predicted = col_label, row_label
            else:
                actual, predicted = row_label, col_label
            # List repetition raises TypeError for non-integer counts and
            # adds nothing for counts <= 0.
            y_true.extend([actual] * n)
            y_predicted.extend([predicted] * n)
    return y_true, y_predicted
# Recreate the original confusion matrix and check for equality
y_t, y_p = create_arrays(df)
# Pass the label order explicitly: the matrix then comes back arranged like
# `df` and covers every class in `labels`, even one that never occurs among
# the actual values (labelling with np.unique(y_t) would miss such a class
# and no longer match the shape of the matrix).
conf_mat = confusion_matrix(y_t, y_p, labels=labels)
df_new = pd.DataFrame(conf_mat, columns=labels, index=labels)
df_new.index.name = 'Actual'
df_new.columns.name = 'Predicted'
# Element-wise comparison: every cell is True when the round trip worked.
df == df_new
# And for the binary
labels = ['False', 'True']

# Two-class confusion matrix of counts: the row axis is named 'Actual'
# and the column axis 'Predicted'.
df = pd.DataFrame(
    [[5, 3], [2, 7]],
    index=pd.Index(labels, name='Actual'),
    columns=pd.Index(labels, name='Predicted'),
)
# Recreate the original confusion matrix and check for equality
y_t, y_p = create_arrays(df)
# Pass the label order explicitly: the matrix then comes back arranged like
# `df` and covers every class in `labels`, even one that never occurs among
# the actual values (labelling with np.unique(y_t) would miss such a class
# and no longer match the shape of the matrix).
conf_mat = confusion_matrix(y_t, y_p, labels=labels)
df_new = pd.DataFrame(conf_mat, columns=labels, index=labels)
df_new.index.name = 'Actual'
df_new.columns.name = 'Predicted'
# Element-wise comparison: every cell is True when the round trip worked.
df == df_new
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment