Skip to content

Instantly share code, notes, and snippets.

@albahnsen
Last active August 29, 2015 14:13
Show Gist options
  • Save albahnsen/0da0022e176b3a2179aa to your computer and use it in GitHub Desktop.
Save albahnsen/0da0022e176b3a2179aa to your computer and use it in GitHub Desktop.
Analysis of probability calibration with BMR
__author__ = 'al'
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from costcla.datasets import load_bankmarketing
from costcla.metrics import savings_score
from costcla.sampling import undersampling
from costcla.probcal import ROCConvexHull
# Load the bank-marketing data and hold out 33% of it as a test set; the
# per-example cost matrix is split together with the features and labels.
data = load_bankmarketing()
sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets

# Under-sample the training set (per=0.5) so the classifier is fitted on a
# more balanced sample; see the recorded positive rates below.
X_u, y_u = undersampling(X_train, y_train, per=0.5)

# Positive-class rate before (prior1) and after (per1) under-sampling.
# Both are needed later to map probabilities back to the original base rate.
prior1 = y_train.mean()
per1 = y_u.mean()

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print('Positive rates')
print('Train ' + str(prior1))
print('Train_u ' + str(per1))
# Positive rates
# Train 0.127139652934
# Train_u 0.495628163829
# Random forest fitted on the under-sampled training data.
f = RandomForestClassifier(random_state=0, n_estimators=500).fit(X_u, y_u)
RF_u_prob = f.predict_proba(X_test)[:, 1]  # With test dataset!!!
RF_u = f.predict(X_test)

# Calibration base rates (Elkan2001)
# Re-scales probabilities learned at base rate per1 (under-sampled set) to the
# base rate prior1 of the full training set:
#   p' = prior1 * (p - p*per1) / (per1 - per1*p + prior1*p - prior1*per1)
calbr_numerator = RF_u_prob - RF_u_prob * per1
calbr_denominator = per1 - per1 * RF_u_prob + prior1 * RF_u_prob - prior1 * per1
RF_u_calbr_prob = prior1 * (calbr_numerator * 1.0 / calbr_denominator)
# Builtin float replaces np.float (same dtype; the alias was removed from NumPy).
RF_u_calbr = (RF_u_calbr_prob > 0.5).astype(float)

# Calibration ROCCH
# NOTE(review): the calibrator is fitted on y_test and the test-set
# probabilities, i.e. on the same data it is later evaluated on. Kept as in
# the original experiment; confirm whether a held-out calibration set was
# intended.
cal = ROCConvexHull()
cal.fit(y_test, RF_u_prob)
RF_u_calROCCH_prob = cal.predict_proba(RF_u_prob)
RF_u_calROCCH = (RF_u_calROCCH_prob > 0.5).astype(float)
# BMR
# For the marketing dataset (Similar to fraud)
# p_i > Ca / Amt_i
# p_i > 1 / cost_mat_test[i, 1]
# The per-example threshold is shared by all three probability variants, so it
# is computed once. The 1.0 literal forces true division even if the cost
# matrix is integer-typed under Python 2.
bmr_threshold = 1.0 / cost_mat_test[:, 1]
# Builtin float replaces np.float (same dtype; the alias was removed from NumPy).
RF_u_BMR = (RF_u_prob > bmr_threshold).astype(float)
RF_u_calbr_BMR = (RF_u_calbr_prob > bmr_threshold).astype(float)
RF_u_calROCCH_BMR = (RF_u_calROCCH_prob > bmr_threshold).astype(float)
# Results
def results(y, c, cost_mat):
    """Return precision, recall, F1 score and cost savings for predictions.

    Parameters
    ----------
    y : array
        True labels; the arithmetic below assumes 0/1 values.
    c : array
        Predicted labels, same length as ``y``; assumed 0/1 values.
    cost_mat : array
        Cost matrix, passed unchanged to ``savings_score``.

    Returns
    -------
    list
        ``[precision, recall, f1score, savings]``. Precision, recall and F1
        are reported as 0.0 when their denominator is zero (for example when
        no positives are predicted) instead of raising ZeroDivisionError.
    """
    # Confusion-matrix counts derived from the 0/1 vectors.
    tp = float((c * y).sum())
    fp = float((c[np.nonzero(y == 0)[0]]).sum())
    fn = float((y[np.nonzero(c == 0)[0]]).sum())

    # Guard every ratio: the counts are plain Python floats, so a zero
    # denominator would raise rather than yield nan.
    predicted_pos = tp + fp
    actual_pos = tp + fn
    precision = tp / predicted_pos if predicted_pos > 0 else 0.0
    recall = tp / actual_pos if actual_pos > 0 else 0.0
    pr_sum = precision + recall
    f1score = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0.0

    sav = savings_score(y, c, cost_mat)
    return [precision, recall, f1score, sav]
# Evaluate every decision rule on the test set and collect one row per model.
models = ['RF_u', 'RF_u_BMR', 'RF_u_calbr', 'RF_u_calbr_BMR', 'RF_u_calROCCH', 'RF_u_calROCCH_BMR']
results_all = pd.DataFrame(index=models, columns=['pre', 'rec', 'F1Score', 'Sav'])
# Explicit name -> prediction mapping instead of locals()[name], so the lookup
# does not depend on which scope this code happens to run in.
model_predictions = {
    'RF_u': RF_u,
    'RF_u_BMR': RF_u_BMR,
    'RF_u_calbr': RF_u_calbr,
    'RF_u_calbr_BMR': RF_u_calbr_BMR,
    'RF_u_calROCCH': RF_u_calROCCH,
    'RF_u_calROCCH_BMR': RF_u_calROCCH_BMR,
}
for model in models:
    results_all.loc[model] = results(y_test, model_predictions[model], cost_mat_test)
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(results_all)
# pre rec F1Score Sav
# RF_u 0.2081507 0.6105398 0.3104575 0.07118456
# RF_u_BMR 0.1614743 0.746144 0.2654928 0.3686771
# RF_u_calbr 0.4537608 0.2365039 0.3109421 -0.2655911
# RF_u_calbr_BMR 0.2124968 0.5289203 0.3031866 0.4551066
# RF_u_calROCCH 0.6007605 0.1015424 0.1737218 -0.4874421
# RF_u_calROCCH_BMR 0.1987611 0.4742931 0.2801291 0.4829515
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment