Last active
August 29, 2015 14:13
-
-
Save albahnsen/0da0022e176b3a2179aa to your computer and use it in GitHub Desktop.
Analysis of probability calibration with BMR
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'al' | |
import numpy as np | |
import pandas as pd | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.cross_validation import train_test_split | |
from costcla.datasets import load_bankmarketing | |
from costcla.metrics import savings_score | |
from costcla.sampling import undersampling | |
from costcla.probcal import ROCConvexHull | |
# Load the bank-marketing dataset and hold out 33% of it for testing.
# The cost matrix is passed to train_test_split together with the features
# and labels so that all three are split with the same row assignment.
# NOTE(review): train_test_split is imported from sklearn.cross_validation at
# the top of this file; that module was removed in scikit-learn 0.20 in favour
# of sklearn.model_selection -- confirm which version this script targets.
data = load_bankmarketing()
sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets

# Under-sample the training split (per=0.5 is presumably the target positive
# rate; the ~0.4956 rate recorded below is consistent with that -- verify
# against costcla.sampling.undersampling).
X_u, y_u = undersampling(X_train, y_train, per=0.5)

# Positive-class rate before (prior1) and after (per1) under-sampling.
# Both values are reused by the base-rate calibration later in the script.
prior1 = y_train.mean()
per1 = y_u.mean()

# Single-argument print(...) behaves identically under Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print('Positive rates')
print('Train ' + str(prior1))
print('Train_u ' + str(per1))
# Positive rates
# Train 0.127139652934
# Train_u 0.495628163829
# Fit a 500-tree random forest on the under-sampled training data, then score
# the held-out test set.
f = RandomForestClassifier(random_state=0, n_estimators=500).fit(X_u, y_u)
# Column 1 of predict_proba is taken as the positive-class probability
# (assumes the labels are {0, 1} -- TODO confirm).
RF_u_prob = f.predict_proba(X_test)[:, 1]  # With test dataset!!!
RF_u = f.predict(X_test)

# Calibration base rates (Elkan2001)
# Re-maps probabilities estimated at the under-sampled positive rate (per1)
# to the original training positive rate (prior1):
#   p' = prior1 * (p - p * per1) / (per1 - per1 * p + prior1 * p - prior1 * per1)
# The "* 1.0" keeps the division a true division under Python 2.
RF_u_calbr_prob = prior1 * (
    (RF_u_prob - RF_u_prob * per1) * 1.0
    / (per1 - per1 * RF_u_prob + prior1 * RF_u_prob - prior1 * per1)
)
# Builtin float replaces np.float: the alias was deprecated in NumPy 1.20 and
# removed in 1.24; it always referred to the builtin, so the dtype is the same.
RF_u_calbr = (RF_u_calbr_prob > 0.5).astype(float)
# Calibration ROCCH
# NOTE(review): the calibrator is fitted on the test labels and the test-set
# scores, i.e. on the same examples it is evaluated on afterwards. That makes
# the ROCCH rows of the results optimistic; confirm this is intentional before
# comparing them with the other models.
cal = ROCConvexHull()
cal.fit(y_test, RF_u_prob)
RF_u_calROCCH_prob = cal.predict_proba(RF_u_prob)
# Builtin float replaces np.float (alias removed in NumPy 1.24; same dtype).
RF_u_calROCCH = (RF_u_calROCCH_prob > 0.5).astype(float)
# BMR
# For the marketing dataset (Similar to fraud)
# p_i > Ca / Amt_i
# p_i > 1 / cost_mat_test[i, 1]
# Per-example decision threshold: predict positive when the estimated
# probability exceeds 1 / cost_mat_test[i, 1].
# NOTE(review): presumably column 1 of the cost matrix is the per-example
# false-negative cost and Ca == 1 for this dataset -- verify against costcla.
# The 1.0 numerator forces true division even if the cost matrix is
# integer-typed under Python 2; the threshold is computed once and shared.
bmr_threshold = 1.0 / cost_mat_test[:, 1]
# Builtin float replaces np.float (alias removed in NumPy 1.24; same dtype).
RF_u_BMR = (RF_u_prob > bmr_threshold).astype(float)
RF_u_calbr_BMR = (RF_u_calbr_prob > bmr_threshold).astype(float)
RF_u_calROCCH_BMR = (RF_u_calROCCH_prob > bmr_threshold).astype(float)
# Results | |
def results(y, c, cost_mat):
    """Return ``[precision, recall, f1score, savings]`` for a set of predictions.

    Parameters
    ----------
    y : array
        True labels. The counts below assume 0/1 values in a 1-D NumPy array.
    c : array
        Predicted labels, same length and encoding as ``y``.
    cost_mat : array
        Cost matrix, passed through unchanged to ``savings_score``.

    Returns
    -------
    list
        ``[precision, recall, f1score, sav]`` where ``sav`` is the value
        returned by ``savings_score(y, c, cost_mat)``.

    Precision, recall and F1 are reported as 0.0 when their denominator is
    zero (no predicted positives, no actual positives, or both precision and
    recall equal to zero) instead of raising ``ZeroDivisionError``.
    """
    # Confusion-matrix counts, converted to Python floats so the ratios below
    # use true division on Python 2 as well.
    tp = float((c * y).sum())      # predicted 1 and actually 1
    fp = float(c[y == 0].sum())    # predicted 1 where the true label is 0
    fn = float(y[c == 0].sum())    # actually 1 where the prediction is 0

    predicted_pos = tp + fp
    actual_pos = tp + fn
    precision = tp / predicted_pos if predicted_pos > 0 else 0.0
    recall = tp / actual_pos if actual_pos > 0 else 0.0

    pr_sum = precision + recall
    f1score = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0.0

    sav = savings_score(y, c, cost_mat)
    return [precision, recall, f1score, sav]
# Row labels for the results table and, in the same order, the prediction
# vector each label refers to. Listing the arrays explicitly avoids looking
# variables up by name through locals().
models = ['RF_u', 'RF_u_BMR', 'RF_u_calbr', 'RF_u_calbr_BMR', 'RF_u_calROCCH', 'RF_u_calROCCH_BMR']
predictions = [RF_u, RF_u_BMR, RF_u_calbr, RF_u_calbr_BMR, RF_u_calROCCH, RF_u_calROCCH_BMR]

# One row per model: precision, recall, F1 and cost savings on the test set.
results_all = pd.DataFrame(index=models, columns=['pre', 'rec', 'F1Score', 'Sav'])
for model, pred in zip(models, predictions):
    results_all.loc[model] = results(y_test, pred, cost_mat_test)

# Single-argument print(...) works on both Python 2 and Python 3.
print(results_all)
#                          pre        rec    F1Score         Sav
# RF_u               0.2081507  0.6105398  0.3104575  0.07118456
# RF_u_BMR           0.1614743   0.746144  0.2654928   0.3686771
# RF_u_calbr         0.4537608  0.2365039  0.3109421  -0.2655911
# RF_u_calbr_BMR     0.2124968  0.5289203  0.3031866   0.4551066
# RF_u_calROCCH      0.6007605  0.1015424  0.1737218  -0.4874421
# RF_u_calROCCH_BMR  0.1987611  0.4742931  0.2801291   0.4829515
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment