# albahnsen/test_calibration_FScore.py

## test_calibration_FScore.py
__author__ = 'al'

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split

from costcla.datasets import load_bankmarketing
from costcla.metrics import savings_score
from costcla.sampling import undersampling
from costcla.probcal import ROCConvexHull


# Load the bank-marketing data set and hold out 33% of it for evaluation.
# The cost matrix is split alongside the features and labels so that every
# test example keeps its own costs.
data = load_bankmarketing()
sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets

# Undersample the training set (per=0.5); see the recorded rates below.
X_u, y_u = undersampling(X_train, y_train, per=0.5)

prior1 = y_train.mean()  # positive rate of the full training set
per1 = y_u.mean()        # positive rate of the undersampled training set
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('Positive rates')
print('Train   ' + str(prior1))
print('Train_u ' + str(per1))
# Positive rates
# Train   0.127139652934
# Train_u 0.495628163829

# Random forest trained on the undersampled data, scored on the test set.
f = RandomForestClassifier(random_state=0, n_estimators=500).fit(X_u, y_u)
RF_u_prob = f.predict_proba(X_test)[:, 1]  # With test dataset!!!
RF_u = f.predict(X_test)

# Calibration base rates (Elkan2001)
# Rescales probabilities learned at base rate per1 to the base rate prior1.
# The evaluation order of the original one-line expression is kept.
calbr_num = RF_u_prob - RF_u_prob * per1
calbr_den = per1 - per1 * RF_u_prob + prior1 * RF_u_prob - prior1 * per1
RF_u_calbr_prob = prior1 * (calbr_num / calbr_den)
# Builtin float is the dtype the removed ``np.float`` alias pointed to.
RF_u_calbr = (RF_u_calbr_prob > 0.5).astype(float)

# Calibration ROCCH
# NOTE(review): the calibrator is fit on the test labels and then applied to
# the same test scores, so the ROCCH rows are presumably optimistic -- confirm
# this is intended before comparing them with the other models.
cal = ROCConvexHull()
cal.fit(y_test, RF_u_prob)
RF_u_calROCCH_prob = cal.predict_proba(RF_u_prob)
RF_u_calROCCH = (RF_u_calROCCH_prob > 0.5).astype(float)

# BMR
# For the marketing dataset (Similar to fraud)
# p_i > Ca / Amt_i
# p_i > 1 / cost_mat_test[i, 1]

# Per-example threshold, computed once. ``1.0`` forces true division even
# if the cost matrix has an integer dtype under Python 2.
bmr_threshold = 1.0 / cost_mat_test[:, 1]
RF_u_BMR = (RF_u_prob > bmr_threshold).astype(float)
RF_u_calbr_BMR = (RF_u_calbr_prob > bmr_threshold).astype(float)
RF_u_calROCCH_BMR = (RF_u_calROCCH_prob > bmr_threshold).astype(float)

# Results
def results(y, c, cost_mat):
    """Score binary predictions: precision, recall, F1 score and savings.

    Parameters
    ----------
    y : array of true labels, 1 for a positive example and 0 for a negative.
    c : array of predicted labels, same coding and length as ``y``.
    cost_mat : cost matrix, forwarded unchanged to ``savings_score``.

    Returns
    -------
    list
        ``[precision, recall, f1score, sav]``.  Any ratio whose denominator
        is zero (no predicted positives, no actual positives, or
        precision + recall == 0) is reported as 0.0 instead of raising
        ``ZeroDivisionError``.
    """
    y_arr = np.asarray(y)
    c_arr = np.asarray(c)

    # Confusion-matrix counts; true negatives are not needed by any metric.
    tp = float((c_arr * y_arr).sum())
    fp = float(c_arr[y_arr == 0].sum())
    fn = float(y_arr[c_arr == 0].sum())

    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        f1score = 2 * (precision * recall) / (precision + recall)
    else:
        f1score = 0.0

    # The caller's original objects are passed on, not the converted copies.
    sav = savings_score(y, c, cost_mat)
    return [precision, recall, f1score, sav]

# Hard (0/1) predictions to compare, listed in the row order of the report.
# Pairing each label with its array explicitly avoids looking variables up
# by name through locals().
predictions = [
    ('RF_u', RF_u),
    ('RF_u_BMR', RF_u_BMR),
    ('RF_u_calbr', RF_u_calbr),
    ('RF_u_calbr_BMR', RF_u_calbr_BMR),
    ('RF_u_calROCCH', RF_u_calROCCH),
    ('RF_u_calROCCH_BMR', RF_u_calROCCH_BMR),
]
models = [name for name, _ in predictions]
results_all = pd.DataFrame(index=models, columns=['pre', 'rec', 'F1Score', 'Sav'])
for model, y_pred in predictions:
    results_all.loc[model] = results(y_test, y_pred, cost_mat_test)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(results_all)
#                       pre        rec    F1Score         Sav
# RF_u               0.2081507  0.6105398  0.3104575  0.07118456
# RF_u_BMR           0.1614743   0.746144  0.2654928   0.3686771
# RF_u_calbr         0.4537608  0.2365039  0.3109421  -0.2655911
# RF_u_calbr_BMR     0.2124968  0.5289203  0.3031866   0.4551066
# RF_u_calROCCH      0.6007605  0.1015424  0.1737218  -0.4874421
# RF_u_calROCCH_BMR  0.1987611  0.4742931  0.2801291   0.4829515
	__author__ = 'al'

	import numpy as np
	import pandas as pd
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.cross_validation import train_test_split

	from costcla.datasets import load_bankmarketing
	from costcla.metrics import savings_score
	from costcla.sampling import undersampling
	from costcla.probcal import ROCConvexHull


	# Load the bank-marketing data set and hold out 33% of it for evaluation.
	# The cost matrix is split alongside the features and labels so that every
	# test example keeps its own costs.
	data = load_bankmarketing()
	sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
	X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets

	# Undersample the training set (per=0.5); see the recorded rates below.
	X_u, y_u = undersampling(X_train, y_train, per=0.5)

	prior1 = y_train.mean()  # positive rate of the full training set
	per1 = y_u.mean()        # positive rate of the undersampled training set
	# Single-argument print(...) behaves the same on Python 2 and Python 3.
	print('Positive rates')
	print('Train ' + str(prior1))
	print('Train_u ' + str(per1))
	# Positive rates
	# Train 0.127139652934
	# Train_u 0.495628163829

	# Random forest trained on the undersampled data, scored on the test set.
	f = RandomForestClassifier(random_state=0, n_estimators=500).fit(X_u, y_u)
	RF_u_prob = f.predict_proba(X_test)[:, 1]  # With test dataset!!!
	RF_u = f.predict(X_test)

	# Calibration base rates (Elkan2001)
	# Rescales probabilities learned at base rate per1 to the base rate prior1.
	# The multiplication operators had been stripped from this expression;
	# they are restored here.
	calbr_num = RF_u_prob - RF_u_prob * per1
	calbr_den = per1 - per1 * RF_u_prob + prior1 * RF_u_prob - prior1 * per1
	RF_u_calbr_prob = prior1 * (calbr_num / calbr_den)
	# Builtin float is the dtype the removed ``np.float`` alias pointed to.
	RF_u_calbr = (RF_u_calbr_prob > 0.5).astype(float)

	# Calibration ROCCH
	# NOTE(review): the calibrator is fit on the test labels and then applied to
	# the same test scores, so the ROCCH rows are presumably optimistic -- confirm
	# this is intended before comparing them with the other models.
	cal = ROCConvexHull()
	cal.fit(y_test, RF_u_prob)
	RF_u_calROCCH_prob = cal.predict_proba(RF_u_prob)
	RF_u_calROCCH = (RF_u_calROCCH_prob > 0.5).astype(float)

	# BMR
	# For the marketing dataset (Similar to fraud)
	# p_i > Ca / Amt_i
	# p_i > 1 / cost_mat_test[i, 1]

	# Per-example threshold, computed once. ``1.0`` forces true division even
	# if the cost matrix has an integer dtype under Python 2.
	bmr_threshold = 1.0 / cost_mat_test[:, 1]
	RF_u_BMR = (RF_u_prob > bmr_threshold).astype(float)
	RF_u_calbr_BMR = (RF_u_calbr_prob > bmr_threshold).astype(float)
	RF_u_calROCCH_BMR = (RF_u_calROCCH_prob > bmr_threshold).astype(float)

	# Results
	def results(y, c, cost_mat):
		"""Score binary predictions: precision, recall, F1 score and savings.

		Parameters
		----------
		y : array of true labels, 1 for a positive example and 0 for a negative.
		c : array of predicted labels, same coding and length as ``y``.
		cost_mat : cost matrix, forwarded unchanged to ``savings_score``.

		Returns
		-------
		list
			``[precision, recall, f1score, sav]``.  Any ratio whose denominator
			is zero (no predicted positives, no actual positives, or
			precision + recall == 0) is reported as 0.0 instead of raising
			``ZeroDivisionError``.
		"""
		y_arr = np.asarray(y)
		c_arr = np.asarray(c)

		# Confusion-matrix counts; true negatives are not needed by any metric.
		tp = float((c_arr * y_arr).sum())
		fp = float(c_arr[y_arr == 0].sum())
		fn = float(y_arr[c_arr == 0].sum())

		precision = tp / (tp + fp) if (tp + fp) else 0.0
		recall = tp / (tp + fn) if (tp + fn) else 0.0
		if precision + recall:
			f1score = 2 * (precision * recall) / (precision + recall)
		else:
			f1score = 0.0

		# The caller's original objects are passed on, not the converted copies.
		sav = savings_score(y, c, cost_mat)
		return [precision, recall, f1score, sav]

	# Hard (0/1) predictions to compare, listed in the row order of the report.
	# Pairing each label with its array explicitly avoids looking variables up
	# by name through locals().
	predictions = [
		('RF_u', RF_u),
		('RF_u_BMR', RF_u_BMR),
		('RF_u_calbr', RF_u_calbr),
		('RF_u_calbr_BMR', RF_u_calbr_BMR),
		('RF_u_calROCCH', RF_u_calROCCH),
		('RF_u_calROCCH_BMR', RF_u_calROCCH_BMR),
	]
	models = [name for name, _ in predictions]
	results_all = pd.DataFrame(index=models, columns=['pre', 'rec', 'F1Score', 'Sav'])
	# The loop body must be indented under the ``for`` statement.
	for model, y_pred in predictions:
		results_all.loc[model] = results(y_test, y_pred, cost_mat_test)

	# Single-argument print(...) behaves the same on Python 2 and Python 3.
	print(results_all)
	#                       pre        rec    F1Score         Sav
	# RF_u               0.2081507  0.6105398  0.3104575  0.07118456
	# RF_u_BMR           0.1614743   0.746144  0.2654928   0.3686771
	# RF_u_calbr         0.4537608  0.2365039  0.3109421  -0.2655911
	# RF_u_calbr_BMR     0.2124968  0.5289203  0.3031866   0.4551066
	# RF_u_calROCCH      0.6007605  0.1015424  0.1737218  -0.4874421
	# RF_u_calROCCH_BMR  0.1987611  0.4742931  0.2801291   0.4829515