Skip to content

Instantly share code, notes, and snippets.

@alfard
Created July 1, 2015 15:48
Show Gist options
  • Save alfard/224dcc0868425d365892 to your computer and use it in GitHub Desktop.
Save alfard/224dcc0868425d365892 to your computer and use it in GitHub Desktop.
Facebook-Gradient Boosting-CV
import pandas as pd
import numpy as np
from sklearn import ensemble, feature_extraction, preprocessing
# Load the two pickled tables that make up the training set.
# B presumably carries the `outcome` label keyed by `bidder_id`
# (the column is read from the merged frame further down) -- TODO confirm.
A = pd.read_pickle('/home/alfard/Documents/Kaggle/Facebook-Robot/A.pk')
B = pd.read_pickle('/home/alfard/Documents/Kaggle/Facebook-Robot/B.pk')

# Inner join keeps only bidders present in both tables; any missing
# values left after the join are replaced with 0.
A = A.merge(B, how='inner', on='bidder_id').fillna(0)
#Forest test 1
import numpy as np
import csv
import random
from sklearn.cross_validation import train_test_split
from sklearn import ensemble, feature_extraction, preprocessing
from sklearn.metrics import roc_auc_score
# Target vector: the `outcome` column of the merged table as a NumPy array.
Y = A['outcome'].values

# Feature matrix: every column except the label and the `bidder_id` key,
# converted to a plain NumPy array.
X = np.array(A.drop(['outcome', 'bidder_id'], axis=1))
######################################################################################
# Build the matrix to predict on (original French note: "retrieve outcome").
# A.pk is read again from disk so that C is the table *before* the inner
# join with B; missing values are replaced with 0 as for the training data.
C = pd.read_pickle('/home/alfard/Documents/Kaggle/Facebook-Robot/A.pk').fillna(0)

# Keep the bidder ids aside, then strip the key so only feature columns
# remain in the NumPy matrix.
Id = C['bidder_id'].values
C = np.array(C.drop('bidder_id', axis=1))
######################################################################################
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn import cross_validation
# Repeated hold-out evaluation and bagging of a calibrated gradient-boosting
# model.
#
# Each of the `n` rounds draws a fresh 80/20 split of (X, Y), fits a
# sigmoid-calibrated GradientBoostingClassifier (cv=10 inside the
# calibrator) on the 80% part, scores it by ROC AUC on the held-out 20%,
# and predicts the class-1 probability for every row of C.  The per-round
# predictions on C are averaged into GB_E and written to disk.
#
# NOTE: `sklearn.cross_validation` was removed in scikit-learn 0.20; on
# newer releases `train_test_split` lives in `sklearn.model_selection`.
SEED = 898
params = {
    'n_estimators': 500,
    'max_depth': 6,
    'learning_rate': 0.001,
    'max_features': 'sqrt',
}
# (min_samples_split=40 was left disabled by the author.)
print(params)
clf0 = ensemble.GradientBoostingClassifier(**params)

n = 100  # number of random hold-out rounds
mean_auc = 0.0
# One class-1 probability vector over C per round.  Collected in a list and
# stacked once after the loop, instead of re-copying a growing matrix that
# starts from an uninitialised placeholder column.
round_probas = []
for i in range(n):
    # For each round, randomly hold out 20% of the data as the CV set.
    # The split seed is i * SEED, so the splits are reproducible.
    X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
        X, Y, test_size=.20, random_state=i * SEED)

    # Feature selection / hyperparameter optimisation would go here.
    clf = CalibratedClassifierCV(clf0, method="sigmoid", cv=10)

    # Train the model and predict on the held-out part.
    clf.fit(X_train, y_train)
    preds = clf.predict_proba(X_cv)[:, 1]

    # Same output as the Python 2 statement `print "model ",i`.
    print("model  %s" % i)

    # This round's contribution to the ensemble over C.
    Proba = clf.predict_proba(C)[:, 1]
    round_probas.append(Proba)

    auc = roc_auc_score(y_cv, preds)
    print(auc)
    mean_auc += auc

print("Mean ROC AUC: %f" % (mean_auc / n))

# Z has one row per row of C and one column per round.
Z = np.column_stack(round_probas)
print(Z.shape)

# Ensemble prediction: average the per-round probabilities for each row.
GB_E = np.mean(Z, axis=1)
np.savez('/home/alfard/Documents/Kaggle/Facebook-Robot/GB_E.npz', GB_E)
# The array is stored positionally, so it reloads as
# np.load('/home/alfard/Documents/Kaggle/Facebook-Robot/GB_E.npz')['arr_0'].
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment