Skip to content

Instantly share code, notes, and snippets.

@walterreade
Last active May 16, 2016 14:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save walterreade/246314a9ff6196826bee8875f777490e to your computer and use it in GitHub Desktop.
Save walterreade/246314a9ff6196826bee8875f777490e to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""
Regularized Tree Ensemble
@author: gert.jacobusse@rogatio.nl
@license: FreeBSD
Originally posted:
https://www.kaggle.com/c/bnp-paribas-cardif-claims-management/forums/t/20207/why-every-good-script-is-using-extratreeclassifier-one-way-or-the-other/115621
"""
import numpy as np
from joblib import Parallel, delayed
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
class colsampler():
    """Pre-draw random column subsets and serve matching slices of a 2-D array.

    At construction time `nsample` column subsets are drawn; each of the
    `ncol` columns is included in a subset independently with probability
    `psample`.  After `setdata` has been called, `getsample` and
    `getsample_bynum` return the data restricted to one of those subsets.
    """
    # List of `nsample` lists of selected column indices (set in __init__).
    colsamples = None
    # 2-D numpy array attached via setdata(); None until then.
    data = None

    def __init__(self, ncol, psample=0.5, nsample=1000, randseed=0):
        """Draw the column subsets.

        ncol     -- number of columns in the data that will be attached.
        psample  -- probability with which each column enters a subset.
        nsample  -- number of subsets to draw.
        randseed -- seed passed to np.random.seed before drawing.
        """
        # NOTE: this reseeds NumPy's *global* random state, which also affects
        # any later np.random call made by the caller.  Kept as-is because
        # existing callers may rely on the resulting reproducibility.
        np.random.seed(randseed)
        self.ncol = ncol
        self.nsample = nsample
        # `range` instead of the Python-2-only `xrange`, so the class works on
        # both Python 2 and Python 3.  Draw order (all columns of subset 0,
        # then subset 1, ...) is unchanged, so a given seed yields the same
        # subsets as before.
        self.colsamples = [
            [c for c in range(ncol) if np.random.random() < psample]
            for s in range(nsample)
        ]

    def setdata(self, data):
        """Attach the 2-D numpy array whose columns will be sampled.

        Raises AssertionError if `data` is not a numpy array, is not 2-D, or
        does not have exactly `ncol` columns.
        """
        assert isinstance(data, np.ndarray), "data must be a numpy array"
        assert len(data.shape) == 2, "data must be 2-dimensional"
        assert self.ncol == data.shape[1], "data must have ncol columns"
        self.data = data

    def _rows(self, num):
        """Return the rows of the attached data restricted to subset `num`."""
        # One vectorised column selection instead of fancy-indexing every row
        # in a Python loop; list() keeps the original return type (a list of
        # 1-D row arrays).
        return list(self.data[:, self.colsamples[num]])

    def getsample(self, fromsample=0):
        """Yield the column-sampled data for subsets fromsample..nsample-1.

        This is a generator, so the check that data has been attached only
        runs when the first item is requested.
        """
        assert isinstance(self.data, np.ndarray), "call setdata() first"
        for s in range(fromsample, self.nsample):
            yield self._rows(s)

    def getsample_bynum(self, num):
        """Return the column-sampled data for subset number `num`."""
        assert self.data is not None, "call setdata() first"
        return self._rows(num)
def fitmodel(model,cs,i,y):
    """Fit `model` on column sample number `i` of `cs` against `y`.

    `cs` must provide getsample_bynum(i); the same (now fitted) `model`
    object is returned so the function can be used as a parallel job.
    """
    features = cs.getsample_bynum(i)
    model.fit(features, y)
    return model
def getholdoutpreds(model,cs,i,y):
    """Return two-fold out-of-fold predictions of `model` on column sample `i`.

    The rows are split positionally into a first and a second half (no
    shuffling is done here).  The model is fitted on the second half to
    predict the first, then refitted on the first half to predict the second.

    model -- estimator exposing fit(x, y) and predict(x).
    cs    -- object exposing getsample_bynum(i) that returns the feature rows.
    i     -- index of the column sample to use.
    y     -- targets, one per row; must support slicing.

    Returns a float numpy array of length len(y) with the held-out
    predictions.  `model` is left fitted on the first half of the data.
    """
    n = len(y)
    # Floor division: plain `/` yields a float on Python 3, which cannot be
    # used as a slice index.  `//` gives the same integer on Python 2 and 3.
    halfn = n // 2
    x = cs.getsample_bynum(i)
    holdoutpreds = np.zeros(n)
    # Train on the second half, predict the first half.
    model.fit(x[halfn:], y[halfn:])
    holdoutpreds[:halfn] = model.predict(x[:halfn])
    # Train on the first half, predict the second half.
    model.fit(x[:halfn], y[:halfn])
    holdoutpreds[halfn:] = model.predict(x[halfn:])
    return holdoutpreds
class rteClassifier():
    """Regularized tree ensemble classifier.

    Each of `n_estimators` decision trees is trained on its own random subset
    of the columns.  A stacking model is then fitted on the trees' held-out
    predictions (see getholdoutpreds) and is used to combine the trees'
    predictions at prediction time.
    """

    def __init__(self,n_estimators=100,colsample_bytree=0.5,min_samples_split=1,
                 splitter='random',
                 stackingmodel=None,
                 samplingseed=0,
                 n_jobs=-1):
        """Configure the ensemble.

        n_estimators      -- number of trees / column subsets.
        colsample_bytree  -- probability with which each column is given to a tree.
        min_samples_split -- passed to every DecisionTreeClassifier.
        splitter          -- passed to every DecisionTreeClassifier.
        stackingmodel     -- estimator with fit / predict_proba used to combine
                             the trees; None (default) creates a fresh
                             L1-penalised LogisticRegression(C=0.1).
        samplingseed      -- seed used to draw the column subsets.
        n_jobs            -- passed to joblib.Parallel.
        """
        if stackingmodel is None:
            # Build the default per instance.  A LogisticRegression instance
            # in the signature would be created once and then shared (and
            # refitted) by every rteClassifier that uses the default.
            # solver='liblinear' is spelled out because the L1 penalty needs a
            # solver that supports it; newer scikit-learn releases no longer
            # default to one.
            stackingmodel = LogisticRegression(C=0.1, penalty='l1',
                                               solver='liblinear')
        if min_samples_split == 1 and not isinstance(min_samples_split, float):
            # A node holding a single sample can never be split, so an integer
            # threshold of 1 is equivalent to 2.  Newer scikit-learn releases
            # reject an integer 1, so pass the equivalent accepted value.
            min_samples_split = 2
        self.n_estimators = n_estimators
        self.colsample_bytree = colsample_bytree
        self.n_jobs = n_jobs
        self.models = [
            DecisionTreeClassifier(
                splitter=splitter,
                max_features=None,
                min_samples_split=min_samples_split,
            )
            for m in range(self.n_estimators)
        ]
        self.stackingmodel = stackingmodel
        self.samplingseed = samplingseed

    def _make_sampler(self, X):
        """Return a colsampler for 2-D array `X`, built from this model's settings.

        Using the same seed, column count and sampling rate in fit and in
        predict_proba makes both see the same column subsets.
        """
        sampler = colsampler(X.shape[1], psample=self.colsample_bytree,
                             nsample=self.n_estimators,
                             randseed=self.samplingseed)
        sampler.setdata(X)
        return sampler

    def fit(self,X,y):
        """Fit the stacking model and all trees on (X, y); returns self.

        The rows are shuffled first because the held-out predictions are
        produced from a positional first-half / second-half split.
        """
        # The permutation is drawn before the sampler is constructed, exactly
        # as in the original statement order.
        perm = np.random.permutation(len(X))
        X = np.array(X)[perm]
        y = np.array(y)[perm]
        local_cs = self._make_sampler(X)
        holdoutpreds = Parallel(n_jobs=self.n_jobs)(
            delayed(getholdoutpreds)(self.models[i], local_cs, i, y)
            for i in range(self.n_estimators))
        # holdoutpreds holds one array per tree; transpose so that each row is
        # a training sample and each column a tree.
        self.stackingmodel.fit(np.transpose(holdoutpreds), y)
        # Refit every tree on all rows for use at prediction time.
        self.models = Parallel(n_jobs=self.n_jobs)(
            delayed(fitmodel)(self.models[i], local_cs, i, y)
            for i in range(self.n_estimators))
        return self

    def predict_proba(self,X):
        """Return the stacking model's predict_proba output for the rows of X."""
        # Accept any array-like, as fit does; the sampler requires an ndarray.
        X = np.asarray(X)
        local_cs = self._make_sampler(X)
        preds = [model.predict(sample)
                 for model, sample in zip(self.models, local_cs.getsample())]
        # One row per input sample, one column per tree.
        return self.stackingmodel.predict_proba(np.transpose(preds))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment