Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Benchmark R's gbm module via rpy2
"""
Benchmark R's gbm package from Python via rpy2.

NOTE: before running, make sure R's shared library is on the loader path, e.g.

    $ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64/R/lib
"""
import numpy as np
import rpy2
from time import time
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.utils import check_random_state
from rpy2.robjects.numpy2ri import numpy2ri
from rpy2.robjects.packages import importr
import pylab as pl
gbm = importr('gbm')
def repeat(f):
    """Decorate a benchmark so it is run over several seeds and summarised.

    The wrapped function is called 10 times with ``random_state`` set to
    0, 1, ..., 9 (passed as a keyword argument, so callers must not pass
    ``random_state`` themselves). All other positional and keyword
    arguments are forwarded unchanged.

    Returns
    -------
    wrapper : callable
        Function returning ``(mean, std)`` of the collected scores, both
        computed along axis 0 with numpy.
    """
    from functools import wraps

    n_repeats = 10

    # wraps() keeps the benchmark's own __name__/__doc__ on the wrapper
    # instead of every decorated function showing up as "wrapper".
    @wraps(f)
    def wrapper(*args, **kargs):
        scores = np.array([f(*args, random_state=seed, **kargs)
                           for seed in range(n_repeats)])
        return scores.mean(axis=0), scores.std(axis=0)

    return wrapper
# ignore overflows due to exp
#np.seterr(invalid='print', under='print', divide='print', over='ignore')
# Keyword arguments forwarded to gbm.gbm_fit by the classification
# benchmarks; "n.tree" is also reused as the tree count at prediction time.
# NOTE(review): gbm documents the argument as ``n.trees``; ``n.tree``
# presumably resolves through R's partial argument matching -- confirm.
classification_params = {
    "distribution": "bernoulli",
    "shrinkage": 1.0,
    "n.tree": 500,
    "bag.fraction": 0.5,
    "verbose": False,
    "n.minobsinnode": 1,
    "interaction.depth": 1,
}
@repeat
def bench_random_gaussian(random_state=None):
    """Error rate of gbm on a synthetic Gaussian classification task.

    Draws 12000 samples with 10 standard-normal features; a sample is
    labelled 1.0 when the sum of its squared features exceeds 9.34 and 0.0
    otherwise. The first 2000 samples train the model, the remaining ones
    are used for testing.

    Parameters
    ----------
    random_state : int, RandomState instance or None
        Seed handed to ``check_random_state``; supplied by ``@repeat``.

    Returns
    -------
    Fraction of test samples whose thresholded prediction differs from the
    label (for a single seed; ``@repeat`` aggregates over seeds).
    """
    n_train = 2000
    dims = (12000, 10)

    rng = check_random_state(random_state)
    features = rng.normal(size=dims).reshape(dims)
    labels = ((features ** 2.0).sum(axis=1) > 9.34).astype(np.float64)

    # Convert to R objects; the test labels stay in numpy for scoring.
    r_train_features = numpy2ri(features[:n_train])
    r_test_features = numpy2ri(features[n_train:])
    r_train_labels = numpy2ri(labels[:n_train])
    test_labels = labels[n_train:]

    fitted = gbm.gbm_fit(r_train_features, r_train_labels,
                         **classification_params)
    raw_scores = gbm.predict_gbm(
        fitted, r_test_features,
        **{"n.tree": classification_params["n.tree"]})

    # Scores >= 0.0 are mapped to class 1.0 (presumably the scores are on
    # the link / log-odds scale -- confirm against the gbm documentation).
    predicted = (np.array(raw_scores) >= 0.0).astype(np.float64)
    return np.mean(predicted != test_labels)
@repeat
def bench_spam(random_state=None):
    """Error rate of gbm on the spambase data set.

    Loads the comma-separated data file from a hard-coded local path, uses
    the last column as the label and the remaining columns as features,
    shuffles the rows, and holds out the first 1536 shuffled rows for
    testing. Labels equal to -1.0 are mapped to 0.

    Parameters
    ----------
    random_state : int, RandomState instance or None
        Seed for the shuffle; supplied by ``@repeat``.

    Returns
    -------
    Fraction of test samples whose thresholded prediction differs from the
    label (for a single seed; ``@repeat`` aggregates over seeds).
    """
    n_test = 1536

    data = np.loadtxt("/home/pprett/corpora/spam/spambase.data",
                      delimiter=",")
    y = data[:, -1].ravel()
    X = data[:, :-1]

    X, y = shuffle(X, y, random_state=random_state)
    X_test, y_test = X[:n_test], y[:n_test]
    X_train, y_train = X[n_test:], y[n_test:]

    # gbm's bernoulli loss is fit on 0/1 labels here, so remap any -1.0.
    y_train[y_train == -1.0] = 0
    y_test[y_test == -1.0] = 0

    X_train = numpy2ri(X_train)
    X_test = numpy2ri(X_test)
    y_train = numpy2ri(y_train)

    model = gbm.gbm_fit(X_train, y_train, **classification_params)
    pred = gbm.predict_gbm(model, X_test,
                           **{"n.tree": classification_params["n.tree"]})

    # Scores >= 0.0 are mapped to class 1.0 (presumably the scores are on
    # the link / log-odds scale -- confirm against the gbm documentation).
    pred = (np.array(pred) >= 0.0).astype(np.float64)
    error_rate = np.mean(pred != y_test)
    return error_rate
def bench_madelon():
    """Accuracy of gbm on the Madelon train/validation split.

    Reads features and labels from hard-coded local paths, maps -1 labels
    to 0, fits gbm with ``classification_params`` on the training set and
    scores the validation set.

    Returns
    -------
    Fraction of validation samples whose thresholded prediction equals the
    label (an accuracy, unlike the error rates returned by the seeded
    classification benchmarks in this file).
    """
    train_features = np.loadtxt(
        "/home/pprett/corpora/madelon/madelon_train.data")
    train_labels = np.loadtxt(
        "/home/pprett/corpora/madelon/madelon_train.labels")
    valid_features = np.loadtxt(
        "/home/pprett/corpora/madelon/madelon_valid.data")
    valid_labels = np.loadtxt(
        "/home/pprett/corpora/madelon/madelon_valid.labels")

    # Recode the negative class from -1 to 0.
    train_labels = np.where(train_labels == -1, 0.0, train_labels)
    valid_labels = np.where(valid_labels == -1, 0.0, valid_labels)

    r_train_features = numpy2ri(train_features)
    r_valid_features = numpy2ri(valid_features)
    r_train_labels = numpy2ri(train_labels)

    fitted = gbm.gbm_fit(r_train_features, r_train_labels,
                         **classification_params)
    raw_scores = gbm.predict_gbm(
        fitted, r_valid_features,
        **{"n.tree": classification_params["n.tree"]})

    # Scores >= 0.0 are mapped to class 1.0.
    predicted = (np.array(raw_scores) >= 0.0).astype(np.float64)
    return np.mean(predicted == valid_labels)
def bench_arcene():
    """Accuracy of gbm on the Arcene train/validation split.

    Reads features and labels from hard-coded local paths, maps -1.0
    labels to 0, fits gbm with ``classification_params`` on the training
    set and scores the validation set.

    Returns
    -------
    Fraction of validation samples whose thresholded prediction equals the
    label (an accuracy, unlike the error rates returned by the seeded
    classification benchmarks in this file).
    """
    train_features = np.loadtxt(
        "/home/pprett/corpora/arcene/arcene_train.data")
    train_labels = np.loadtxt(
        "/home/pprett/corpora/arcene/arcene_train.labels")
    valid_features = np.loadtxt(
        "/home/pprett/corpora/arcene/arcene_valid.data")
    valid_labels = np.loadtxt(
        "/home/pprett/corpora/arcene/arcene_valid.labels")

    # Recode the negative class from -1.0 to 0.
    train_labels = np.where(train_labels == -1.0, 0.0, train_labels)
    valid_labels = np.where(valid_labels == -1.0, 0.0, valid_labels)

    r_train_features = numpy2ri(train_features)
    r_valid_features = numpy2ri(valid_features)
    r_train_labels = numpy2ri(train_labels)

    fitted = gbm.gbm_fit(r_train_features, r_train_labels,
                         **classification_params)
    raw_scores = gbm.predict_gbm(
        fitted, r_valid_features,
        **{"n.tree": classification_params["n.tree"]})

    # Scores >= 0.0 are mapped to class 1.0.
    predicted = (np.array(raw_scores) >= 0.0).astype(np.float64)
    return np.mean(predicted == valid_labels)
# Keyword arguments forwarded to gbm.gbm_fit by the regression benchmarks;
# "n.tree" is also reused as the tree count at prediction time.
# NOTE(review): gbm documents the argument as ``n.trees``; ``n.tree``
# presumably resolves through R's partial argument matching -- confirm.
regression_params = {
    "distribution": "gaussian",
    "shrinkage": 0.1,
    "n.tree": 100,
    "bag.fraction": 1.0,
    "verbose": False,
    "n.minobsinnode": 1,
    "interaction.depth": 4,
}
@repeat
def bench_boston(random_state=None):
    """Test mean squared error of gbm on the Boston housing data.

    Shuffles the data set returned by ``datasets.load_boston()`` and uses
    the first 90% of the rows for training and the rest for testing.

    Parameters
    ----------
    random_state : int, RandomState instance or None
        Seed for the shuffle; supplied by ``@repeat``.

    Returns
    -------
    Mean squared difference between predictions and test targets (for a
    single seed; ``@repeat`` aggregates over seeds).
    """
    dataset = datasets.load_boston()
    features, targets = shuffle(dataset.data, dataset.target,
                                random_state=random_state)

    # 90/10 train/test split on the shuffled rows.
    n_train = int(features.shape[0] * 0.9)
    test_targets = targets[n_train:]

    r_train_features = numpy2ri(features[:n_train])
    r_test_features = numpy2ri(features[n_train:])
    r_train_targets = numpy2ri(targets[:n_train])

    fitted = gbm.gbm_fit(r_train_features, r_train_targets,
                         **regression_params)
    raw_predictions = gbm.predict_gbm(
        fitted, r_test_features,
        **{"n.tree": regression_params["n.tree"]})

    residuals = np.array(raw_predictions, dtype=np.float64) - test_targets
    return np.mean(residuals ** 2.0)
@repeat
def bench_friedman1(random_state=None):
    """Test mean squared error of gbm on the Friedman #1 problem.

    Generates 1200 samples with ``datasets.make_friedman1`` (noise=1.0);
    the first 200 train the model and the remaining ones are used for
    testing.

    Parameters
    ----------
    random_state : int, RandomState instance or None
        Seed for the data generator; supplied by ``@repeat``.

    Returns
    -------
    Mean squared difference between predictions and test targets (for a
    single seed; ``@repeat`` aggregates over seeds).
    """
    n_train = 200
    features, targets = datasets.make_friedman1(
        n_samples=1200, random_state=random_state, noise=1.0)
    test_targets = targets[n_train:]

    r_train_features = numpy2ri(features[:n_train])
    r_test_features = numpy2ri(features[n_train:])
    r_train_targets = numpy2ri(targets[:n_train])

    fitted = gbm.gbm_fit(r_train_features, r_train_targets,
                         **regression_params)
    raw_predictions = gbm.predict_gbm(
        fitted, r_test_features,
        **{"n.tree": regression_params["n.tree"]})

    residuals = np.array(raw_predictions, dtype=np.float64) - test_targets
    return np.mean(residuals ** 2.0)
@repeat
def bench_friedman2(random_state=None):
    """Test mean squared error of gbm on the Friedman #2 problem.

    Generates 1200 samples with ``datasets.make_friedman2``; the first 200
    train the model and the remaining ones are used for testing.

    Parameters
    ----------
    random_state : int, RandomState instance or None
        Seed for the data generator; supplied by ``@repeat``.

    Returns
    -------
    Mean squared difference between predictions and test targets (for a
    single seed; ``@repeat`` aggregates over seeds).
    """
    n_train = 200
    features, targets = datasets.make_friedman2(
        n_samples=1200, random_state=random_state)
    test_targets = targets[n_train:]

    r_train_features = numpy2ri(features[:n_train])
    r_test_features = numpy2ri(features[n_train:])
    r_train_targets = numpy2ri(targets[:n_train])

    fitted = gbm.gbm_fit(r_train_features, r_train_targets,
                         **regression_params)
    raw_predictions = gbm.predict_gbm(
        fitted, r_test_features,
        **{"n.tree": regression_params["n.tree"]})

    residuals = np.array(raw_predictions, dtype=np.float64) - test_targets
    return np.mean(residuals ** 2.0)
@repeat
def bench_friedman3(random_state=None):
    """Test mean squared error of gbm on the Friedman #3 problem.

    Generates 1200 samples with ``datasets.make_friedman3``; the first 200
    train the model and the remaining ones are used for testing.

    Parameters
    ----------
    random_state : int, RandomState instance or None
        Seed for the data generator; supplied by ``@repeat``.

    Returns
    -------
    Mean squared difference between predictions and test targets (for a
    single seed; ``@repeat`` aggregates over seeds).
    """
    n_train = 200
    features, targets = datasets.make_friedman3(
        n_samples=1200, random_state=random_state)
    test_targets = targets[n_train:]

    r_train_features = numpy2ri(features[:n_train])
    r_test_features = numpy2ri(features[n_train:])
    r_train_targets = numpy2ri(targets[:n_train])

    fitted = gbm.gbm_fit(r_train_features, r_train_targets,
                         **regression_params)
    raw_predictions = gbm.predict_gbm(
        fitted, r_test_features,
        **{"n.tree": regression_params["n.tree"]})

    residuals = np.array(raw_predictions, dtype=np.float64) - test_targets
    return np.mean(residuals ** 2.0)
if __name__ == "__main__":
    # Run every benchmark in order and print "<label> <result>" per line.
    # Each benchmark is evaluated immediately before its line is printed,
    # so partial results appear as the run progresses.
    benchmarks = [
        ("Example 10.2", bench_random_gaussian),
        ("spam", bench_spam),
        ("Madelon", bench_madelon),
        ("Arcene", bench_arcene),
        ("Boston", bench_boston),
        ("Friedman#1", bench_friedman1),
        ("Friedman#2", bench_friedman2),
        ("Friedman#3", bench_friedman3),
    ]
    for label, bench in benchmarks:
        # A single pre-formatted argument prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("%s %s" % (label, bench()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment