# Benchmark R's gbm module via rpy2
"""
Benchmark script to bench R's gbm package via rpy2.

NOTE::

    Make sure you run

    $ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64/R/lib

    before starting this script.
"""
import functools
from time import time

import numpy as np
import pylab as pl
import rpy2
from rpy2.robjects.numpy2ri import numpy2ri
from rpy2.robjects.packages import importr
from sklearn import datasets
from sklearn.utils import check_random_state
from sklearn.utils import shuffle

gbm = importr('gbm')
def repeat(f):
    """Decorator that runs a benchmark 10 times and aggregates its scores.

    The decorated function is called once per seed with ``random_state`` set
    to ``0, 1, ..., 9``.  The seed is passed by keyword, so callers of the
    wrapper must not pass ``random_state`` themselves.  All other positional
    and keyword arguments are forwarded unchanged on every call.

    Parameters
    ----------
    f : callable
        Benchmark function accepting a ``random_state`` keyword argument and
        returning a score (scalar or array-like).

    Returns
    -------
    wrapper : callable
        Function returning the tuple ``(mean, std)`` of the 10 collected
        scores, both computed along axis 0.
    """
    # functools.wraps keeps the benchmark's __name__ and __doc__ on the
    # wrapper; without it every decorated benchmark is named "wrapper".
    @functools.wraps(f)
    def wrapper(*args, **kargs):
        scores = np.array([f(*args, random_state=seed, **kargs)
                           for seed in range(10)])
        return scores.mean(axis=0), scores.std(axis=0)
    return wrapper
# ignore overflows due to exp
# np.seterr(invalid='print', under='print', divide='print', over='ignore')
# Keyword arguments passed to gbm.gbm_fit by every classification benchmark.
# The dotted keys are R argument names, so they have to be supplied through
# ``**`` unpacking rather than as Python keyword arguments.
classification_params = {
    "distribution": "bernoulli",
    "shrinkage": 1.0,
    "n.tree": 500,
    "bag.fraction": 0.5,
    "verbose": False,
    "n.minobsinnode": 1,
    "interaction.depth": 1,
}
@repeat
def bench_random_gaussian(random_state=None):
    """Benchmark gbm classification on synthetic Gaussian data.

    Draws a 12000 x 10 matrix of standard-normal values; a sample is labelled
    1.0 when the sum of its squared features exceeds 9.34 and 0.0 otherwise.
    The first 2000 samples are used for training, the remaining ones for
    testing.

    Parameters
    ----------
    random_state : int, RandomState instance or None
        Seed handed to ``check_random_state`` for generating the data.

    Returns
    -------
    error_rate : float
        Fraction of test samples whose thresholded prediction differs from
        the true label.
    """
    n_train = 2000
    rng = check_random_state(random_state)
    features = rng.normal(size=(12000, 10))
    labels = (np.sum(features ** 2.0, axis=1) > 9.34).astype(np.float64)

    labels_test = labels[n_train:]
    r_features_train = numpy2ri(features[:n_train])
    r_features_test = numpy2ri(features[n_train:])
    r_labels_train = numpy2ri(labels[:n_train])

    fitted = gbm.gbm_fit(r_features_train, r_labels_train,
                         **classification_params)
    raw = gbm.predict_gbm(fitted, r_features_test,
                          **{"n.tree": classification_params["n.tree"]})

    # Predictions >= 0.0 are mapped to class 1.0, everything else to 0.0.
    predicted = (np.array(raw) >= 0.0).astype(np.float64)
    return np.mean(predicted != labels_test)
@repeat
def bench_spam(random_state=None):
    """Benchmark gbm classification on the spambase data set.

    Loads the comma-separated spambase data file (the last column is the
    label), shuffles the samples with ``random_state``, holds out the first
    1536 shuffled rows for testing and trains on the rest.

    Parameters
    ----------
    random_state : int, RandomState instance or None
        Seed passed to ``sklearn.utils.shuffle``.

    Returns
    -------
    error_rate : float
        Fraction of test samples whose thresholded prediction differs from
        the true label.
    """
    data = np.loadtxt("/home/pprett/corpora/spam/spambase.data",
                      delimiter=",")
    y = data[:, -1].ravel()
    X = data[:, :-1]

    # Read the names file inside a ``with`` block so the handle is always
    # closed; the original left it open.
    with open("/home/pprett/corpora/spam/spambase.names") as f:
        # NOTE(review): feature_names is not used further down; it is kept
        # so a missing names file still fails exactly as before.
        feature_names = np.array([line.split(":")[0] for line in f])

    X, y = shuffle(X, y, random_state=random_state)
    X_test, y_test = X[:1536], y[:1536]
    X_train, y_train = X[1536:], y[1536:]

    # Recode any -1 labels as 0 (classification_params selects the
    # "bernoulli" distribution).
    y_train[y_train == -1.0] = 0
    y_test[y_test == -1.0] = 0

    X_train = numpy2ri(X_train)
    X_test = numpy2ri(X_test)
    y_train = numpy2ri(y_train)

    model = gbm.gbm_fit(X_train, y_train, **classification_params)
    pred = gbm.predict_gbm(model, X_test,
                           **{"n.tree": classification_params["n.tree"]})

    # Predictions >= 0.0 are mapped to class 1.0, everything else to 0.0.
    pred = (np.array(pred) >= 0.0).astype(np.float64)
    error_rate = np.mean(pred != y_test)
    return error_rate
def bench_madelon():
    """Benchmark gbm classification on the Madelon train/validation split.

    Returns
    -------
    score : float
        Fraction of validation samples whose thresholded prediction equals
        the true label (an accuracy, unlike the error rates returned by the
        other classification benchmarks in this file).
    """
    train_data = np.loadtxt("/home/pprett/corpora/madelon/madelon_train.data")
    train_labels = np.loadtxt(
        "/home/pprett/corpora/madelon/madelon_train.labels")
    valid_data = np.loadtxt("/home/pprett/corpora/madelon/madelon_valid.data")
    valid_labels = np.loadtxt(
        "/home/pprett/corpora/madelon/madelon_valid.labels")

    # Recode -1 labels as 0 in place (classification_params selects the
    # "bernoulli" distribution).
    for labels in (train_labels, valid_labels):
        labels[labels == -1] = 0

    r_train_data = numpy2ri(train_data)
    r_valid_data = numpy2ri(valid_data)
    r_train_labels = numpy2ri(train_labels)

    fitted = gbm.gbm_fit(r_train_data, r_train_labels,
                         **classification_params)
    raw = gbm.predict_gbm(fitted, r_valid_data,
                          **{"n.tree": classification_params["n.tree"]})

    # Predictions >= 0.0 are mapped to class 1.0, everything else to 0.0.
    predicted = (np.array(raw) >= 0.0).astype(np.float64)
    return np.mean(predicted == valid_labels)
def bench_arcene():
    """Benchmark gbm classification on the Arcene train/validation split.

    Returns
    -------
    score : float
        Fraction of validation samples whose thresholded prediction equals
        the true label (an accuracy, unlike the error rates returned by the
        other classification benchmarks in this file).
    """
    train_data = np.loadtxt("/home/pprett/corpora/arcene/arcene_train.data")
    train_labels = np.loadtxt(
        "/home/pprett/corpora/arcene/arcene_train.labels")
    valid_data = np.loadtxt("/home/pprett/corpora/arcene/arcene_valid.data")
    valid_labels = np.loadtxt(
        "/home/pprett/corpora/arcene/arcene_valid.labels")

    # Recode -1 labels as 0 in place (classification_params selects the
    # "bernoulli" distribution).
    for labels in (train_labels, valid_labels):
        labels[labels == -1.0] = 0

    r_train_data = numpy2ri(train_data)
    r_valid_data = numpy2ri(valid_data)
    r_train_labels = numpy2ri(train_labels)

    fitted = gbm.gbm_fit(r_train_data, r_train_labels,
                         **classification_params)
    raw = gbm.predict_gbm(fitted, r_valid_data,
                          **{"n.tree": classification_params["n.tree"]})

    # Predictions >= 0.0 are mapped to class 1.0, everything else to 0.0.
    predicted = (np.array(raw) >= 0.0).astype(np.float64)
    return np.mean(predicted == valid_labels)
# Keyword arguments passed to gbm.gbm_fit by every regression benchmark.
# The dotted keys are R argument names, so they have to be supplied through
# ``**`` unpacking rather than as Python keyword arguments.
regression_params = {
    "distribution": "gaussian",
    "shrinkage": 0.1,
    "n.tree": 100,
    "bag.fraction": 1.0,
    "verbose": False,
    "n.minobsinnode": 1,
    "interaction.depth": 4,
}
@repeat
def bench_boston(random_state=None):
    """Benchmark gbm regression on the Boston housing data set.

    The samples are shuffled with ``random_state``; the first 90% are used
    for training and the remaining 10% for testing.

    Parameters
    ----------
    random_state : int, RandomState instance or None
        Seed passed to ``sklearn.utils.shuffle``.

    Returns
    -------
    mse : float
        Mean squared error of the predictions on the test part.
    """
    # NOTE(review): load_boston is not available in recent scikit-learn
    # releases -- confirm the installed version still provides it.
    dataset = datasets.load_boston()
    data, target = shuffle(dataset.data, dataset.target,
                           random_state=random_state)

    n_train = int(data.shape[0] * 0.9)
    target_test = target[n_train:]
    r_data_train = numpy2ri(data[:n_train])
    r_data_test = numpy2ri(data[n_train:])
    r_target_train = numpy2ri(target[:n_train])

    fitted = gbm.gbm_fit(r_data_train, r_target_train, **regression_params)
    raw = gbm.predict_gbm(fitted, r_data_test,
                          **{"n.tree": regression_params["n.tree"]})

    residuals = np.array(raw, dtype=np.float64) - target_test
    return np.mean(residuals ** 2.0)
@repeat
def bench_friedman1(random_state=None):
    """Benchmark gbm regression on the Friedman #1 synthetic problem.

    Generates 1200 samples with ``noise=1.0``; the first 200 are used for
    training and the remaining 1000 for testing.

    Parameters
    ----------
    random_state : int, RandomState instance or None
        Seed passed to ``datasets.make_friedman1``.

    Returns
    -------
    mse : float
        Mean squared error of the predictions on the test part.
    """
    n_train = 200
    data, target = datasets.make_friedman1(n_samples=1200,
                                           random_state=random_state,
                                           noise=1.0)

    target_test = target[n_train:]
    r_data_train = numpy2ri(data[:n_train])
    r_data_test = numpy2ri(data[n_train:])
    r_target_train = numpy2ri(target[:n_train])

    fitted = gbm.gbm_fit(r_data_train, r_target_train, **regression_params)
    raw = gbm.predict_gbm(fitted, r_data_test,
                          **{"n.tree": regression_params["n.tree"]})

    residuals = np.array(raw, dtype=np.float64) - target_test
    return np.mean(residuals ** 2.0)
@repeat
def bench_friedman2(random_state=None):
    """Benchmark gbm regression on the Friedman #2 synthetic problem.

    Generates 1200 samples; the first 200 are used for training and the
    remaining 1000 for testing.

    Parameters
    ----------
    random_state : int, RandomState instance or None
        Seed passed to ``datasets.make_friedman2``.

    Returns
    -------
    mse : float
        Mean squared error of the predictions on the test part.
    """
    n_train = 200
    data, target = datasets.make_friedman2(n_samples=1200,
                                           random_state=random_state)

    target_test = target[n_train:]
    r_data_train = numpy2ri(data[:n_train])
    r_data_test = numpy2ri(data[n_train:])
    r_target_train = numpy2ri(target[:n_train])

    fitted = gbm.gbm_fit(r_data_train, r_target_train, **regression_params)
    raw = gbm.predict_gbm(fitted, r_data_test,
                          **{"n.tree": regression_params["n.tree"]})

    residuals = np.array(raw, dtype=np.float64) - target_test
    return np.mean(residuals ** 2.0)
@repeat
def bench_friedman3(random_state=None):
    """Benchmark gbm regression on the Friedman #3 synthetic problem.

    Generates 1200 samples; the first 200 are used for training and the
    remaining 1000 for testing.

    Parameters
    ----------
    random_state : int, RandomState instance or None
        Seed passed to ``datasets.make_friedman3``.

    Returns
    -------
    mse : float
        Mean squared error of the predictions on the test part.
    """
    n_train = 200
    data, target = datasets.make_friedman3(n_samples=1200,
                                           random_state=random_state)

    target_test = target[n_train:]
    r_data_train = numpy2ri(data[:n_train])
    r_data_test = numpy2ri(data[n_train:])
    r_target_train = numpy2ri(target[:n_train])

    fitted = gbm.gbm_fit(r_data_train, r_target_train, **regression_params)
    raw = gbm.predict_gbm(fitted, r_data_test,
                          **{"n.tree": regression_params["n.tree"]})

    residuals = np.array(raw, dtype=np.float64) - target_test
    return np.mean(residuals ** 2.0)
if __name__ == "__main__":
    # Benchmarks are run lazily, one after the other, in this order; each
    # result is printed as soon as its benchmark has finished.
    benchmarks = [
        ("Example 10.2", bench_random_gaussian),
        ("spam", bench_spam),
        ("Madelon", bench_madelon),
        ("Arcene", bench_arcene),
        ("Boston", bench_boston),
        ("Friedman#1", bench_friedman1),
        ("Friedman#2", bench_friedman2),
        ("Friedman#3", bench_friedman3),
    ]
    for label, bench in benchmarks:
        # A single pre-formatted argument produces the same "<label> <result>"
        # line whether ``print`` is the Python 2 statement or the Python 3
        # function; the original print statements only parsed on Python 2.
        print("%s %s" % (label, bench()))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment