Benchmark R's gbm package via rpy2
"""
Benchmark script to bench R's gbm package via rpy2.
NOTE::
make sure you run
$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64/R/lib
"""
import numpy as np
import rpy2
from time import time
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.utils import check_random_state
from rpy2.robjects.numpy2ri import numpy2ri
from rpy2.robjects.packages import importr
import pylab as pl
gbm = importr('gbm')
def repeat(f):
    """Run ``f`` ten times with seeds 0..9; return (mean, std) of the scores."""
    def wrapper(*args, **kargs):
        scores = []
        for i in range(10):
            scores.append(f(*args, random_state=i, **kargs))
        scores = np.array(scores)
        return scores.mean(axis=0), scores.std(axis=0)
    return wrapper
# ignore overflows due to exp
#np.seterr(invalid='print', under='print', divide='print', over='ignore')
classification_params = {"distribution": "bernoulli", "shrinkage": 1.0,
                         "n.tree": 500, "bag.fraction": 0.5, "verbose": False,
                         "n.minobsinnode": 1, "interaction.depth": 1}
@repeat
def bench_random_gaussian(random_state=None):
    """Ten-dimensional Gaussian toy problem (Hastie et al., ESL, Example 10.2):
    y = 1 iff sum_j X_j^2 > 9.34, the median of chi^2_10."""
    rs = check_random_state(random_state)
    shape = (12000, 10)
    X = rs.normal(size=shape).reshape(shape)
    y = ((X ** 2.0).sum(axis=1) > 9.34).astype(np.float64)
    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]
    X_train = numpy2ri(X_train)
    X_test = numpy2ri(X_test)
    y_train = numpy2ri(y_train)
    model = gbm.gbm_fit(X_train, y_train, **classification_params)
    pred = gbm.predict_gbm(model, X_test,
                           **{"n.tree": classification_params["n.tree"]})
    # predictions are on the log-odds (link) scale -> threshold at 0.0
    pred = (np.array(pred) >= 0.0).astype(np.float64)
    error_rate = np.mean(pred != y_test)
    return error_rate
@repeat
def bench_spam(random_state=None):
    """UCI spambase dataset, loaded from local files; 1536 examples held out
    for testing, reports the error rate."""
    X = np.loadtxt("/home/pprett/corpora/spam/spambase.data", delimiter=",")
    y = X[:, -1].ravel()
    X = X[:, :-1]
    f = open("/home/pprett/corpora/spam/spambase.names")
    feature_names = np.array([l.split(":")[0] for l in f])
    f.close()
    X, y = shuffle(X, y, random_state=random_state)
    X_test, y_test = X[:1536], y[:1536]
    X_train, y_train = X[1536:], y[1536:]
    y_train[y_train == -1.0] = 0
    y_test[y_test == -1.0] = 0
    X_train = numpy2ri(X_train)
    X_test = numpy2ri(X_test)
    y_train = numpy2ri(y_train)
    model = gbm.gbm_fit(X_train, y_train, **classification_params)
    pred = gbm.predict_gbm(model, X_test,
                           **{"n.tree": classification_params["n.tree"]})
    pred = (np.array(pred) >= 0.0).astype(np.float64)
    error_rate = np.mean(pred != y_test)
    return error_rate
def bench_madelon():
    """Madelon benchmark (NIPS 2003 feature selection challenge), using the
    official train/validation split.  Note: returns accuracy, not error rate."""
    X_train = np.loadtxt("/home/pprett/corpora/madelon/madelon_train.data")
    y_train = np.loadtxt("/home/pprett/corpora/madelon/madelon_train.labels")
    X_test = np.loadtxt("/home/pprett/corpora/madelon/madelon_valid.data")
    y_test = np.loadtxt("/home/pprett/corpora/madelon/madelon_valid.labels")
    y_train[y_train == -1] = 0
    y_test[y_test == -1] = 0
    X_train = numpy2ri(X_train)
    X_test = numpy2ri(X_test)
    y_train = numpy2ri(y_train)
    model = gbm.gbm_fit(X_train, y_train, **classification_params)
    pred = gbm.predict_gbm(model, X_test,
                           **{"n.tree": classification_params["n.tree"]})
    pred = (np.array(pred) >= 0.0).astype(np.float64)
    score = np.mean(pred == y_test)
    return score
def bench_arcene():
    """Arcene benchmark (NIPS 2003 feature selection challenge), using the
    official train/validation split.  Note: returns accuracy, not error rate."""
    X_train = np.loadtxt("/home/pprett/corpora/arcene/arcene_train.data")
    y_train = np.loadtxt("/home/pprett/corpora/arcene/arcene_train.labels")
    X_test = np.loadtxt("/home/pprett/corpora/arcene/arcene_valid.data")
    y_test = np.loadtxt("/home/pprett/corpora/arcene/arcene_valid.labels")
    y_train[y_train == -1.0] = 0
    y_test[y_test == -1.0] = 0
    X_train = numpy2ri(X_train)
    X_test = numpy2ri(X_test)
    y_train = numpy2ri(y_train)
    model = gbm.gbm_fit(X_train, y_train, **classification_params)
    pred = gbm.predict_gbm(model, X_test,
                           **{"n.tree": classification_params["n.tree"]})
    pred = (np.array(pred) >= 0.0).astype(np.float64)
    score = np.mean(pred == y_test)
    return score
regression_params = {"distribution": "gaussian", "shrinkage": 0.1,
                     "n.tree": 100, "bag.fraction": 1.0, "verbose": False,
                     "n.minobsinnode": 1, "interaction.depth": 4}
@repeat
def bench_boston(random_state=None):
    """Boston housing regression; 90% train / 10% test split, reports MSE."""
    boston = datasets.load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=random_state)
    offset = int(X.shape[0] * 0.9)
    X_train = X[:offset]
    y_train = y[:offset]
    X_test = X[offset:]
    y_test = y[offset:]
    X_train = numpy2ri(X_train)
    X_test = numpy2ri(X_test)
    y_train = numpy2ri(y_train)
    model = gbm.gbm_fit(X_train, y_train, **regression_params)
    pred = gbm.predict_gbm(model, X_test,
                           **{"n.tree": regression_params["n.tree"]})
    pred = np.array(pred, dtype=np.float64)
    mse = np.mean((pred - y_test) ** 2.0)
    return mse
@repeat
def bench_friedman1(random_state=None):
    """Friedman #1 synthetic regression problem; 200 train / 1000 test, MSE."""
    X, y = datasets.make_friedman1(n_samples=1200,
                                   random_state=random_state, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    X_train = numpy2ri(X_train)
    X_test = numpy2ri(X_test)
    y_train = numpy2ri(y_train)
    model = gbm.gbm_fit(X_train, y_train, **regression_params)
    pred = gbm.predict_gbm(model, X_test,
                           **{"n.tree": regression_params["n.tree"]})
    pred = np.array(pred, dtype=np.float64)
    mse = np.mean((pred - y_test) ** 2.0)
    return mse
@repeat
def bench_friedman2(random_state=None):
    """Friedman #2 synthetic regression problem; 200 train / 1000 test, MSE."""
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    X_train = numpy2ri(X_train)
    X_test = numpy2ri(X_test)
    y_train = numpy2ri(y_train)
    model = gbm.gbm_fit(X_train, y_train, **regression_params)
    pred = gbm.predict_gbm(model, X_test,
                           **{"n.tree": regression_params["n.tree"]})
    pred = np.array(pred, dtype=np.float64)
    mse = np.mean((pred - y_test) ** 2.0)
    return mse
@repeat
def bench_friedman3(random_state=None):
    """Friedman #3 synthetic regression problem; 200 train / 1000 test, MSE."""
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    X_train = numpy2ri(X_train)
    X_test = numpy2ri(X_test)
    y_train = numpy2ri(y_train)
    model = gbm.gbm_fit(X_train, y_train, **regression_params)
    pred = gbm.predict_gbm(model, X_test,
                           **{"n.tree": regression_params["n.tree"]})
    pred = np.array(pred, dtype=np.float64)
    mse = np.mean((pred - y_test) ** 2.0)
    return mse
if __name__ == "__main__":
print "Example 10.2", bench_random_gaussian()
print "spam", bench_spam()
print "Madelon", bench_madelon()
print "Arcene", bench_arcene()
print "Boston", bench_boston()
print "Friedman#1", bench_friedman1()
print "Friedman#2", bench_friedman2()
print "Friedman#3", bench_friedman3()