Last active
December 26, 2015 22:09
-
-
Save jcrudy/7221551 to your computer and use it in GitHub Desktop.
This is a comparison between the R package "earth" and a Python implementation of multivariate adaptive regression splines. It currently requires the pull request at https://github.com/scikit-learn/scikit-learn/pull/2285.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
============================= | |
Comparison with the R package | |
============================= | |
This script randomly generates earth-style models, then randomly generates data from those models and | |
fits earth models to those data using both the python (:class:`Earth`) and R implementations. It records the sample size, | |
m, the number of input dimensions, n, the number of forward pass iterations, the runtime, and the r^2 | |
statistic for each fit and writes the result to a CSV file. This script requires pandas, rpy2, and a | |
functioning R installation with the earth package installed. | |
''' | |
from __future__ import print_function | |
import numpy | |
import pandas.rpy.common as com | |
import rpy2.robjects as robjects | |
import time | |
import pandas | |
from sklearn.earth import Earth | |
print(__doc__) | |
class DataGenerator(object): | |
def __init__(self): | |
pass | |
def generate(self, m): | |
pass | |
class NoiseGenerator(DataGenerator): | |
def __init__(self, n): | |
self.n = n | |
def generate(self, m): | |
X = numpy.random.normal(size=(m, self.n)) | |
y = numpy.random.normal(size=m) | |
return X, y | |
class LinearGenerator(DataGenerator): | |
def __init__(self, n): | |
self.n = n | |
def generate(self, m): | |
X = numpy.random.normal(size=(m, self.n)) | |
beta = numpy.random.normal(size=self.n) | |
y = numpy.dot(X, beta) + numpy.random.normal(m) | |
return X, y | |
class VFunctionGenerator(DataGenerator): | |
def __init__(self, n): | |
self.n = n | |
def generate(self, m): | |
X = numpy.random.normal(size=(m, self.n)) | |
var = numpy.random.randint(self.n) | |
y = 10 * abs(X[:, var]) + numpy.random.normal(m) | |
return X, y | |
class UFunctionGenerator(DataGenerator): | |
def __init__(self, n): | |
self.n = n | |
def generate(self, m): | |
X = numpy.random.normal(size=(m, self.n)) | |
var = numpy.random.randint(self.n) | |
y = 10 * (X[:, var] ** 2) + numpy.random.normal(m) | |
return X, y | |
class RandomComplexityGenerator(DataGenerator): | |
def __init__(self, n, max_terms=10, max_degree=2): | |
self.n = n | |
self.max_terms = max_terms | |
self.max_degree = max_degree | |
def generate(self, m): | |
X = numpy.random.normal(size=(m, self.n)) | |
#Including the intercept | |
num_terms = numpy.random.randint(2, self.max_terms) | |
coef = 10 * numpy.random.normal(size=num_terms) | |
B = numpy.ones(shape=(m, num_terms)) | |
B[:, 0] += coef[0] | |
for i in range(1, num_terms): | |
degree = numpy.random.randint(1, self.max_degree) | |
for bf in range(degree): | |
knot = numpy.random.normal() | |
dir = 1 - 2 * numpy.random.binomial(1, .5) | |
var = numpy.random.randint(0, self.n) | |
B[:, i] *= (dir * (X[:, var] - knot)) * \ | |
(dir * (X[:, var] - knot) > 0) | |
y = numpy.dot(B, coef) + numpy.random.normal(size=m) | |
return X, y | |
def run_earth(X, y, **kwargs): | |
'''Run with the R package earth. Return prediction value, training time, and number of forward pass iterations.''' | |
r = robjects.r | |
m, n = X.shape | |
data = pandas.DataFrame(X) | |
data['y'] = y | |
r_data = com.convert_to_r_dataframe(data) | |
r('library(earth)') | |
r_func = ''' | |
run <- function(data, degree=1, fast.k=0, penalty=3.0){ | |
time = system.time(model <- earth(y~.,data=data,degree=degree,penalty=penalty))[3] | |
forward_terms = dim(summary(model)$prune.terms)[1] | |
y_pred = predict(model,data) | |
return(list(y_pred, time, forward_terms, model)) | |
} | |
''' | |
r(r_func) | |
run = r('run') | |
r_list = run( | |
**{'data': r_data, | |
'degree': kwargs['max_degree'], | |
'fast.k': 0, | |
'penalty': kwargs['penalty']}) | |
y_pred = numpy.array(r_list[0]).reshape(m) | |
time = r_list[1][0] | |
forward_terms = r_list[2][0] | |
return y_pred, time, (forward_terms - 1) / 2 | |
def run_pyearth(X, y, **kwargs): | |
'''Run with pyearth. Return prediction value, training time, and number of forward pass iterations.''' | |
model = Earth(**kwargs) | |
t0 = time.time() | |
model.fit(X, y) | |
t1 = time.time() | |
y_pred = model.predict(X) | |
forward_iterations = len(model.forward_trace()) - 1 | |
return y_pred, t1 - t0, forward_iterations | |
def compare(generator_class, sample_sizes, dimensions, repetitions, **kwargs): | |
'''Return a data table that includes m, n, pyearth or earth, training time, and number of forward pass iterations.''' | |
header = [ | |
'm', | |
'n', | |
'pyearth', | |
'earth', | |
'time', | |
'forward_iterations', | |
'rsq'] | |
data = [] | |
for n in dimensions: | |
generator = generator_class(n=n) | |
for m in sample_sizes: | |
for rep in range(repetitions): | |
X, y = generator.generate(m=m) | |
y_pred_r, time_r, iter_r = run_earth(X, y, **kwargs) | |
rsq_r = 1 - (numpy.sum((y - y_pred_r) ** 2)) / ( | |
numpy.sum((y - numpy.mean(y)) ** 2)) | |
data.append([m, n, 0, 1, time_r, iter_r, rsq_r]) | |
y_pred_py, time_py, iter_py = run_pyearth(X, y, **kwargs) | |
rsq_py = 1 - \ | |
(numpy.sum((y - y_pred_py) ** 2)) / ( | |
numpy.sum((y - numpy.mean(y)) ** 2)) | |
data.append([m, n, 1, 0, time_py, iter_py, rsq_py]) | |
return pandas.DataFrame(data, columns=header) | |
if __name__ == '__main__': | |
sample_sizes = [100, 200, 300, 500] | |
dimensions = [10, 20, 30] | |
rep = 5 | |
numpy.random.seed(1) | |
data = compare( | |
RandomComplexityGenerator, | |
sample_sizes, | |
dimensions, | |
rep, | |
max_degree=2, | |
penalty=3.0) | |
print(data) | |
data.to_csv('comparison.csv') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment