@pprett
pprett / boston.json
Created Oct 1, 2012
Decision Tree Viewer (D3 and Sklearn)
{"error": 42716.2954, "samples": 506, "value": [22.532806324110698], "label": "RM <= 6.94", "type": "split", "children": [{"error": 17317.3210, "samples": 430, "value": [19.93372093023257], "label": "LSTAT <= 14.40", "type": "split", "children": [{"error": 6632.2175, "samples": 255, "value": [23.349803921568636], "label": "DIS <= 1.38", "type": "split", "children": [{"error": 390.7280, "samples": 5, "value": [45.58], "label": "CRIM <= 10.59", "type": "split", "children": [{"error": 0.0000, "samples": 4, "value": [50.0], "label": "Leaf - 4", "type": "leaf"}, {"error": 0.0000, "samples": 1, "value": [27.9], "label": "Leaf - 5", "type": "leaf"}]}, {"error": 3721.1632, "samples": 250, "value": [22.90520000000001], "label": "RM <= 6.54", "type": "split", "children": [{"error": 1636.0675, "samples": 195, "value": [21.629743589743576], "label": "LSTAT <= 7.57", "type": "split", "children": [{"error": 129.6307, "samples": 43, "value": [23.969767441860473], "label": "TAX <= 222.50", "type": "split", "children": [{"err
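The nested format above (``error``/``samples``/``value``/``label``/``type``/``children``) can be regenerated from any fitted sklearn tree. A sketch, assuming ``error`` is the node's total squared error (impurity × samples, which matches the root value 42716.30 for 506 Boston samples); ``load_diabetes`` stands in here because ``load_boston`` was removed from modern sklearn:

```python
import json
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.tree import DecisionTreeRegressor

def tree_to_json(est, feature_names, node=0):
    """Recursively convert a fitted sklearn tree into the nested
    dict format used by boston.json."""
    t = est.tree_
    samples = int(t.n_node_samples[node])
    value = t.value[node].ravel().tolist()
    # t.impurity is the per-node MSE; boston.json stores the total
    # squared error, i.e. impurity * samples
    error = float(t.impurity[node]) * samples
    if t.children_left[node] == -1:  # leaf sentinel in sklearn is -1
        return {"error": error, "samples": samples, "value": value,
                "label": "Leaf - %d" % node, "type": "leaf"}
    label = "%s <= %.2f" % (feature_names[t.feature[node]], t.threshold[node])
    return {"error": error, "samples": samples, "value": value,
            "label": label, "type": "split",
            "children": [tree_to_json(est, feature_names, t.children_left[node]),
                         tree_to_json(est, feature_names, t.children_right[node])]}

data = load_diabetes()
reg = DecisionTreeRegressor(max_depth=3, random_state=0).fit(data.data, data.target)
print(json.dumps(tree_to_json(reg, data.feature_names))[:120])
```

The resulting JSON can be fed straight to a D3 tree layout, as the gist's viewer does.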
pprett / grid_search.py
Created Oct 31, 2012
Parallel grid search for sklearn Gradient Boosting
"""Parallel grid search for sklearn's GradientBoosting.
This script uses IPython.parallel to run cross-validated
grid search on an IPython cluster. Each cell on the parameter grid
will be evaluated ``K`` times - results are stored in MongoDB.
The procedure tunes the number of trees ``n_estimators`` by averaging
the staged scores of the GBRT model over all K folds.
You need an IPython ipcluster to connect to - for local use simply
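The tuning step the docstring describes - average the staged test scores over the K folds, then pick the stage with the best mean - can be sketched serially, without IPython.parallel or MongoDB; the dataset and parameters here are illustrative:

```python
import numpy as np
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold

X, y = make_hastie_10_2(n_samples=1000, random_state=1)
K, n_estimators = 3, 100
test_scores = np.zeros((K, n_estimators))

cv = KFold(n_splits=K, shuffle=True, random_state=0)
for k, (train, test) in enumerate(cv.split(X)):
    clf = GradientBoostingClassifier(n_estimators=n_estimators, random_state=0)
    clf.fit(X[train], y[train])
    # staged_predict yields predictions after 1, 2, ..., n_estimators trees
    for i, y_pred in enumerate(clf.staged_predict(X[test])):
        test_scores[k, i] = np.mean(y_pred == y[test])

# average each stage over the K folds, then pick the best stage
mean_scores = test_scores.mean(axis=0)
best_n = int(np.argmax(mean_scores)) + 1
print("best n_estimators:", best_n)
```

In the gist itself each grid cell runs this K-fold loop on a cluster engine and writes the score curves to MongoDB instead of averaging in-process.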
pprett / checkerboards.py
Created Mar 28, 2011
checkerboards covariate shift example
#!/usr/bin/python
"""
Run python checkerboards.py
Example from:
M. Hein (2009). Binary Classification under Sample Selection Bias, In Dataset Shift in Machine Learning, chap. 3, pp. 41-64. The MIT Press.
"""
from __future__ import division
import matplotlib
pprett / readcsv.py
Created May 17, 2016
Read file line by line with StreamReader
import sys
import codecs
from itertools import izip, count
from time import time
def iter_chunks(csvfile, chunk_size):
    chunk = []
    for row in csvfile:
        chunk.append(row)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk
pprett / bench_rocsgd.py
Created Nov 26, 2012
Benchmark sklearn RankSVM implementations
import itertools
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
def transform_pairwise(X, y):
"""Transforms data into pairs with balanced labels for ranking
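The pairwise transform reduces ranking to binary classification: each pair (i, j) with different targets becomes a difference vector X[i] - X[j] labeled sign(y[i] - y[j]), and every other pair is flipped so the +1/-1 classes stay balanced. The gist's preview is truncated here, so this is a sketch of how such a transform typically looks, not necessarily the gist's exact code:

```python
import itertools
import numpy as np

def transform_pairwise(X, y):
    """Form pairwise differences X[i] - X[j] labeled sign(y[i] - y[j]),
    alternating signs so the two classes stay balanced."""
    X_new, y_new = [], []
    comb = itertools.combinations(range(X.shape[0]), 2)
    for k, (i, j) in enumerate(comb):
        if y[i] == y[j]:
            continue  # a tie carries no preference information
        X_new.append(X[i] - X[j])
        y_new.append(np.sign(y[i] - y[j]))
        # flip every second pair so +1/-1 labels are balanced
        if y_new[-1] != (-1) ** k:
            y_new[-1] = -y_new[-1]
            X_new[-1] = -X_new[-1]
    return np.asarray(X_new), np.asarray(y_new).ravel()
```

An ``SGDClassifier`` fit on the transformed pairs then learns a linear scoring function whose dot product with x ranks items - the RankSVM idea this benchmark measures.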
pprett / bench_rcv1.py
Created Nov 26, 2012
Benchmark sklearn's SGDClassifier on RCV1-ccat dataset.
"""
Benchmark sklearn's SGDClassifier on RCV1-ccat dataset.
To generate the input files see http://leon.bottou.org/projects/sgd .
Results
-------
ACC: 0.9479
AUC: 0.9476
pprett / bench_random_forest.py
Last active Oct 12, 2015
Benchmark script comparing scikit-learn's RandomForestClassifier against R's randomForest.
"""
Benchmark script to bench scikit-learn's RandomForestClassifier
vs. R's randomForest.
It uses rpy2 to call R from python. Timings for randomForest are
pessimistic due to a constant overhead from wrapping numpy matrices
in R data frames. The effect of the overhead can be reduced
by increasing the number of trees.
Note: make sure the LD_LIBRARY_PATH is set for rpy2::
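The note is cut off in the preview; a typical setting, assuming R is on the PATH (``R RHOME`` prints the R installation root, whose exact location varies by system), would be:

```shell
# Point the dynamic linker at R's shared library before importing rpy2.
export LD_LIBRARY_PATH="$(R RHOME)/lib:$LD_LIBRARY_PATH"
python bench_random_forest.py
```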
pprett / test_subset.py
Created Aug 7, 2012
Exhibits error in GradientBoostingClassifier
from numpy import genfromtxt
from sklearn.ensemble import GradientBoostingClassifier
def main():
    dataset = genfromtxt(open('train_subset.csv', 'r'), delimiter=',', dtype='float64')
    clf = GradientBoostingClassifier(n_estimators=100, learn_rate=1.0, max_depth=1, random_state=0)
    X = dataset[:, 1:]
    y = dataset[:, 0]
pprett / bench_tree.py
Created Jul 11, 2012
Simple and stupid benchmark for sklearn DecisionTreeRegressor
import numpy as np
from sklearn import datasets
from sklearn.ensemble import gradient_boosting
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
X = X.astype(np.float32)
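In the spirit of the snippet above, a minimal timing harness for ``DecisionTreeRegressor`` on the same Hastie data might look like this (the float32 cast mirrors the gist; the printed timing will of course vary by machine):

```python
import numpy as np
from time import time
from sklearn import datasets
from sklearn.tree import DecisionTreeRegressor

X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
X = X.astype(np.float32)

t0 = time()
est = DecisionTreeRegressor(random_state=0).fit(X, y)
print("fit time: %.3fs, depth: %d, nodes: %d"
      % (time() - t0, est.get_depth(), est.tree_.node_count))
```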
pprett / bench_yahoo_ltrc.py
Created Mar 13, 2012
Sklearn Yahoo LTRC 2010 Benchmark script
import numpy as np
import svmlight_loader
from sklearn.ensemble import GradientBoostingRegressor
from time import time
ROOT_DIR = '/home/pprett/corpora/yahoo-ltrc-2010/data'
X_train, y_train = svmlight_loader.load_svmlight_file(ROOT_DIR + '/set1.train.txt',
n_features=700,