Peter Prettenhofer pprett

## readcsv.py
import sys
import codecs

from itertools import izip, count
from time import time

def iter_chunks(csvfile, chunk_size):
    chunk = []
    for row in csvfile:
        chunk.append(row)

## lightning_convergence.py
from lightning.impl.primal_cd import CDClassifier
from lightning.impl.datasets.samples_generator import make_classification

# Build a seeded two-class dataset: 20000 samples, 100 features,
# 5 of them informative, generated with flip_y=0.2.
bin_dense, bin_target = make_classification(
    n_samples=20000,
    n_features=100,
    n_informative=5,
    n_classes=2,
    random_state=0,
    flip_y=0.2,
)

# Classifier configured with an l2 penalty and log loss; verbose=3 is
# passed so the fit reports its progress.
est = CDClassifier(
    C=1.0,
    alpha=0.01,
    random_state=0,
    penalty="l2",
    loss="log",
    verbose=3,
    max_iter=100,
)

# Fit on the first 10000 rows, then score on the remaining rows.
train, test = slice(None, 10000), slice(10000, None)
est.fit(bin_dense[train, :], bin_target[train])
est.score(bin_dense[test, :], bin_target[test])

## joblib_killer.py
import numpy as np
from sklearn.ensemble import gradient_boosting
import time

from joblib import Parallel, delayed

class Bad(object):
    tree_ = None


## bench_rcv1.py
"""
Benchmark sklearn's SGDClassifier on RCV1-ccat dataset.

To generate the input files, see http://leon.bottou.org/projects/sgd .

Results
-------
ACC: 0.9479
AUC: 0.9476

## bench_rocsgd.py
import itertools

import numpy as np

from sklearn.linear_model import SGDClassifier
from sklearn import metrics


def transform_pairwise(X, y):
    """Transforms data into pairs with balanced labels for ranking

## bench_random_forest.py
"""
Benchmark script to bench scikit-learn's RandomForestClassifier
vs. R's randomForest.

It uses rpy2 to call R from python. Timings for randomForest are
pessimistic due to a constant overhead by wrapping numpy matrices
in R data_frames. The effect of the overhead can be reduced
by increasing the number of trees.

Note: make sure the LD_LIBRARY_PATH is set for rpy2::

## grid_search.py
"""Parallel grid search for sklearn's GradientBoosting.

This script uses IPython.parallel to run cross-validated
grid search on an IPython cluster. Each cell on the parameter grid
will be evaluated ``K`` times - results are stored in MongoDB.

The procedure tunes the number of trees ``n_estimators`` by averaging
the staged scores of the GBRT model over all K folds.

You need an IPython ipcluster to connect to - for local use simply

## boston.json
{"error": 42716.2954, "samples": 506, "value": [22.532806324110698], "label": "RM <= 6.94", "type": "split", "children": [{"error": 17317.3210, "samples": 430, "value": [19.93372093023257], "label": "LSTAT <= 14.40", "type": "split", "children": [{"error": 6632.2175, "samples": 255, "value": [23.349803921568636], "label": "DIS <= 1.38", "type": "split", "children": [{"error": 390.7280, "samples": 5, "value": [45.58], "label": "CRIM <= 10.59", "type": "split", "children": [{"error": 0.0000, "samples": 4, "value": [50.0], "label": "Leaf - 4", "type": "leaf"}, {"error": 0.0000, "samples": 1, "value": [27.9], "label": "Leaf - 5", "type": "leaf"}]}, {"error": 3721.1632, "samples": 250, "value": [22.90520000000001], "label": "RM <= 6.54", "type": "split", "children": [{"error": 1636.0675, "samples": 195, "value": [21.629743589743576], "label": "LSTAT <= 7.57", "type": "split", "children": [{"error": 129.6307, "samples": 43, "value": [23.969767441860473], "label": "TAX <= 222.50", "type": "split", "children": [{"err

## test_subset.py
from numpy import genfromtxt
from sklearn.ensemble import GradientBoostingClassifier

def main():
	dataset = genfromtxt(open('train_subset.csv','r'), delimiter=',', dtype='float64')
	clf = GradientBoostingClassifier(n_estimators=100, learn_rate=1.0, max_depth=1, random_state=0)

	X = dataset[:,1:]
	y = dataset[:,0]

## bench_tree.py
import numpy as np
from sklearn import datasets
from sklearn.ensemble import gradient_boosting
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Generate 12000 samples with the Hastie 10.2 generator (seeded with random_state=1).
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
# Cast the features to float32 (presumably the dtype the benchmarked estimators expect; TODO confirm).
X = X.astype(np.float32)
	import sys
	import codecs

	from itertools import izip, count
	from time import time

	def iter_chunks(csvfile, chunk_size):
	chunk = []
	for row in csvfile:
	chunk.append(row)
	from lightning.impl.primal_cd import CDClassifier
	from lightning.impl.datasets.samples_generator import make_classification

	bin_dense, bin_target = make_classification(n_samples=20000, n_features=100, n_informative=5,
	n_classes=2, random_state=0, flip_y=0.2)

	est = CDClassifier(C=1.0, alpha=0.01, random_state=0, penalty="l2", loss="log", verbose=3, max_iter=100)
	est.fit(bin_dense[:10000,:], bin_target[:10000])
	est.score(bin_dense[10000:,:], bin_target[10000:])
	import numpy as np
	from sklearn.ensemble import gradient_boosting
	import time

	from joblib import Parallel, delayed

	class Bad(object):
	tree_ = None
	"""
	Benchmark sklearn's SGDClassifier on RCV1-ccat dataset.

	To generate the input files, see http://leon.bottou.org/projects/sgd .

	Results
	-------
	ACC: 0.9479
	AUC: 0.9476
	import itertools

	import numpy as np

	from sklearn.linear_model import SGDClassifier
	from sklearn import metrics


	def transform_pairwise(X, y):
	"""Transforms data into pairs with balanced labels for ranking
	"""
	Benchmark script to bench scikit-learn's RandomForestClassifier
	vs. R's randomForest.

	It uses rpy2 to call R from python. Timings for randomForest are
	pessimistic due to a constant overhead by wrapping numpy matrices
	in R data_frames. The effect of the overhead can be reduced
	by increasing the number of trees.

	Note: make sure the LD_LIBRARY_PATH is set for rpy2::
	"""Parallel grid search for sklearn's GradientBoosting.

	This script uses IPython.parallel to run cross-validated
	grid search on an IPython cluster. Each cell on the parameter grid
	will be evaluated ``K`` times - results are stored in MongoDB.

	The procedure tunes the number of trees ``n_estimators`` by averaging
	the staged scores of the GBRT model over all K folds.

	You need an IPython ipcluster to connect to - for local use simply
	from numpy import genfromtxt
	from sklearn.ensemble import GradientBoostingClassifier

	def main():
	dataset = genfromtxt(open('train_subset.csv','r'), delimiter=',', dtype='float64')
	clf = GradientBoostingClassifier(n_estimators=100, learn_rate=1.0, max_depth=1, random_state=0)

	X = dataset[:,1:]
	y = dataset[:,0]
	import numpy as np
	from sklearn import datasets
	from sklearn.ensemble import gradient_boosting
	from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
	from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

	X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
	X = X.astype(np.float32)