Skip to content

Instantly share code, notes, and snippets.

@pprett
pprett / joblib_killer.py
Created June 20, 2014 16:00
joblib hangs if job segfaults
import numpy as np
from sklearn.ensemble import gradient_boosting
import time
from joblib import Parallel, delayed
class Bad(object):
tree_ = None
@pprett
pprett / lightning_convergence.py
Created October 6, 2014 11:11
Lightning CDClassifier does not converge
from lightning.impl.primal_cd import CDClassifier
from lightning.impl.datasets.samples_generator import make_classification
# Generate a two-class dataset of 20000 samples with 100 features.
bin_dense, bin_target = make_classification(
    n_samples=20000,
    n_features=100,
    n_informative=5,
    n_classes=2,
    random_state=0,
    flip_y=0.2,
)

# Coordinate-descent classifier with an l2 penalty and log loss, capped at
# 100 iterations.
# NOTE(review): verbose=3 presumably prints solver progress, which is how the
# non-convergence would be observed -- confirm against the lightning docs.
est = CDClassifier(
    C=1.0,
    alpha=0.01,
    random_state=0,
    penalty="l2",
    loss="log",
    verbose=3,
    max_iter=100,
)

# Fit on the first 10000 rows, then score on the remaining rows.
est.fit(bin_dense[:10000, :], bin_target[:10000])
# NOTE(review): the score is neither stored nor printed; presumably this was
# run in an interactive session where the value is echoed.
est.score(bin_dense[10000:, :], bin_target[10000:])
@pprett
pprett / tksvm.py
Created November 12, 2010 10:24
A simple graphical frontend for scikit.learn Libsvm bindings.
"""
==========
Libsvm GUI
==========
A simple graphical frontend for Libsvm mainly intended for didactic
purposes. You can create data points by pointing and clicking, and visualize
the decision region induced by different kernels and parameter settings.
To create positive examples click the left mouse button; to create
@pprett
pprett / mk_product_cat_dataset.py
Created March 24, 2011 13:46
Cross-lingual product category dataset creation script.
#!/usr/bin/python
"""Creates the product category dataset from the Cross-Lingual
Sentiment dataset [1]. The output can be used directly with the
CLSCL reference implementation in NUT [2].
Usage:
./mk_product_cat_dataset.py {en|de|fr|jp} {train|test|unlabeled} output_dir num_docs
@pprett
pprett / joblib_test.py
Created April 6, 2011 09:51
Joblib.Parallel explicit argument parsing
from __future__ import division
import gc
import numpy as np
from time import sleep
from ext.joblib import Parallel, delayed
from multiprocessing import Process, current_process
from scikits.learn import svm, linear_model
@pprett
pprett / linearsvc_vs_svc.py
Created May 24, 2011 12:00
High difference in classifier accuracies with LinearSVC and SVC v2
"""High difference in classifier accuracies with LinearSVC and SVC.
Get data.npz from [1].
[1] https://docs.google.com/leaf?id=0B1BhwRZOwyxRZTcxZDA1OWMtZjZkMy00YjgxLWI3ZTMtZjJkNGIyODAyOTQy&hl=en_US
"""
print __doc__
import numpy as np
from functools import partial
@pprett
pprett / njobsbug.py
Created May 24, 2011 11:00
LinearSVC bug with n_jobs
#!/usr/bin/python
import sys
import numpy as np
from pprint import pprint
from scikits.learn.cross_val import StratifiedKFold
from scikits.learn.grid_search import GridSearchCV
from scikits.learn import svm
from scikits.learn.metrics import zero_one_score, f1_score, classification_report
@pprett
pprett / benchmark_gbm.py
Created November 4, 2011 10:36
Benchmark R's gbm module via rpy2
"""
Script to benchmark R's gbm package via rpy2.
NOTE::
make sure you run
$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64/R/lib
"""
@pprett
pprett / bench_yahoo_ltrc.py
Created March 13, 2012 06:29
Sklearn Yahoo LTRC 2010 Benchmark script
import numpy as np
import svmlight_loader
from sklearn.ensemble import GradientBoostingRegressor
from time import time
ROOT_DIR = '/home/pprett/corpora/yahoo-ltrc-2010/data'
X_train, y_train = svmlight_loader.load_svmlight_file(ROOT_DIR + '/set1.train.txt',
n_features=700,
@pprett
pprett / bench_tree.py
Created July 11, 2012 12:17
Simple and stupid benchmark for sklearn DecisionTreeRegressor
import numpy as np
from sklearn import datasets
from sklearn.ensemble import gradient_boosting
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
X = X.astype(np.float32)