Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Joblib.Parallel explicit argument parsing
from __future__ import division
import gc
import numpy as np
from time import sleep
from ext.joblib import Parallel, delayed
from multiprocessing import Process, current_process
from scikits.learn import svm, linear_model
def _score(i, X_train, y_train, X_test, y_test):
    """Fit a linear SVM on the training split and return its test accuracy.

    Parameters
    ----------
    i : job index supplied by the caller; not used inside the function.
    X_train, y_train : training samples and their labels.
    X_test, y_test : held-out samples and their labels.

    Returns
    -------
    The mean of the element-wise comparison between ``y_test`` and the
    labels predicted for ``X_test`` (i.e. the fraction of matches).
    """
    # Alternative estimator left by the author for comparison:
    #   linear_model.SGDClassifier(alpha=0.001, n_iter=10)
    estimator = svm.LinearSVC(C=10, tol=1e-3, dual=False)
    fitted = estimator.fit(X_train, y_train)
    predicted = fitted.predict(X_test)
    matches = (y_test == predicted)
    return matches.mean()
if __name__ == '__main__':
    # Benchmark driver: load the covtype data set, carve out a train/test
    # split, standardise it, and then run the same fit/evaluate job ten times
    # through joblib.Parallel with the arrays passed explicitly to each job.
    import gzip

    f = gzip.open('/home/pprett/workspace/scikit-learn/covtype.data.gz')
    try:
        # The file is comma separated; replace the commas with blanks so the
        # whole payload can be parsed by a single np.fromstring call.
        X = np.fromstring(f.read().replace(",", " "), dtype=np.float64,
                          sep=" ", count=-1)
    finally:
        # Release the handle even if reading or parsing raises.
        f.close()
    X = X.reshape((581012, 55), order="C")

    # The last column holds the labels, everything before it the features.
    y = X[:, -1]
    X = X[:, :-1]

    # Fixed seed so that the shuffled split is reproducible between runs.
    idx = np.arange(X.shape[0])
    np.random.seed(13)
    np.random.shuffle(idx)

    offset = 100000  # 522911
    train_idx = idx[:offset]
    test_idx = idx[offset:]

    # Fancy indexing followed by np.array(...) yields independent C-ordered
    # copies, so the full data set can be dropped afterwards.
    X_train = np.array(X[train_idx], order="C")
    y_train = np.array(y[train_idx], order="C")
    X_test = np.array(X[test_idx], order="C")
    y_test = np.array(y[test_idx], order="C")

    # free memory
    del X
    del y

    # Standardise with training-set statistics. Only the first 10 columns are
    # actually rescaled: for the remaining ones mean is forced to 0 and std
    # to 1, so they pass through unchanged (presumably these are indicator
    # features -- TODO confirm against the data set description).
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    mean[10:] = 0.0
    std[10:] = 1.0
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std
    del std
    del mean
    gc.collect()

    pid = current_process().pid
    print("master pid: %d" % pid)
    print("|X_train| = %d" % X_train.shape[0])
    print("|X_test| = %d" % X_test.shape[0])
    print("X_train: %.2f MB" % (X_train.nbytes / 1024 / 1024))
    print("X_test: %.2f MB" % (X_test.nbytes / 1024 / 1024))

    # Mark the feature matrices read-only before they are handed to the jobs.
    X_train.flags.writeable = False
    X_test.flags.writeable = False

    # NOTE(review): the pause looks like it is there to give time to inspect
    # the master process (see the pid printed above) before workers start.
    sleep(2)
    print("lets go!")
    print("")

    # Ten identical jobs on two workers; every job receives the arrays as
    # explicit arguments.
    scores = Parallel(n_jobs=2, verbose=1)(
        delayed(_score)(i, X_train, y_train, X_test, y_test)
        for i in range(10))
    print(np.mean(scores))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment