Skip to content

Instantly share code, notes, and snippets.

@mvdoc
Last active November 28, 2017 20:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mvdoc/0c2574079dfde78ea649e7dc0a3feab0 to your computer and use it in GitHub Desktop.
Save mvdoc/0c2574079dfde78ea649e7dc0a3feab0 to your computer and use it in GitHub Desktop.
Example with PyMVPA and joblib to run nested classification in parallel
from mvpa2.suite import *
from joblib import Parallel, delayed

# Bump verbosity so the progress messages emitted below get printed.
verbose.level = 3

# Pre-seed the RNG if you want to investigate the effects, thus
# needing reproducible results:
#mvpa2.seed(3)

# Generate fresh synthetic data: 2 classes across 3 chunks, 100 features of
# which only the first two carry signal, at a low SNR.
dataset = normal_feature_dataset(
    perlabel=24,
    nlabels=2,
    nchunks=3,
    nonbogus_features=[0, 1],
    nfeatures=100,
    snr=3.0,
)

# Baseline for the model-selection demonstration: cross-validated error of a
# simple and popular kNN classifier on the same data.
clf_sample = kNN()
cv_sample = CrossValidation(clf_sample, NFoldPartitioner())
verbose(1, "Estimating error using a sample classifier")
error_sample = np.mean(cv_sample(dataset))
"""
For the convenience lets define a helpful function which we will use
twice -- once within cross-validation, and once on the whole dataset
"""
def select_best_clf(dataset_, clfs):
"""Select best model according to CVTE
Helper function which we will use twice -- once for proper nested
cross-validation, and once to see how big an optimistic bias due
to model selection could be if we simply provide an entire dataset.
Parameters
----------
dataset_ : Dataset
clfs : list of Classifiers
Which classifiers to explore
Returns
-------
best_clf, best_error
"""
best_clf = None
best_error = None
for clf in clfs:
cv = CrossValidation(clf, NFoldPartitioner())
# unfortunately we don't have ability to reassign clf atm
# cv.transerror.clf = clf
try:
error = np.mean(cv(dataset_))
except LearnerError, e:
# skip the classifier if data was not appropriate and it
# failed to learn/predict at all
continue
if best_error is None or error < best_error:
best_clf = clf
best_error = error
verbose(4, "Classifier %s cv error=%.2f" % (clf.descr, error))
verbose(3, "Selected the best out of %i classifiers %s with error %.2f"
% (len(clfs), best_clf.descr, best_error))
return best_clf, best_error
# This function runs model selection and assessment for one single partition
def _run_one_partition(isplit, partitions, classifiers):
    """Pick the best classifier on the training half of one partition and
    assess its transfer error on the held-out half.

    Parameters
    ----------
    isplit : int
        Index of the partition, used only for progress reporting.
    partitions : Dataset
        Dataset carrying a 'partitions' attribute as produced by the
        module-level partitioner.
    classifiers : list of Classifiers
        Candidate models handed to ``select_best_clf``.

    Returns
    -------
    tm, best_clfs
        The evaluated TransferMeasure (with 'stats' conditional attribute
        enabled) and a dict counting which classifier got selected.

    NOTE(review): reads the module-level ``splitter`` defined later in the
    script -- fine at call time, but confirm before reusing elsewhere.
    """
    best_clfs = {}
    verbose(2, "Processing split #%i" % isplit)
    # only the training half is needed for model selection; the testing half
    # is consumed by the TransferMeasure below (was an unused local `dstest`)
    dstrain, _dstest = list(splitter.generate(partitions))
    best_clf, best_error = select_best_clf(dstrain, classifiers)
    best_clfs[best_clf.descr] = best_clfs.get(best_clf.descr, 0) + 1
    # now that we have the best classifier, lets assess its transfer
    # to the testing dataset while training on entire training
    tm = TransferMeasure(best_clf, splitter,
                         postproc=BinaryFxNode(mean_mismatch_error,
                                               space='targets'),
                         enable_ca=['stats'])
    tm(partitions)
    return tm, best_clfs
# Now let's run everything
verbose(1, "Estimating error using nested CV for model selection")
partitioner = NFoldPartitioner()
splitter = Splitter('partitions')
confusion = ConfusionMatrix()
# Here we are using joblib Parallel to parallelize each partition.
# Set n_jobs to the number of available cores (or how many you want to use).
out_parallel = Parallel(n_jobs=4, backend='threading')(
    delayed(_run_one_partition)(isplit, partitions,
                                classifiers=clfswh['!gnpp'][:30])
    for isplit, partitions in enumerate(partitioner.generate(dataset)))
# Parallel returns a list with the results of each parallel loop, so we need
# to unravel it to get the confusion matrix
tms_parallel, best_clfs_parallel = zip(*out_parallel)
# BUG FIX: the original looped over an undefined name `tms` (NameError);
# the unzipped measures live in `tms_parallel`
for tm in tms_parallel:
    confusion += tm.ca.stats
# and now merge the per-split selection counts into a single tally
best_clfs = {}
for bc in best_clfs_parallel:
    # .items() works on both Python 2 and 3 (was Py2-only .iteritems(),
    # which also re-looked-up bc[key] instead of using the unpacked value)
    for key, value in bc.items():
        best_clfs[key] = best_clfs.get(key, 0) + value
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment