Last active
November 28, 2017 20:27
-
-
Save mvdoc/0c2574079dfde78ea649e7dc0a3feab0 to your computer and use it in GitHub Desktop.
Example with PyMVPA and joblib to run nested classification in parallel
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from mvpa2.suite import *
# increase verbosity a bit for now
verbose.level = 3
# pre-seed RNG if you want to investigate the effects, thus
# needing reproducible results
#mvpa2.seed(3)
# we import Parallel and delayed from joblib to run in parallel
from joblib import Parallel, delayed
"""
For this simple example lets generate some fresh random data with 2
relevant features and low SNR.
"""
# Synthetic dataset: 2 labels x 24 samples per label, 3 chunks,
# 100 features of which only features 0 and 1 carry signal (SNR 3.0).
dataset = normal_feature_dataset(perlabel=24, nlabels=2, nchunks=3,
                                 nonbogus_features=[0, 1],
                                 nfeatures=100, snr=3.0)
"""
For the demonstration of model selection benefit, lets first compute
cross-validated error using simple and popular kNN.
"""
# Baseline: a single fixed classifier (kNN) cross-validated across
# chunks -- no model selection involved, for later comparison.
clf_sample = kNN()
cv_sample = CrossValidation(clf_sample, NFoldPartitioner())
verbose(1, "Estimating error using a sample classifier")
# mean error across the n folds produced by the partitioner
error_sample = np.mean(cv_sample(dataset))
"""
For the convenience lets define a helpful function which we will use
twice -- once within cross-validation, and once on the whole dataset
"""
def select_best_clf(dataset_, clfs):
    """Select best model according to CVTE

    Helper function which we will use twice -- once for proper nested
    cross-validation, and once to see how big an optimistic bias due
    to model selection could be if we simply provide an entire dataset.

    Parameters
    ----------
    dataset_ : Dataset
        Data on which each candidate classifier is cross-validated.
    clfs : list of Classifiers
        Which classifiers to explore

    Returns
    -------
    best_clf, best_error
        Classifier achieving the lowest mean cross-validation error and
        that error.  ``best_clf`` stays None if every classifier failed
        to learn (in which case the final verbose() call would raise).
    """
    best_clf = None
    best_error = None
    for clf in clfs:
        cv = CrossValidation(clf, NFoldPartitioner())
        # unfortunately we don't have ability to reassign clf atm
        # cv.transerror.clf = clf
        try:
            error = np.mean(cv(dataset_))
        # NOTE: the original used the Python-2-only form
        # ``except LearnerError, e:`` and never used ``e`` -- a bare
        # except of the specific type is equivalent and also valid py3.
        except LearnerError:
            # skip the classifier if data was not appropriate and it
            # failed to learn/predict at all
            continue
        if best_error is None or error < best_error:
            best_clf = clf
            best_error = error
        verbose(4, "Classifier %s cv error=%.2f" % (clf.descr, error))
    verbose(3, "Selected the best out of %i classifiers %s with error %.2f"
            % (len(clfs), best_clf.descr, best_error))
    return best_clf, best_error
# This function will run all classifiers for one single partition
def _run_one_partition(isplit, partitions, classifiers, splitter_=None):
    """Run model selection and transfer assessment for one CV partition.

    Parameters
    ----------
    isplit : int
        Index of the partition; used only for progress reporting.
    partitions : Dataset
        Dataset carrying a 'partitions' attribute as produced by the
        partitioner's ``generate``.
    classifiers : list of Classifiers
        Candidates handed to :func:`select_best_clf` on the training half.
    splitter_ : Splitter or None
        Splitter separating training/testing halves.  Defaults to the
        module-level ``splitter`` (defined below) so existing call sites
        keep working unchanged.

    Returns
    -------
    tm, best_clfs
        ``tm`` -- the TransferMeasure (with the 'stats' conditional
        attribute enabled) already applied to ``partitions``;
        ``best_clfs`` -- a {classifier description: count} tally for
        this single split (always exactly one entry).
    """
    if splitter_ is None:
        # backward-compatible fallback to the module-level splitter
        splitter_ = splitter
    best_clfs = {}
    verbose(2, "Processing split #%i" % isplit)
    dstrain, dstest = list(splitter_.generate(partitions))
    best_clf, best_error = select_best_clf(dstrain, classifiers)
    best_clfs[best_clf.descr] = best_clfs.get(best_clf.descr, 0) + 1
    # now that we have the best classifier, lets assess its transfer
    # to the testing dataset while training on entire training
    tm = TransferMeasure(best_clf, splitter_,
                         postproc=BinaryFxNode(mean_mismatch_error,
                                               space='targets'),
                         enable_ca=['stats'])
    tm(partitions)
    return tm, best_clfs
# Now let's run everything
best_clfs = {}
confusion = ConfusionMatrix()
verbose(1, "Estimating error using nested CV for model selection")
partitioner = NFoldPartitioner()
splitter = Splitter('partitions')
# Here we are using joblib Parallel to parallelize each partition.
# Set n_jobs to the number of available cores (or how many you want to use)
out_parallel = Parallel(n_jobs=4, backend='threading')(
    delayed(_run_one_partition)(isplit, partitions,
                                classifiers=clfswh['!gnpp'][:30])
    for isplit, partitions in enumerate(partitioner.generate(dataset)))
# Parallel returns a list with the results of each parallel loop, so we need
# to unravel it to get the confusion matrix
tms_parallel, best_clfs_parallel = zip(*out_parallel)
# BUG FIX: the original looped over the undefined name ``tms`` which
# raised NameError -- the unzipped transfer measures are ``tms_parallel``.
for tm in tms_parallel:
    confusion += tm.ca.stats
# and now merge the per-split best-classifier tallies into one dict
# (``items()`` instead of py2-only ``iteritems()``; behaves identically)
best_clfs = {}
for bc in best_clfs_parallel:
    for key, value in bc.items():
        best_clfs[key] = best_clfs.get(key, 0) + value
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment