Last active
November 28, 2017 20:27
-
-
Save mvdoc/0c2574079dfde78ea649e7dc0a3feab0 to your computer and use it in GitHub Desktop.
Example with PyMVPA and joblib to run nested classification in parallel
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from mvpa2.suite import *
# increase verbosity a bit for now
verbose.level = 3
# pre-seed RNG if you want to investigate the effects, thus
# needing reproducible results
#mvpa2.seed(3)
# we import Parallel and delayed from joblib to run in parallel
from joblib import Parallel, delayed
"""
For this simple example lets generate some fresh random data with 2
relevant features and low SNR.
"""
# Synthetic dataset: 2 labels x 24 samples per label, 3 chunks,
# 100 features of which only features 0 and 1 carry signal (SNR 3.0).
dataset = normal_feature_dataset(perlabel=24, nlabels=2, nchunks=3,
                                 nonbogus_features=[0, 1],
                                 nfeatures=100, snr=3.0)
"""
For the demonstration of model selection benefit, lets first compute
cross-validated error using simple and popular kNN.
"""
# Baseline: a single fixed classifier (kNN) cross-validated across
# chunks -- no model selection involved, for later comparison.
clf_sample = kNN()
cv_sample = CrossValidation(clf_sample, NFoldPartitioner())
verbose(1, "Estimating error using a sample classifier")
# mean error across the n folds produced by the partitioner
error_sample = np.mean(cv_sample(dataset))
"""
For the convenience lets define a helpful function which we will use
twice -- once within cross-validation, and once on the whole dataset
"""
def select_best_clf(dataset_, clfs):
    """Select best model according to CVTE

    Helper function which we will use twice -- once for proper nested
    cross-validation, and once to see how big an optimistic bias due
    to model selection could be if we simply provide an entire dataset.

    Parameters
    ----------
    dataset_ : Dataset
        Data on which each candidate classifier is cross-validated.
    clfs : list of Classifiers
        Which classifiers to explore

    Returns
    -------
    best_clf, best_error
        Classifier achieving the lowest mean cross-validation error and
        that error.  ``best_clf`` stays None if every classifier failed
        to learn (in which case the final verbose() call would raise).
    """
    best_clf = None
    best_error = None
    for clf in clfs:
        cv = CrossValidation(clf, NFoldPartitioner())
        # unfortunately we don't have ability to reassign clf atm
        # cv.transerror.clf = clf
        try:
            error = np.mean(cv(dataset_))
        # NOTE: the original used the Python-2-only form
        # ``except LearnerError, e:`` and never used ``e`` -- a bare
        # except of the specific type is equivalent and also valid py3.
        except LearnerError:
            # skip the classifier if data was not appropriate and it
            # failed to learn/predict at all
            continue
        if best_error is None or error < best_error:
            best_clf = clf
            best_error = error
        verbose(4, "Classifier %s cv error=%.2f" % (clf.descr, error))
    verbose(3, "Selected the best out of %i classifiers %s with error %.2f"
            % (len(clfs), best_clf.descr, best_error))
    return best_clf, best_error
# This function will run all classifiers for one single partition
def _run_one_partition(isplit, partitions, classifiers, splitter_=None):
    """Run model selection and transfer assessment for one CV partition.

    Parameters
    ----------
    isplit : int
        Index of the partition; used only for progress reporting.
    partitions : Dataset
        Dataset carrying a 'partitions' attribute as produced by the
        partitioner's ``generate``.
    classifiers : list of Classifiers
        Candidates handed to :func:`select_best_clf` on the training half.
    splitter_ : Splitter or None
        Splitter separating training/testing halves.  Defaults to the
        module-level ``splitter`` (defined below) so existing call sites
        keep working unchanged.

    Returns
    -------
    tm, best_clfs
        ``tm`` -- the TransferMeasure (with the 'stats' conditional
        attribute enabled) already applied to ``partitions``;
        ``best_clfs`` -- a {classifier description: count} tally for
        this single split (always exactly one entry).
    """
    if splitter_ is None:
        # backward-compatible fallback to the module-level splitter
        splitter_ = splitter
    best_clfs = {}
    verbose(2, "Processing split #%i" % isplit)
    dstrain, dstest = list(splitter_.generate(partitions))
    best_clf, best_error = select_best_clf(dstrain, classifiers)
    best_clfs[best_clf.descr] = best_clfs.get(best_clf.descr, 0) + 1
    # now that we have the best classifier, lets assess its transfer
    # to the testing dataset while training on entire training
    tm = TransferMeasure(best_clf, splitter_,
                         postproc=BinaryFxNode(mean_mismatch_error,
                                               space='targets'),
                         enable_ca=['stats'])
    tm(partitions)
    return tm, best_clfs
# Now let's run everything
best_clfs = {}
confusion = ConfusionMatrix()
verbose(1, "Estimating error using nested CV for model selection")
partitioner = NFoldPartitioner()
splitter = Splitter('partitions')
# Here we are using joblib Parallel to parallelize each partition.
# Set n_jobs to the number of available cores (or how many you want to use)
out_parallel = Parallel(n_jobs=4, backend='threading')(
    delayed(_run_one_partition)(isplit, partitions,
                                classifiers=clfswh['!gnpp'][:30])
    for isplit, partitions in enumerate(partitioner.generate(dataset)))
# Parallel returns a list with the results of each parallel loop, so we need
# to unravel it to get the confusion matrix
tms_parallel, best_clfs_parallel = zip(*out_parallel)
# BUG FIX: the original looped over the undefined name ``tms`` which
# raised NameError -- the unzipped transfer measures are ``tms_parallel``.
for tm in tms_parallel:
    confusion += tm.ca.stats
# and now merge the per-split best-classifier tallies into one dict
# (``items()`` instead of py2-only ``iteritems()``; behaves identically)
best_clfs = {}
for bc in best_clfs_parallel:
    for key, value in bc.items():
        best_clfs[key] = best_clfs.get(key, 0) + value
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment