Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Benchmarking `sklearn.semi_supervised` `n_iter_` as a function of model and data characteristics
import numpy as np
from sklearn import datasets
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
# Benchmark: how many iterations (`n_iter_`) each semi-supervised model runs
# when fitted, as a function of the fraction of samples whose label is hidden.
#
# One tab-separated row is printed per (dataset, model, p) combination:
#   n_samples, p, model.alpha, mean(n_iter_), std(n_iter_)
# The hidden-label mask is drawn from NumPy's global RNG, which is not seeded
# here, so the reported numbers vary from run to run.

# Alternative data source (disabled): synthetic problems of increasing size.
### for n_samples in [20, 200, 2000, 20000]:
###     X, y = datasets.make_classification(n_samples=n_samples, n_classes=3, n_informative=3)
for (X, y) in [datasets.load_iris(return_X_y=True)]:
    for model in [LabelPropagation(max_iter=1000),
                  # Other configurations that can be enabled:
                  # LabelSpreading(alpha=0.01),
                  # LabelSpreading(alpha=0.1),
                  # LabelSpreading(alpha=0.3)
                  ]:
        # p is the probability with which each sample's label is hidden.
        for p in [.1, .15, .2, .25, .5, .75]:
            n_iter = []
            # 20 independent random maskings per setting; the same model
            # instance is re-fitted each time.
            for _ in range(20):
                random_unlabeled_points = np.random.rand(len(y)) < p
                labels = np.copy(y)  # keep the true labels in y intact
                # -1 is the marker sklearn.semi_supervised uses for "unlabeled".
                labels[random_unlabeled_points] = -1
                model.fit(X, labels)
                n_iter.append(model.n_iter_)
            # Summarise the 20 repetitions for this (model, p) setting.
            print(len(y), p, model.alpha, np.mean(n_iter), np.std(n_iter), sep='\t')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.