-
-
Save dataframing/4caaf1cf79b2fd1ee1ce36ea3fec6bd2 to your computer and use it in GitHub Desktop.
Batch mode usage
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import deque
from typing import Any, Callable, Mapping, Optional

import numpy as np
from scipy.spatial.distance import euclidean
def euclidean_sim(*args, **kwargs):
    """Turn a Euclidean distance into a similarity score.

    All positional and keyword arguments are forwarded unchanged to
    ``scipy.spatial.distance.euclidean``.

    :return: ``1 / (1 + distance)``; equals 1 when the distance is 0 and shrinks towards 0 as the
        distance grows.
    """
    distance = euclidean(*args, **kwargs)
    return 1 / (1 + distance)
# TODO: support the cold-start case as described in the paper.
class RankedBatchLearner:
    """Active learning model that not only supports batch queries, but also ranks query outputs in order of importance.

    This learner differs from other implementations within modAL in that it allows for a single call to its query
    strategy to not only return multiple requested examples, but also (1) have them not fall prey to usual redundancy
    concerns when over-extending single-response query methods and (2) have the response in a ranking order, allowing
    for offline data acquisition by (potentially many) oracles to be prioritized in context of broader budgets
    and resources.

    For more details regarding ranked batch-mode active learning, see the original paper by Cardoso et al.:
    https://www.sciencedirect.com/science/article/pii/S0020025516313949

    :param estimator: core classifier that supports probabilistic classification.
    :type estimator: scikit-learn compatible classifier
    :param X_training: samples for training our core estimator in preparation for ranked-batch active learning.
    :type X_training: numpy.ndarray of shape (n_samples, n_features).
    :param y_training: labels for training our core estimator corresponding to :X_training:.
    :type y_training: numpy.ndarray of shape (n_samples, )
    :param fit_kwargs: keyword arguments for fitting the data. Passed to estimator's `estimator.fit(...)` call.
    :type fit_kwargs: dictionary.
    :raises AssertionError: if :estimator: has no `predict_proba` method.
    :raises ValueError: if :X_training: is not provided.
    """

    def __init__(self, estimator, X_training: Optional[np.ndarray] = None,
                 y_training: Optional[np.ndarray] = None, **fit_kwargs: Any):
        # Raised explicitly (rather than via `assert`) so the check is not stripped under `python -O`;
        # the exception type is kept as AssertionError for backward compatibility.
        if not hasattr(estimator, 'predict_proba'):
            raise AssertionError('Ranked batch requires estimator with `predict_proba` method.')
        if X_training is None:
            raise ValueError('X_training must be provided; starting without labeled data (cold start) '
                             'is not supported yet.')

        self.X_training = X_training
        # Number of rows in the initial training set (despite the attribute name, not the full shape).
        self.X_training_shape = X_training.shape[0]
        # Whatever `estimator.fit(...)` returns is what we later call `predict_proba` on.
        self.core_estimator = estimator.fit(X_training, y_training, **fit_kwargs)

    def query(self, unlabeled, n_to_rank=100):
        """Query our top :n_to_rank: instances to label from our unlabeled set.

        :param unlabeled: set of records for which we would like to produce a ranked batch query.
        :param n_to_rank: keep only the top :n_to_rank: examples from :unlabeled:. Can be set arbitrarily large.
        :return: array containing indices that correspond to their example in :unlabeled:. This array can
            act as a mask for identifying, in-order, the records most worth labeling. Every index appears at
            most once, even when :unlabeled: contains duplicate rows.
        :raises ValueError: if no remaining candidate obtains a usable score (e.g. every score is NaN).
        """
        unlabeled = np.asarray(unlabeled)
        n_candidates = unlabeled.shape[0]
        n_picks = min(n_candidates, n_to_rank)
        if n_picks <= 0:
            # Nothing to rank: skip the estimator call and return an empty index array.
            return np.array([], dtype=np.intp)

        # Least-confidence uncertainty for each unlabeled record: 1 - max class probability.
        probabilities = self.core_estimator.predict_proba(unlabeled)
        uncertainty = 1 - np.max(probabilities, axis=1)
        # Candidate matrix layout expected by `select_instance`: features, then uncertainty as last column.
        candidates = np.concatenate((unlabeled, uncertainty[:, np.newaxis]), axis=1)

        # remaining[i] is the row of :unlabeled: that candidates[i] came from. Tracking positions explicitly
        # (instead of re-finding the chosen row by value equality) keeps duplicate rows distinct.
        remaining = np.arange(n_candidates)
        labeled = np.copy(self.X_training)
        ranking = []

        for _ in range(n_picks):
            position, _score = self._best_candidate(labeled, candidates, euclidean_sim)
            if position is None:
                raise ValueError('No remaining candidate scored above the -1.0 floor; '
                                 'check the features and predicted probabilities for NaN values.')

            ranking.append(remaining[position])
            # The chosen record now counts as labeled, so later picks are pushed away from it.
            labeled = np.concatenate((labeled, candidates[position:position + 1, :-1]), axis=0)
            candidates = np.delete(candidates, position, axis=0)
            remaining = np.delete(remaining, position)

        return np.array(ranking, dtype=np.intp)

    def select_instance(self, labeled_records, unlabeled_with_pred, similarity_fn=euclidean_sim):
        """Pick the single unlabeled record with the highest combined diversity/uncertainty score.

        :param labeled_records: array of already-labeled feature vectors, one per row.
        :param unlabeled_with_pred: array with one candidate per row; all columns but the last are features,
            the last column is the estimator's uncertainty for that candidate.
        :param similarity_fn: callable taking two vectors and returning their similarity.
        :return: tuple `(best_record, best_score)` where :best_record: is the feature part of the winning row.
            Returns `(None, -1.0)` when no candidate scores above -1.0 (including an empty candidate set).
        """
        best_index, best_score = self._best_candidate(labeled_records, unlabeled_with_pred, similarity_fn)
        if best_index is None:
            return None, best_score
        return unlabeled_with_pred[best_index][:-1], best_score

    def _best_candidate(self, labeled_records, unlabeled_with_pred, similarity_fn):
        """Return `(row_index, score)` of the best-scoring row of :unlabeled_with_pred:, or `(None, -1.0)`."""
        n_unlabeled = unlabeled_with_pred.shape[0]
        if n_unlabeled == 0:
            return None, -1.0

        # NOTE(review): the labeled count is the size of the initial training set captured in `__init__`,
        # not the size of :labeled_records: (which grows during `query`) -- confirm this is intended.
        n_labeled = self.X_training_shape
        alpha = n_unlabeled / (n_unlabeled + n_labeled)

        best_index, best_score = None, -1.0
        for index, record in enumerate(unlabeled_with_pred):
            features, estimator_uncertainty = record[:-1], record[-1]
            similarity = most_similar(labeled_records, features, similarity_fn)
            # alpha weights dissimilarity to the labeled set; (1 - alpha) weights estimator uncertainty.
            score = alpha * (1 - similarity) + (1 - alpha) * estimator_uncertainty
            # Strict comparison: on ties the earliest candidate wins.
            if score > best_score:
                best_index, best_score = index, score

        return best_index, best_score
def most_similar(pool: np.ndarray, target: np.ndarray, similarity_fn: Callable):
    """Given a pool of labeled examples, find the example that is closest to our target via our similarity function.

    :param pool: set of vectors (both from the training set and unlabeled examples added)
    :param target: vector of feature values. We're looking for this vector's most similar entry within :pool:.
    :param similarity_fn: a similarity function that maps the distance between two vectors to [0, 1].
    :return: the largest similarity between :target: and any vector in :pool:.
    :raises ValueError: if :pool: is empty, since there is nothing to compare :target: against.
    """
    similarities = [similarity_fn(candidate, target) for candidate in pool]
    if not similarities:
        # Fail with an explicit message instead of NumPy's generic zero-size reduction error.
        raise ValueError('Cannot compute the most similar record: the pool is empty.')
    return np.max(similarities)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment