Skip to content

Instantly share code, notes, and snippets.

@dataframing
Created August 7, 2018 05:16
Show Gist options
  • Save dataframing/4caaf1cf79b2fd1ee1ce36ea3fec6bd2 to your computer and use it in GitHub Desktop.
Save dataframing/4caaf1cf79b2fd1ee1ce36ea3fec6bd2 to your computer and use it in GitHub Desktop.
Batch mode usage
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
from collections import deque
from typing import Mapping, Callable
import numpy as np
from scipy.spatial.distance import euclidean
def euclidean_sim(*args, **kwargs):
    """Return a similarity score derived from the Euclidean distance.

    All arguments are forwarded unchanged to :func:`scipy.spatial.distance.euclidean`. The resulting
    distance ``d`` is mapped to ``1 / (1 + d)``, so a distance of zero yields a similarity of 1 and the
    similarity shrinks towards 0 as the distance grows.

    :param args: positional arguments passed to ``euclidean`` (typically the two vectors to compare).
    :param kwargs: keyword arguments passed to ``euclidean``.
    :return: the similarity ``1 / (1 + distance)``.
    """
    return 1 / (1 + euclidean(*args, **kwargs))
# TODO support the cold-start case as described in paper.
class RankedBatchLearner:
    """Active learning model that not only supports batch queries, but also ranks query outputs in order of importance.

    This learner differs from other implementations within modAL in that it allows for a single call to its query
    strategy to not only return multiple requested examples, but also (1) have them not fall prey to usual redundancy
    concerns when over-extending single-response query methods and (2) have the response in a ranking order, allowing
    for offline data acquisition by (potentially many) oracles to be prioritized in context of broader budgets
    and resources.

    For more details regarding ranked batch-mode active learning, see the original paper by Cardoso et al.:
    https://www.sciencedirect.com/science/article/pii/S0020025516313949

    :param estimator: core classifier that supports probabilistic classification.
    :type estimator: scikit-learn compatible classifier
    :param X_training: samples for training our core estimator in preparation for ranked-batch active learning.
        Required: the cold-start case (no training data) is not supported.
    :type X_training: numpy.ndarray of shape (n_samples, n_features).
    :param y_training: labels for training our core estimator corresponding to :X_training:.
    :type y_training: numpy.ndarray of shape (n_samples, )
    :param fit_kwargs: keyword arguments for fitting the data. Passed to estimator's `estimator.fit(...)` call.
    :type fit_kwargs: dictionary.
    :raises AssertionError: if :estimator: has no `predict_proba` method.
    :raises ValueError: if :X_training: is None.
    """

    def __init__(self, estimator, X_training: np.ndarray = None, y_training: np.ndarray = None,
                 **fit_kwargs: Mapping):
        # Raised explicitly (rather than via `assert`) so the check survives `python -O`; the exception
        # type is kept as AssertionError for backward compatibility.
        if not hasattr(estimator, 'predict_proba'):
            raise AssertionError('Ranked batch requires estimator with `predict_proba` method.')
        if X_training is None:
            raise ValueError('X_training is required: the cold-start case is not supported.')

        self.X_training = X_training
        # Number of training records; used by `select_instance` to compute alpha.
        self.X_training_shape = X_training.shape[0]
        self.core_estimator = estimator.fit(X_training, y_training, **fit_kwargs)

    def query(self, unlabeled, n_to_rank=100):
        """Query our top :n_to_rank: instances to label from our unlabeled set.

        :param unlabeled: set of records for which we would like to produce a ranked batch query.
        :param n_to_rank: keep only the top :n_to_rank: examples from :unlabeled:. Can be set arbitrarily large.
        :return: integer array containing indices that correspond to their example in :unlabeled:. This array
            can act as a mask for identifying, in-order, the records most worth labeling.
        :raises ValueError: if no remaining candidate can be scored above the selection floor.
        """
        # Least-confident uncertainty score for each u in U.
        predictions = self.core_estimator.predict_proba(unlabeled)
        uncertainty = 1 - np.max(predictions, axis=1)

        # Working copies: `candidates` holds the features plus a trailing uncertainty column and shrinks
        # as records are ranked; `labeled` grows with every ranked record.
        candidates = np.concatenate((unlabeled, uncertainty[:, np.newaxis]), axis=1)
        labeled = np.copy(self.X_training)

        # remaining[i] is the index in :unlabeled: of candidates[i]. Tracking indices directly (instead of
        # re-locating the chosen row by value) keeps the ranking correct when :unlabeled: has duplicate rows.
        remaining = np.arange(candidates.shape[0])

        ranking = []
        n_rounds = min(unlabeled.shape[0], n_to_rank)
        for _ in range(n_rounds):
            position, _ = self._best_candidate(labeled, candidates, euclidean_sim)
            if position is None:
                raise ValueError(
                    'No remaining candidate scored above the selection floor; '
                    'check `unlabeled` for non-finite (e.g. NaN) values.'
                )

            # Treat the chosen record as labeled so later picks are pushed away from it.
            labeled = np.concatenate((labeled, candidates[position:position + 1, :-1]), axis=0)
            ranking.append(remaining[position])

            candidates = np.delete(candidates, position, axis=0)
            remaining = np.delete(remaining, position)

        return np.array(ranking, dtype=np.intp)

    def select_instance(self, labeled_records, unlabeled_with_pred, similarity_fn=euclidean_sim):
        """Pick the single best record to label next.

        :param labeled_records: feature vectors considered labeled so far.
        :param unlabeled_with_pred: candidate records; each row is a feature vector followed by its
            uncertainty score in the last column.
        :param similarity_fn: callable mapping two vectors to a similarity score.
        :return: tuple ``(best_record, best_score)`` where :best_record: is the winning feature vector
            (without the uncertainty column). ``(None, -1.0)`` if no candidate scored above -1.0.
        """
        position, score = self._best_candidate(labeled_records, unlabeled_with_pred, similarity_fn)
        if position is None:
            return None, score
        return unlabeled_with_pred[position][:-1], score

    def _best_candidate(self, labeled_records, unlabeled_with_pred, similarity_fn):
        """Return ``(row_position, score)`` of the highest-scoring row of :unlabeled_with_pred:.

        The position is None (with score -1.0) when there are no candidates or none scores above -1.0.
        """
        n_unlabeled = unlabeled_with_pred.shape[0]
        if n_unlabeled == 0:
            return None, -1.0

        # NOTE(review): alpha uses the size of the original training set (self.X_training_shape), not the
        # size of the growing :labeled_records: -- confirm this matches the intended |U| / (|U| + |L|).
        n_labeled = self.X_training_shape
        alpha = n_unlabeled / (n_unlabeled + n_labeled)

        best_position = None
        best_score = -1.0
        for position, record in enumerate(unlabeled_with_pred):
            features, estimator_uncertainty_score = record[:-1], record[-1]
            similarity_score = most_similar(labeled_records, features, similarity_fn)
            # Reward records that are far from anything labeled (diversity) and that the estimator is
            # unsure about (uncertainty); alpha balances the two terms.
            score = alpha * (1 - similarity_score) + (1 - alpha) * estimator_uncertainty_score
            # Strict comparison: on ties the earliest candidate wins.
            if score > best_score:
                best_position = position
                best_score = score

        return best_position, best_score
def most_similar(pool: np.ndarray, target: np.ndarray, similarity_fn: Callable):
    """Given a pool of labeled examples, find the example that is closest to our target via our similarity function.

    :param pool: set of vectors (both from the training set and unlabeled examples added)
    :param target: vector of feature values. We're looking for this vector's most similar entry within :pool:.
    :param similarity_fn: a similarity function, called as ``similarity_fn(pool_vector, target)``. It is
        expected to map the distance between two vectors to [0, 1].
    :return: the largest similarity between :target: and any vector in :pool:.
    :raises ValueError: if :pool: is empty, since there is nothing to compare :target: against.
    """
    similarities = [similarity_fn(candidate, target) for candidate in pool]
    if not similarities:
        raise ValueError('`pool` must contain at least one vector to compare against.')
    return np.max(similarities)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment