dataframing/0_demo_rbmal.ipynb Secret

## 0_demo_rbmal.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              0_demo_rbmal.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## local_batch_mode.py
from collections import deque
from typing import Mapping, Callable

import numpy as np
from scipy.spatial.distance import euclidean


def euclidean_sim(*args, **kwargs):
    """Turn a Euclidean distance into a similarity score.

    Every positional and keyword argument is forwarded unchanged to `scipy.spatial.distance.euclidean`.

    :return: `1 / (1 + d)`, where `d` is the distance scipy computed; identical vectors therefore score 1.
    """
    distance = euclidean(*args, **kwargs)
    return 1 / (distance + 1)


# TODO support the cold-start case as described in paper.
class RankedBatchLearner:
    """Active learning model that not only supports batch queries, but also ranks query outputs in order of importance.

    This learner differs from other implementations within modAL in that it allows for a single call to its query
    strategy to not only return multiple requested examples, but also (1) have them not fall prey to usual redundancy
    concerns when over-extending single-response query methods and (2) have the response in a ranking order, allowing
    for offline data acquisition by (potentially many) oracles to be prioritized in context of broader budgets
    and resources.

    For more details regarding ranked batch-mode active learning, see the original paper by Cardoso et al.:

        https://www.sciencedirect.com/science/article/pii/S0020025516313949

    :param estimator: core classifier that supports probabilistic classification.
    :type estimator: scikit-learn compatible classifier

    :param X_training: samples for training our core estimator in preparation for ranked-batch active learning.
    :type X_training: numpy.ndarray of shape (n_samples, n_features).

    :param y_training: labels for training our core estimator corresponding to :X_training:.
    :type y_training: numpy.ndarray of shape (n_samples, )

    :param fit_kwargs: keyword arguments for fitting the data. Passed to estimator's `estimator.fit(...)` call.
    :type fit_kwargs: dictionary.

    :raises AssertionError: if :estimator: has no `predict_proba` method.
    :raises ValueError: if :X_training: is None (the cold-start case is not supported yet).
    """

    def __init__(self, estimator, X_training: np.ndarray=None, y_training: np.ndarray=None, **fit_kwargs):

        assert hasattr(estimator, 'predict_proba'), 'Ranked batch requires estimator with `predict_proba` method.'

        # Fail with a clear message instead of an AttributeError on `None.shape` further down.
        if X_training is None:
            raise ValueError('RankedBatchLearner requires X_training; the cold-start case is not supported yet.')

        self.X_training = X_training
        # Despite its name this holds the number of training rows, not the full shape tuple.
        self.X_training_shape = X_training.shape[0]

        self.core_estimator = estimator.fit(X_training, y_training, **fit_kwargs)

    def query(self, unlabeled, n_to_rank=100):
        """Query our top :n_to_rank: instances to label from our unlabeled set.

        Records are picked greedily: after each pick the record is moved from the candidate pool into a working copy
        of the labeled set, so later picks are scored against it as well.

        :param unlabeled: set of records for which we would like to produce a ranked batch query.
        :param n_to_rank: keep only the top :n_to_rank: examples from :unlabeled:. Can be set arbitrarily large.
        :return: integer array containing indices that correspond to their example in :unlabeled:. This array can
            act as a mask for identifying, in-order, the records most worth labeling. Every index appears at most
            once, even when :unlabeled: contains identical rows.
        :raises ValueError: if none of the remaining candidates can be scored (e.g. every score is NaN).
        """

        # Least confident uncertainty score for each u in U: 1 - (highest predicted class probability).
        class_probabilities = self.core_estimator.predict_proba(unlabeled)
        uncertainty = 1 - np.max(class_probabilities, axis=1)

        features = np.asarray(unlabeled)
        n_candidates = features.shape[0]

        # Candidate pool in the layout `select_instance` expects: feature columns followed by the uncertainty column.
        pool = np.concatenate((features, uncertainty[:, np.newaxis]), axis=1)
        # remaining[i] is the row index in :unlabeled: of pool[i]. Tracking positions (rather than searching
        # :unlabeled: for an equal row) keeps indices unique for duplicate rows and works for rows containing NaN.
        remaining = np.arange(n_candidates)
        labeled = np.copy(self.X_training)

        ranking = []
        for _ in range(min(n_candidates, n_to_rank)):

            position, _ = self._best_candidate(labeled, pool, euclidean_sim)
            if position is None:
                raise ValueError('Unable to score any of the remaining unlabeled records.')

            ranking.append(remaining[position])

            # Move the chosen record (features only) from the candidate pool to the working labeled set.
            labeled = np.concatenate((labeled, pool[position:position + 1, :-1]), axis=0)
            pool = np.delete(pool, position, axis=0)
            remaining = np.delete(remaining, position)

        # Explicit integer dtype so that an empty ranking is still usable as an index array.
        return np.array(ranking, dtype=np.intp)

    def select_instance(self, labeled_records, unlabeled_with_pred, similarity_fn=euclidean_sim):
        """Pick the single best record to label next from a candidate pool.

        :param labeled_records: feature rows to compare candidates against.
        :param unlabeled_with_pred: candidate rows; all but the last column are features, the last column is the
            estimator's uncertainty score for that row.
        :param similarity_fn: callable taking two feature vectors and returning their similarity.
        :return: tuple `(best_record, best_score)` where `best_record` is the feature vector (without the uncertainty
            column) of the highest scoring candidate. `(None, -1.0)` when no candidate scores above -1.0, which
            includes an empty pool.
        """

        position, score = self._best_candidate(labeled_records, unlabeled_with_pred, similarity_fn)
        if position is None:
            return None, score

        return unlabeled_with_pred[position, :-1], score

    def _best_candidate(self, labeled_records, unlabeled_with_pred, similarity_fn):
        """Return `(row position, score)` of the highest scoring row of :unlabeled_with_pred:, or `(None, -1.0)`."""

        # NOTE(review): alpha uses the size of the *initial* training set, not of :labeled_records: (which grows
        # during `query`). Kept as in the original implementation -- confirm against the paper's definition.
        n_labeled, n_unlabeled = self.X_training_shape, unlabeled_with_pred.shape[0]
        alpha = n_unlabeled / (n_unlabeled + n_labeled)

        best_position = None
        best_score = -1.0

        for position, record in enumerate(unlabeled_with_pred):

            candidate, uncertainty = record[:-1], record[-1]
            similarity = most_similar(labeled_records, candidate, similarity_fn)

            # Reward candidates that are far from everything already labeled and that the estimator is unsure about.
            score = alpha * (1 - similarity) + (1 - alpha) * uncertainty

            # Strict comparison: on ties the earliest candidate wins.
            if score > best_score:
                best_position, best_score = position, score

        return best_position, best_score


def most_similar(pool: np.ndarray, target: np.ndarray, similarity_fn: Callable):
    """Score :target: against every entry of :pool: and report the highest similarity found.

    :param pool: set of vectors (both from the training set and unlabeled examples added)
    :param target: vector of feature values that is compared with each entry of :pool:.
    :param similarity_fn: a similarity function, called as `similarity_fn(pool_entry, target)`.
    :return: the largest value :similarity_fn: produced over all entries of :pool:.
    """
    similarities = [similarity_fn(candidate, target) for candidate in pool]
    return np.asarray(similarities).max()
	from collections import deque
	from typing import Mapping, Callable

	import numpy as np
	from scipy.spatial.distance import euclidean


	def euclidean_sim(*args, **kwargs):
	    """Turn a Euclidean distance into a similarity score.

	    Every positional and keyword argument is forwarded unchanged to `scipy.spatial.distance.euclidean`.

	    :return: `1 / (1 + d)`, where `d` is the distance scipy computed; identical vectors therefore score 1.
	    """
	    return 1 / (1 + euclidean(*args, **kwargs))


	# TODO support the cold-start case as described in paper.
	class RankedBatchLearner:
	    """Active learning model that not only supports batch queries, but also ranks query outputs in order of importance.

	    This learner differs from other implementations within modAL in that it allows for a single call to its query
	    strategy to not only return multiple requested examples, but also (1) have them not fall prey to usual redundancy
	    concerns when over-extending single-response query methods and (2) have the response in a ranking order, allowing
	    for offline data acquisition by (potentially many) oracles to be prioritized in context of broader budgets
	    and resources.

	    For more details regarding ranked batch-mode active learning, see the original paper by Cardoso et al.:

	        https://www.sciencedirect.com/science/article/pii/S0020025516313949

	    :param estimator: core classifier that supports probabilistic classification.
	    :type estimator: scikit-learn compatible classifier

	    :param X_training: samples for training our core estimator in preparation for ranked-batch active learning.
	    :type X_training: numpy.ndarray of shape (n_samples, n_features).

	    :param y_training: labels for training our core estimator corresponding to :X_training:.
	    :type y_training: numpy.ndarray of shape (n_samples, )

	    :param fit_kwargs: keyword arguments for fitting the data. Passed to estimator's `estimator.fit(...)` call.
	    :type fit_kwargs: dictionary.

	    :raises AssertionError: if :estimator: has no `predict_proba` method.
	    :raises ValueError: if :X_training: is None (the cold-start case is not supported yet).
	    """

	    def __init__(self, estimator, X_training: np.ndarray=None, y_training: np.ndarray=None, **fit_kwargs):

	        assert hasattr(estimator, 'predict_proba'), 'Ranked batch requires estimator with `predict_proba` method.'

	        # Fail with a clear message instead of an AttributeError on `None.shape` further down.
	        if X_training is None:
	            raise ValueError('RankedBatchLearner requires X_training; the cold-start case is not supported yet.')

	        self.X_training = X_training
	        # Despite its name this holds the number of training rows, not the full shape tuple.
	        self.X_training_shape = X_training.shape[0]

	        self.core_estimator = estimator.fit(X_training, y_training, **fit_kwargs)

	    def query(self, unlabeled, n_to_rank=100):
	        """Query our top :n_to_rank: instances to label from our unlabeled set.

	        Records are picked greedily: after each pick the record is moved from the candidate pool into a working copy
	        of the labeled set, so later picks are scored against it as well.

	        :param unlabeled: set of records for which we would like to produce a ranked batch query.
	        :param n_to_rank: keep only the top :n_to_rank: examples from :unlabeled:. Can be set arbitrarily large.
	        :return: integer array containing indices that correspond to their example in :unlabeled:. This array can
	            act as a mask for identifying, in-order, the records most worth labeling. Every index appears at most
	            once, even when :unlabeled: contains identical rows.
	        :raises ValueError: if none of the remaining candidates can be scored (e.g. every score is NaN).
	        """

	        # Least confident uncertainty score for each u in U: 1 - (highest predicted class probability).
	        class_probabilities = self.core_estimator.predict_proba(unlabeled)
	        uncertainty = 1 - np.max(class_probabilities, axis=1)

	        features = np.asarray(unlabeled)
	        n_candidates = features.shape[0]

	        # Candidate pool in the layout `select_instance` expects: feature columns followed by the uncertainty column.
	        pool = np.concatenate((features, uncertainty[:, np.newaxis]), axis=1)
	        # remaining[i] is the row index in :unlabeled: of pool[i]. Tracking positions (rather than searching
	        # :unlabeled: for an equal row) keeps indices unique for duplicate rows and works for rows containing NaN.
	        remaining = np.arange(n_candidates)
	        labeled = np.copy(self.X_training)

	        ranking = []
	        for _ in range(min(n_candidates, n_to_rank)):

	            position, _ = self._best_candidate(labeled, pool, euclidean_sim)
	            if position is None:
	                raise ValueError('Unable to score any of the remaining unlabeled records.')

	            ranking.append(remaining[position])

	            # Move the chosen record (features only) from the candidate pool to the working labeled set.
	            labeled = np.concatenate((labeled, pool[position:position + 1, :-1]), axis=0)
	            pool = np.delete(pool, position, axis=0)
	            remaining = np.delete(remaining, position)

	        # Explicit integer dtype so that an empty ranking is still usable as an index array.
	        return np.array(ranking, dtype=np.intp)

	    def select_instance(self, labeled_records, unlabeled_with_pred, similarity_fn=euclidean_sim):
	        """Pick the single best record to label next from a candidate pool.

	        :param labeled_records: feature rows to compare candidates against.
	        :param unlabeled_with_pred: candidate rows; all but the last column are features, the last column is the
	            estimator's uncertainty score for that row.
	        :param similarity_fn: callable taking two feature vectors and returning their similarity.
	        :return: tuple `(best_record, best_score)` where `best_record` is the feature vector (without the uncertainty
	            column) of the highest scoring candidate. `(None, -1.0)` when no candidate scores above -1.0, which
	            includes an empty pool.
	        """

	        position, score = self._best_candidate(labeled_records, unlabeled_with_pred, similarity_fn)
	        if position is None:
	            return None, score

	        return unlabeled_with_pred[position, :-1], score

	    def _best_candidate(self, labeled_records, unlabeled_with_pred, similarity_fn):
	        """Return `(row position, score)` of the highest scoring row of :unlabeled_with_pred:, or `(None, -1.0)`."""

	        # NOTE(review): alpha uses the size of the *initial* training set, not of :labeled_records: (which grows
	        # during `query`). Kept as in the original implementation -- confirm against the paper's definition.
	        n_labeled, n_unlabeled = self.X_training_shape, unlabeled_with_pred.shape[0]
	        alpha = n_unlabeled / (n_unlabeled + n_labeled)

	        best_position = None
	        best_score = -1.0

	        for position, record in enumerate(unlabeled_with_pred):

	            candidate, uncertainty = record[:-1], record[-1]
	            similarity = most_similar(labeled_records, candidate, similarity_fn)

	            # Reward candidates that are far from everything already labeled and that the estimator is unsure about.
	            score = alpha * (1 - similarity) + (1 - alpha) * uncertainty

	            # Strict comparison: on ties the earliest candidate wins.
	            if score > best_score:
	                best_position, best_score = position, score

	        return best_position, best_score


	def most_similar(pool: np.ndarray, target: np.ndarray, similarity_fn: Callable):
	    """Given a pool of labeled examples, find the example that is closest to our target via our similarity function.

	    :param pool: set of vectors (both from the training set and unlabeled examples added)
	    :param target: vector of feature values. We're looking for this vector's most similar entry within :pool:.
	    :param similarity_fn: a similarity function, called as `similarity_fn(pool_entry, target)`.
	    :return: the largest value :similarity_fn: produced over all entries of :pool:.
	    """
	    return np.max([similarity_fn(x_i, target) for x_i in pool])