@eliorc
Last active February 25, 2021 11:08
DataMapCallback
from typing import Any, Callable, Optional

import numpy as np
import tensorflow as tf


class DataMapCallback(tf.keras.callbacks.Callback):
    """
    Gather training dynamics for data map generation. Assumes a binary or multi-class model, no support for multi-label.

    Arguments
    ---------

    - `dataset` (``tf.data.Dataset``): Usually, as the paper suggests, this is the training dataset. It should be:

       1. Non-shuffled, so each iteration over the dataset should yield samples in the same order
       2. Already batched, the ``.batch(n)`` method should already be applied on this dataset
       3. Yielding batches of ``(features, labels)``, sample weights are not supported

    - | `outputs_to_probabilities` (``Optional[Callable[[Any], tf.Tensor]]``):
        Callable to convert the model's output to probabilities. Use this if the model outputs logits, a dictionary
        or any other form which is not a tensor of probabilities. Defaults to ``None``.
    - | `sparse_labels` (``bool``): Set to ``True`` if the labels are given as integers (not one-hot encoded). Defaults
        to ``False``.

    Attributes
    ----------

    - | `gold_labels_probabilities` (``np.ndarray``): Gold label predicted probabilities. Has shape
        ``(n_samples, n_epochs)``, where entry ``(i, j)`` is the probability of the gold label for sample ``i`` at
        epoch ``j``.
    - `confidence` (``np.ndarray``): Mean of true label probability across epochs.
    - `variability` (``np.ndarray``): Standard deviation of true label probability across epochs.
    - `correctness` (``np.ndarray``): Fraction of times correctly predicted across epochs.

    Examples
    --------

    Calculate training dynamics during training

    .. code-block:: python3

        import tensorflow as tf
        import tavolo as tvl

        # Load dataset
        train = ...             # Instance of dataset
        train_unshuffled = ...  # Instance of dataset, unshuffled so that each iteration over the dataset would yield
                                # samples in the same order

        # Prepare
        train = train.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
        train_unshuffled = train_unshuffled.batch(BATCH_SIZE * 10)  # No gradient updates in data map, can use bigger batches

        # Create the datamap callback
        datamap = tvl.learning.DataMapCallback(dataset=train_unshuffled)

        # Train
        model.fit(train, epochs=N_EPOCHS, callbacks=[datamap])

        # Get training dynamics
        confidence, variability, correctness = datamap.confidence, datamap.variability, datamap.correctness

    Calculate training dynamics from a model that outputs logits (and NOT probabilities)

    .. code-block:: python3

        import tensorflow as tf
        import tavolo as tvl

        # Create the datamap callback - using the outputs_to_probabilities option
        datamap = tvl.learning.DataMapCallback(dataset=train_unshuffled, outputs_to_probabilities=tf.nn.softmax)

        # Train
        model.fit(train, epochs=N_EPOCHS, callbacks=[datamap])

    References
    ----------

    - `Dataset Cartography: Mapping and Diagnosing Datasets with Training Dynamics`_

    .. _`Dataset Cartography: Mapping and Diagnosing Datasets with Training Dynamics`: https://arxiv.org/pdf/2009.10795
    """
    # TODO - The implementation saves all the gold label probabilities across epochs for the training dynamics
    #        computations. This can be optimized by calculating a running version of each training dynamic.
    #        Once tfp.stats releases RunningVariance and RunningMean to the stable tfp versions - training dynamics
    #        calculations should be reimplemented doing this, thus avoiding (n_epochs - 1) * n_samples floating points
    #        memory usage.
    def __init__(self, dataset: tf.data.Dataset,
                 outputs_to_probabilities: Optional[Callable[[Any], tf.Tensor]] = None,
                 sparse_labels: bool = False):
        """
        :param dataset: Dataset. Usually, as the paper suggests, this is the training dataset. It should be:

            - Non-shuffled, so each iteration over the dataset should yield samples in the same order
            - Already batched, the ``.batch(n)`` method should already be applied on this dataset
            - Yielding batches of ``(features, labels)``, sample weights are not supported

        :param outputs_to_probabilities: Callable to convert the model's output to probabilities. Use this if the
            model outputs logits, a dictionary or any other form which is not a tensor of probabilities.
        :param sparse_labels: Set to ``True`` if the labels are given as integers (not one-hot encoded)
        """

        self._dataset = dataset
        self._outputs2probabilities = outputs_to_probabilities
        self._sparse_labels = sparse_labels

        # Gold label probabilities, shape (n_samples, n_epochs), filled at the end of every epoch
        self._gold_labels_probabilities = None
    def on_epoch_end(self, epoch, logs=None):
        # Gather the gold label probabilities over the whole dataset for this epoch
        gold_label_probabilities = list()
        for x, y in self._dataset:
            probabilities = self.model.predict(x)

            if self._outputs2probabilities is not None:
                probabilities = self._outputs2probabilities(probabilities)

            # Convert integer labels to one-hot encoding
            if self._sparse_labels:
                y = tf.one_hot(y, depth=probabilities.shape[-1])

            # Extract the probability assigned to the gold label of each sample
            if tf.rank(tf.squeeze(y)) == 1:  # Binary classification
                probabilities, y = tf.squeeze(probabilities), tf.squeeze(y)
                batch_gold_label_probabilities = tf.where(y == 0, 1 - probabilities, probabilities)
            elif tf.rank(tf.squeeze(y)) == 2:  # Multi-class classification
                # Verify the labels are one-hot encoded (exactly one 1 per row), multi-label is not supported
                if not tf.reduce_all(tf.reduce_sum(tf.cast(y == 1, tf.int32), axis=-1) == 1):
                    raise ValueError('DataMapCallback does not support multi-label classification')
                batch_gold_label_probabilities = tf.boolean_mask(probabilities, tf.cast(y, tf.bool)).numpy()
            else:
                raise ValueError(
                    'tf.squeeze(y) (y == labels from the dataset) must be of rank 1 for binary classification or '
                    'rank 2 for multi-class. Instead got rank {}'.format(tf.rank(tf.squeeze(y))))

            gold_label_probabilities = np.append(gold_label_probabilities, [batch_gold_label_probabilities])

        # Append this epoch's gold label probabilities as a new column
        if self._gold_labels_probabilities is None:  # Happens only on the first epoch
            self._gold_labels_probabilities = np.expand_dims(gold_label_probabilities, axis=-1)
        else:
            stack = [self._gold_labels_probabilities, np.expand_dims(gold_label_probabilities, axis=-1)]
            self._gold_labels_probabilities = np.hstack(stack)
    @property
    def gold_labels_probabilities(self) -> np.ndarray:
        """
        Gold label predicted probabilities. Has shape ``(n_samples, n_epochs)``, where entry ``(i, j)`` is the
        probability of the gold label for sample ``i`` at epoch ``j``

        :return: Gold label probabilities
        """

        return self._gold_labels_probabilities

    @property
    def confidence(self) -> np.ndarray:
        """
        Mean of true label probability across epochs

        :return: Confidence
        """

        return np.mean(self._gold_labels_probabilities, axis=-1)

    @property
    def variability(self) -> np.ndarray:
        """
        Standard deviation of true label probability across epochs

        :return: Variability
        """

        return np.std(self._gold_labels_probabilities, axis=-1)

    @property
    def correctness(self) -> np.ndarray:
        """
        Fraction of times correctly predicted across epochs

        :return: Correctness
        """

        return np.mean(self._gold_labels_probabilities > 0.5, axis=-1)
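

# ---------------------------------------------------------------------------
# End-to-end usage sketch (not part of the original gist): trains a tiny model
# on synthetic data with DataMapCallback attached, then plots the data map
# (confidence vs. variability, colored by correctness) as in the Dataset
# Cartography paper. The synthetic data, constants, model architecture and
# matplotlib usage below are illustrative assumptions, not requirements of
# the callback.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import matplotlib.pyplot as plt

    N_SAMPLES, N_FEATURES, N_CLASSES = 1000, 20, 3
    BATCH_SIZE, N_EPOCHS = 32, 5

    features = np.random.randn(N_SAMPLES, N_FEATURES).astype(np.float32)
    labels = np.random.randint(0, N_CLASSES, size=N_SAMPLES)

    # Unshuffled, batched dataset for the callback - same sample order on every epoch
    train_unshuffled = tf.data.Dataset.from_tensor_slices((features, labels)).batch(BATCH_SIZE * 10)
    # Shuffled, batched dataset used for the actual gradient updates
    train = tf.data.Dataset.from_tensor_slices((features, labels)).shuffle(N_SAMPLES).batch(BATCH_SIZE)

    # Small classifier that outputs logits (hence outputs_to_probabilities=tf.nn.softmax below)
    model = tf.keras.Sequential([tf.keras.layers.Dense(32, activation='relu'),
                                 tf.keras.layers.Dense(N_CLASSES)])
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

    # Labels are integers and outputs are logits, so both options are used
    datamap = DataMapCallback(dataset=train_unshuffled,
                              outputs_to_probabilities=tf.nn.softmax,
                              sparse_labels=True)
    model.fit(train, epochs=N_EPOCHS, callbacks=[datamap])

    # Data map: x = variability, y = confidence, color = correctness
    plt.scatter(datamap.variability, datamap.confidence, c=datamap.correctness, s=4)
    plt.xlabel('variability')
    plt.ylabel('confidence')
    plt.colorbar(label='correctness')
    plt.title('Data map')
    plt.show()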