@nlgranger
Last active January 24, 2018 09:57
standalone CTC in Theano
$ nosetests .
Using cuDNN version 7005 on context None
Preallocating 9494/11170 Mb (0.850000) on cuda1
Mapped name None to device cuda1: GeForce GTX 1080 Ti (0000:03:00.0)
EEE
======================================================================
ERROR: test_forward_backward (test_ctc.TestCTC)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/home/granger/bug/test_ctc.py", line 58, in test_forward_backward
    alphas = alphas.eval()
  File "/home/granger/dev/Theano/theano/gof/graph.py", line 522, in eval
    self._fn_cache[inputs] = theano.function(inputs, self)
  File "/home/granger/dev/Theano/theano/compile/function.py", line 317, in function
    output_keys=output_keys)
  File "/home/granger/dev/Theano/theano/compile/pfunc.py", line 486, in pfunc
    output_keys=output_keys)
  File "/home/granger/dev/Theano/theano/compile/function_module.py", line 1839, in orig_function
    name=name)
  File "/home/granger/dev/Theano/theano/compile/function_module.py", line 1519, in __init__
    optimizer_profile = optimizer(fgraph)
  File "/home/granger/dev/Theano/theano/gof/opt.py", line 108, in __call__
    return self.optimize(fgraph)
  File "/home/granger/dev/Theano/theano/gof/opt.py", line 97, in optimize
    ret = self.apply(fgraph, *args, **kwargs)
  File "/home/granger/dev/Theano/theano/gof/opt.py", line 263, in apply
    self.failure_callback(e, self, optimizer)
  File "/home/granger/dev/Theano/theano/gof/opt.py", line 202, in warn
    raise exc
  File "/home/granger/dev/Theano/theano/gof/opt.py", line 251, in apply
    sub_prof = optimizer.optimize(fgraph)
  File "/home/granger/dev/Theano/theano/gof/opt.py", line 97, in optimize
    ret = self.apply(fgraph, *args, **kwargs)
  File "/home/granger/dev/Theano/theano/gof/opt.py", line 2513, in apply
    lopt_change = self.process_node(fgraph, node, lopt)
  File "/home/granger/dev/Theano/theano/gof/opt.py", line 2039, in process_node
    lopt, node)
  File "/home/granger/dev/Theano/theano/gof/opt.py", line 1933, in warn_inplace
    return NavigatorOptimizer.warn(exc, nav, repl_pairs, local_opt, node)
  File "/home/granger/dev/Theano/theano/gof/opt.py", line 1919, in warn
    raise exc
  File "/home/granger/dev/Theano/theano/gof/opt.py", line 2034, in process_node
    replacements = lopt.transform(node)
  File "/home/granger/dev/Theano/theano/tensor/opt.py", line 4989, in transform
    num, denum = self.simplify(list(orig_num), list(orig_denum), out.type)
  File "/home/granger/dev/Theano/theano/tensor/opt.py", line 4833, in simplify
    out_type=out_type)
  File "/home/granger/dev/Theano/theano/tensor/opt.py", line 4919, in simplify_constants
    out_type=out_type)
  File "/home/granger/dev/Theano/theano/tensor/opt.py", line 6328, in add_calculate
    v = reduce(np.add, num, zero) - reduce(np.add, denum, zero)
TypeError: numpy boolean subtract, the `-` operator, is deprecated, use the bitwise_xor, the `^` operator, or the logical_xor function instead.
-------------------- >> begin captured logging << --------------------
theano.gof.opt: ERROR: Optimization failure due to: local_add_canonizer
theano.gof.opt: ERROR: node: Elemwise{add,no_inplace}(InplaceDimShuffle{0,1,x}.0, InplaceDimShuffle{x,0,1}.0)
theano.gof.opt: ERROR: TRACEBACK:
theano.gof.opt: ERROR: Traceback (most recent call last):
  File "/home/granger/dev/Theano/theano/gof/opt.py", line 2034, in process_node
    replacements = lopt.transform(node)
  File "/home/granger/dev/Theano/theano/tensor/opt.py", line 4989, in transform
    num, denum = self.simplify(list(orig_num), list(orig_denum), out.type)
  File "/home/granger/dev/Theano/theano/tensor/opt.py", line 4833, in simplify
    out_type=out_type)
  File "/home/granger/dev/Theano/theano/tensor/opt.py", line 4919, in simplify_constants
    out_type=out_type)
  File "/home/granger/dev/Theano/theano/tensor/opt.py", line 6328, in add_calculate
    v = reduce(np.add, num, zero) - reduce(np.add, denum, zero)
TypeError: numpy boolean subtract, the `-` operator, is deprecated, use the bitwise_xor, the `^` operator, or the logical_xor function instead.
.....
.....
..... > more error output
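For context, the failing node is an elementwise add of two broadcast boolean tensors, which presumably originates from one of the boolean mask sums in the code below (e.g. `mask = T.ge(...) + T.ge(...)` or `isneginf(a_t) + mask` inside `ctc_forward.step`): Theano's `local_add_canonizer` then attempts a numpy subtraction on boolean operands while folding constants. The snippet below is a hypothetical minimal reproduction, not part of the gist and not verified against this exact Theano revision; it only sketches how a boolean add node of the same form could reach the same optimizer path.

# Hypothetical repro sketch (assumed trigger, untested): compile a graph whose
# output is the sum of two broadcast boolean masks, mirroring the failing node
# Elemwise{add}(DimShuffle{0,1,x}, DimShuffle{x,0,1}).
import theano
import theano.tensor as T

x = T.matrix()
y = T.matrix()
mask = T.ge(x, 0)[:, :, None] + T.ge(y, 0)[None, :, :]  # bool + bool
f = theano.function([x, y], mask)  # local_add_canonizer may raise the TypeError here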
# Author: Nicolas Granger <nicolas.granger.m@gmail.com>
#
# Implements the connectionist temporal classification loss from:
# Graves, A., Fernández, S., Gomez, F., & Schmidhuber, J. (2006, June).
# Connectionist temporal classification: labelling unsegmented sequence data
# with recurrent neural networks. In Proceedings of the 23rd international
# conference on Machine learning (pp. 369-376). ACM.
# ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf
import numpy as np
import theano
import theano.tensor as T
from theano.tensor import discrete_dtypes, continuous_dtypes

def isneginf(x, neginf=-1e9):
    return x < neginf

def logaddexp(x, y, magnitude=20):
    x, y = T.minimum(x, y), T.maximum(x, y)
    diff = T.minimum(y - x, magnitude)
    res = x + T.log(1 + T.exp(diff))
    return T.switch((y - x > magnitude), y, res)

def logsumexp(x, axis, keepdims=False):
    k = T.max(x, axis=axis, keepdims=True)
    res = T.log(T.sum(T.exp(x - k), axis=axis, keepdims=keepdims)) + k
    return T.switch(isneginf(k), -2e9, res)

def log_softmax(X, axis=-1, clip=None):
    k = T.max(X, axis=axis, keepdims=True)
    norm_X = X - k
    if clip is not None:
        norm_X = T.maximum(norm_X, clip)
    log_sum_exp_X = logsumexp(norm_X, axis=axis, keepdims=True)
    return norm_X - log_sum_exp_X

# Bits of the CTC algorithm ---------------------------------------------------
def insert_alternating_blanks(labels, blank_label):
    batch_size, label_size = labels.shape
    blanked_labels = T.zeros((batch_size, 2 * label_size + 1), dtype=np.int32)
    blanked_labels = T.set_subtensor(blanked_labels[:, 0::2], blank_label)
    blanked_labels = T.set_subtensor(blanked_labels[:, 1:-1:2], labels)
    return blanked_labels

def ctc_forward(log_odds, seq_sizes,
                blanked_labels, label_sizes, not_repeated):
    seqsize, batch_sz, _ = log_odds.shape
    label_size = blanked_labels.shape[1]

    def step(t, a_tm1, log_odds_,
             seq_sizes_, blanked_labels_, label_sizes_, not_repeated_):
        y_t = log_odds_[t]
        a_t = a_tm1
        a_t = T.set_subtensor(
            a_t[:, 1:],
            logaddexp(a_t[:, 1:], a_tm1[:, :-1]))
        a_t = T.set_subtensor(
            a_t[:, 2:],
            logaddexp(a_t[:, 2:], T.switch(not_repeated_, a_tm1[:, :-2], -2e9)))

        # stop after a_T(|l'|)
        mask = T.ge(t, seq_sizes_)[:, None] \
            + T.ge(T.arange(label_size)[None, :],
                   2 * label_sizes_[:, None] + 1)
        a_t = T.switch(
            isneginf(a_t) + mask, -2e9,
            a_t + y_t[T.arange(batch_sz)[:, None], blanked_labels_])

        return a_t

    alpha_init = -2e9 * T.ones((batch_sz, label_size))
    alpha_init = T.set_subtensor(alpha_init[:, 0], 0)

    alphas, _ = theano.scan(
        fn=step,
        sequences=[T.arange(seqsize)],
        outputs_info=alpha_init,
        non_sequences=[log_odds, seq_sizes, blanked_labels, label_sizes,
                       not_repeated],
        name="ctc_forward")

    return alphas

def ctc_backward(log_odds, seq_sizes,
                 blanked_labels, label_sizes, not_repeated):
    seqsize, batch_sz, _ = log_odds.shape
    label_size = blanked_labels.shape[1]

    def step(t, b_tp1, log_odds_,
             seq_sizes_, blanked_labels_, label_sizes_, not_repeated_):
        y_t = log_odds_[t]

        # increase b_{T+1}(|l'|) from 0 to 1 to initialize recursion
        starter_t = T.eq(t, seq_sizes_ - 1)[:, None] \
            * T.eq((2 * label_sizes_)[:, None],
                   T.arange(label_size)[None, :])
        b_tp1_2lp1 = b_tp1[T.arange(batch_sz), 2 * label_sizes_]
        b_tp1 = T.set_subtensor(
            b_tp1_2lp1,
            T.switch(T.eq(t, seq_sizes_ - 1), 0, b_tp1_2lp1))
        b_tp1 = T.switch(starter_t, 0, b_tp1)  # initialize recursion

        b_t = b_tp1
        b_t = T.set_subtensor(
            b_t[:, :-1],
            logaddexp(b_t[:, :-1], b_tp1[:, 1:]))
        b_t = T.set_subtensor(
            b_t[:, :-2],
            logaddexp(b_t[:, :-2], T.switch(not_repeated_, b_tp1[:, 2:], -2e9)))
        b_t += y_t[T.arange(batch_sz)[:, None], blanked_labels_]
        b_t = T.switch(isneginf(b_t), -2e9, b_t)

        return b_t

    beta_init = -2e9 * T.ones((batch_sz, label_size))

    betas, _ = theano.scan(
        fn=step,
        sequences=[T.arange(seqsize)],
        outputs_info=beta_init,
        non_sequences=[log_odds, seq_sizes, blanked_labels, label_sizes,
                       not_repeated],
        go_backwards=True,
        name="ctc_backward")

    betas = betas[::-1, :, :]

    return betas

# Theano Op -------------------------------------------------------------------
def ctc_perform_graph(linout, seq_sizes, labels, label_sizes, blank):
    _, batch_size, voca_size = linout.shape

    logits = log_softmax(linout)
    blanked_labels = insert_alternating_blanks(labels, blank)
    not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2])
    betas = ctc_backward(logits, seq_sizes,
                         blanked_labels, label_sizes, not_repeated)
    loss = - logaddexp(betas[0, :, 0], betas[0, :, 1])

    # alphas = ctc_forward(logits, seq_sizes,
    #                      blanked_labels, label_sizes, not_repeated)
    # loss = - logaddexp(
    #     alphas[seq_sizes - 1, T.arange(batch_size), 2 * label_sizes - 1],
    #     alphas[seq_sizes - 1, T.arange(batch_size), 2 * label_sizes])

    return logits, blanked_labels, not_repeated, betas, loss

def ctc_grad_graph(inputs, output_gradients):
    linout, seq_durations, labels, label_sizes, _ = inputs
    seq_size, batch_size, voca_size = linout.shape
    label_size = labels.shape[1]

    logits, blanked_labels, not_repeated, betas, loss = \
        ctc_perform_graph(*inputs)
    alphas = ctc_forward(logits, seq_durations,
                         blanked_labels, label_sizes, not_repeated)

    # log(sum_{s \in lab(l, k)} a_t(s) b_t(s))
    def fwbw_sum_step(k, s, labels_, ab_):
        s_view = s[:, T.arange(batch_size), labels_[:, k]]
        ab_view = ab_[:, :, k]
        next_sum = logaddexp(s_view, ab_view)
        s = T.set_subtensor(s_view, next_sum)
        return s

    ab = alphas + betas
    fwbw_sum = theano.scan(
        fn=fwbw_sum_step,
        sequences=[T.arange(2 * label_size + 1)],
        outputs_info=-2e9 * T.ones((seq_size, batch_size, voca_size)),
        non_sequences=[blanked_labels, ab],
        strict=True,
        name="fwbw_sum")[0][-1]

    A = loss[None, :, None] + logits \
        + logsumexp(fwbw_sum - logits, axis=2, keepdims=True)
    B = loss[None, :, None] + fwbw_sum - logits
    dloss_dy = T.exp(A) - T.exp(B)
    dloss_dy = T.switch(T.all(isneginf(fwbw_sum), axis=2, keepdims=True),
                        0, dloss_dy)

    return [dloss_dy * output_gradients[0][None, :, None],
            theano.gradient.disconnected_type(),
            theano.gradient.disconnected_type(),
            theano.gradient.disconnected_type(),
            theano.gradient.disconnected_type()]

def make_ctc_op():
    preds_var = T.tensor3()
    seq_durations_var = T.ivector()
    labels_var = T.imatrix()
    label_sizes_var = T.ivector()
    blank_var = T.iscalar()

    _, _, _, _, loss = ctc_perform_graph(
        preds_var, seq_durations_var, labels_var,
        label_sizes_var, blank_var)

    return theano.OpFromGraph(
        inputs=[preds_var, seq_durations_var,
                labels_var, label_sizes_var, blank_var],
        outputs=[loss],
        grad_overrides=ctc_grad_graph,
        inline=True, name="ctcLossOp")


CTCLossOp = make_ctc_op()

# -----------------------------------------------------------------------------
def ctc_loss(linout, durations, labels, label_sizes, blank=-1):
    r"""Compute the Connectionist Temporal Classification loss [#graves2006]_.

    .. math:: L = - \ln\left( \sum_{\pi \in \mathcal{B}^{-1}(l)} P(\pi | y) \right)

    where :math:`y` is the sequence of predictions, :math:`l` the target
    label sequence without blanks or repetitions, :math:`\pi` is taken from
    the ensemble of possible label assignments over the observations and
    :math:`\mathcal{B}` is a function that removes blanks and repetitions
    from a sequence of labels.

    Parameters
    ----------
    linout : Theano shared variable, expression or numpy array
        The input values for the softmax function with shape
        duration x batch_size x nclasses.
    durations : Theano shared variable, expression or numpy array
        An _integer_ vector of size batch_size containing the actual length
        of each sequence in linout.
    labels : Theano shared variable, expression or numpy array
        An _integer_ matrix of size batch_size x label_size containing the
        target labels.
    label_sizes : Theano shared variable, expression or numpy array
        An _integer_ vector of size batch_size containing the actual length
        of each sequence in labels.
    blank :
        The blank label class, by default the last index.

    Returns
    -------
    Theano tensor
        A vector expression with the CTC loss of each sequence.

    Reference
    ---------
    .. [#graves2006] Graves, A., Fernández, S., Gomez, F., & Schmidhuber, J.
       (2006, June). Connectionist temporal classification: labelling
       unsegmented sequence data with recurrent neural networks. In
       Proceedings of the 23rd international conference on Machine learning
       (pp. 369-376). ACM. ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf
    """
    linout = T.as_tensor_variable(linout)
    durations = T.as_tensor_variable(durations)
    labels = T.as_tensor_variable(labels)
    label_sizes = T.as_tensor_variable(label_sizes)
    blank = T.cast(T.as_tensor_variable(blank), 'int32')

    if not (linout.dtype in continuous_dtypes and linout.ndim == 3):
        raise ValueError("linout must be continuous with 3 dimensions")
    if not (durations.dtype in discrete_dtypes and durations.ndim == 1):
        raise ValueError("durations must be an integer vector")
    if not (labels.dtype in discrete_dtypes and labels.ndim == 2):
        raise ValueError("labels must be an integer matrix")
    if not (label_sizes.dtype in discrete_dtypes and label_sizes.ndim == 1):
        raise ValueError("label_sizes must be an integer vector")
    if not (blank.dtype in discrete_dtypes and blank.ndim == 0):
        raise ValueError("blank must be an integer value")

    voca_size = T.cast(linout.shape[2], 'int32')
    labels = labels % voca_size
    blank = blank % voca_size

    return CTCLossOp(linout, durations, labels, label_sizes, blank)
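For reference, a short usage sketch, not part of the original gist: it wires `ctc_loss` into a compiled function following the shapes documented in the docstring above (pre-softmax scores of shape duration x batch_size x nclasses). Variable names are illustrative only.

# Illustrative usage sketch only (assumes the module above is importable as `ctc`).
import theano
import theano.tensor as T
from ctc import ctc_loss

linout = T.tensor3()       # pre-softmax scores: duration x batch_size x nclasses
durations = T.ivector()    # actual length of each input sequence
labels = T.imatrix()       # padded target labels: batch_size x label_size
label_sizes = T.ivector()  # actual length of each label sequence

loss = ctc_loss(linout, durations, labels, label_sizes, blank=-1)
grad = theano.grad(loss.sum(), wrt=linout)
fn = theano.function([linout, durations, labels, label_sizes], [loss, grad])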
import unittest
import numpy as np
import theano
import theano.tensor as T
from theano.tests import unittest_tools
from ctc import ctc_loss, ctc_forward, ctc_backward, insert_alternating_blanks, \
    isneginf, log_softmax

class TestCTC(unittest.TestCase):
    def setUp(self):
        unittest_tools.seed_rng()

    def test_forward_backward(self):
        batch_size = 6
        label_size = 7
        voca_size = 5
        seq_size = 10

        label_lengths = np.random.randint(0, label_size,
                                          size=(batch_size,), dtype=np.int32)
        label_lengths[0] = label_size  # extremum case
        label_lengths[1] = 0  # extremum case
        labels = np.array(
            [np.random.randint(0, voca_size - 1, size=label_size, dtype=np.int32)
             for _ in range(batch_size)])
        for i in range(batch_size):
            labels[i, label_lengths[i]:] = -1
        seq_durations = np.array([
            np.random.randint(max(1, label_lengths[i]), seq_size)
            for i in range(batch_size)], dtype=np.int32)

        linear_out = np.random.randn(seq_size, batch_size, voca_size) \
            .astype(np.float32)

        blank_class = -1
        blank_class = np.mod(blank_class, voca_size)
        labels = np.mod(labels, voca_size)

        log_odds = log_softmax(linear_out)
        blanked_labels = insert_alternating_blanks(T.mod(labels, voca_size),
                                                   blank_class)
        not_repeated = T.neq(blanked_labels[:, 2:], blanked_labels[:, :-2])
        alphas = ctc_forward(log_odds, seq_durations,
                             blanked_labels, label_lengths, not_repeated)
        betas = ctc_backward(log_odds, seq_durations,
                             blanked_labels, label_lengths, not_repeated)

        preds = log_softmax(linear_out)
        y_blanks = preds[:, T.arange(batch_size)[:, None], blanked_labels]
        p_l = T.sum(T.exp(alphas + betas - y_blanks), axis=2)

        alphas = alphas.eval()
        betas = betas.eval()
        preds = preds.eval()

        for i in range(batch_size):
            assert np.allclose(alphas[0, i, 0], preds[0, i, -1])
            if label_lengths[i] > 0:
                assert np.allclose(alphas[0, i, 1], preds[0, i, labels[i, 0]])
            else:
                assert isneginf(alphas[0, i, 1])
            assert np.all(isneginf(alphas[0, i, 2:]))

        for i in range(batch_size):
            t = seq_durations[i] - 1
            l = label_lengths[i]
            assert np.allclose(betas[t, i, 2 * l], preds[t, i, -1])
            if l > 0:
                assert np.allclose(betas[t, i, 2 * l - 1],
                                   preds[t, i, labels[i, l - 1]])
                assert np.all(isneginf(betas[t, i, :max(l - 2, 0)]))
            else:
                assert np.all(isneginf(betas[t, i, 1:]))

        p_l = p_l.eval()
        for i in range(batch_size):
            assert (np.allclose(p_l[:seq_durations[i], i], p_l[0, i]))
            a, b = max(0, 2 * label_lengths[i] - 1), 2 * label_lengths[i] + 1
            p_li = np.exp(alphas[seq_durations[i] - 1, i, a:b]).sum()
            assert np.allclose(p_li, p_l[0, i])
            p_li = np.exp(betas[0, i, :2]).sum()
            assert np.allclose(p_li, p_l[0, i])

    def test_simple_precomputed(self):
        # Test obtained from Torch tutorial at:
        # https://github.com/baidu-research/warp-ctc/blob/master/torch_binding/TUTORIAL.md
        linear_out = np.asarray([
            [[0, 0, 0, 0, 0], [1, 2, 3, 4, 5], [-5, -4, -3, -2, -1]],
            [[0, 0, 0, 0, 0], [6, 7, 8, 9, 10], [-10, -9, -8, -7, -6]],
            [[0, 0, 0, 0, 0], [11, 12, 13, 14, 15], [-15, -14, -13, -12, -11]]
        ], dtype=np.float32)

        seq_sizes = np.asarray([1, 3, 3], dtype=np.int32)
        labels = np.asarray([[1, 0], [3, 3], [2, 3]], dtype=np.int32)
        label_sizes = np.asarray([1, 2, 2], dtype=np.int32)
        expected_losses = np.asarray([1.609437943, 7.355742931, 4.938849926],
                                     dtype=np.float32)
        blank = 0

        expected_grad = np.asarray([
            [[0.2, -0.8, 0.2, 0.2, 0.2],
             [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627],
             [-0.02115798369, 0.03168492019, -0.8810571432, 0.2341216654, 0.636408627]],
            [[0, 0, 0, 0, 0],
             [-0.9883437753, 0.03168492019, 0.08612854034, 0.2341216654, 0.636408627],
             [-0.02115798369, 0.03168492019, -0.1891518533, -0.4577836394, 0.636408627]],
            [[0, 0, 0, 0, 0],
             [0.01165623125, 0.03168492019, 0.08612854034, -0.7658783197, 0.636408627],
             [-0.02115798369, 0.03168492019, 0.08612854034, -0.7330639958, 0.636408627]]
        ], dtype=np.float32)

        linear_out_var = T.as_tensor_variable(linear_out)
        losses = ctc_loss(
            linear_out_var, seq_sizes, labels, label_sizes, blank)
        assert np.allclose(losses.eval(), expected_losses, atol=1)

        grad = theano.grad(losses.sum(), wrt=linear_out_var)
        assert np.allclose(grad.eval(), expected_grad, rtol=.001, atol=1)

    def test_random(self):
        batch_size = 16
        label_size = 5
        voca_size = 4
        seq_size = 20

        label_sizes = np.random.randint(
            0, label_size, size=(batch_size,), dtype=np.int32)
        label_sizes[0] = label_size
        label_sizes[1] = 0
        label_sizes[2] = 5
        label_sizes[3] = 5

        labels = np.random.randint(
            0, voca_size - 1,
            size=(batch_size, label_size), dtype=np.int32)
        labels[3] = 0

        seq_sizes = np.array([
            np.random.randint(max(1, label_sizes[i]), seq_size)
            for i in range(batch_size)], dtype=np.int32)
        seq_sizes[2] = 4

        linear_out = np.random.randn(
            seq_size, batch_size, voca_size).astype(np.float32)

        # check edge cases
        # TODO

        # check the gradient can be computed at all
        linear_out_var = T.tensor3()
        preds = T.nnet.softmax(
            linear_out_var.reshape((-1, voca_size))
        ).reshape((seq_size, batch_size, voca_size))
        g = theano.grad(ctc_loss(preds, seq_sizes,
                                 labels, label_sizes).sum(),
                        wrt=linear_out_var).eval(
            {linear_out_var: linear_out.astype(np.float32)})
        assert not np.any(np.isnan(g))

        # check correctness against finite difference approximation
        def f(linear_out_):
            preds_ = T.nnet.softmax(
                linear_out_.reshape((-1, voca_size))
            ).reshape((seq_size, batch_size, voca_size))
            loss = ctc_loss(preds_, seq_sizes, labels, label_sizes)
            # prevent finite differences from failing
            loss = T.switch(isneginf(-loss), 0, loss)
            return loss

        unittest_tools.verify_grad(f, [linear_out], abs_tol=0.05, rel_tol=0.05)