Skip to content

Instantly share code, notes, and snippets.

@sisp
Created October 22, 2013 13:15
Show Gist options
  • Save sisp/7100561 to your computer and use it in GitHub Desktop.
ERROR (theano.gof.opt): Optimization failure due to: remove_constants_and_unused_inputs_scan ERROR (theano.gof.opt): TRACEBACK: ERROR (theano.gof.opt): Traceback (most recent call last): File "/home/sigurd/.local/lib/python2.7/site-packages/Theano-0.6.0rc3-py2.7.egg/theano/gof/opt.py", line 1216, in process_node replacements = lopt.transform(nod…
import numpy as np
import theano
import theano.tensor as T
floatX = theano.config.floatX
class GaussNewtonMatrix(object):
    """Gauss-Newton curvature-matrix/vector products for Hessian-Free optimization.

    Holds the *linear* network output (pre-activation) and, on call,
    computes G*v via the R-operator / L-operator trick.
    """

    def __init__(self, s):
        # `s` must be the linear network output, i.e. the output BEFORE
        # the final activation function is applied.
        self._s = s

    def __call__(self, v, cost, parameters, damp):
        """Return the Gauss-Newton matrix right-multiplied by `v`, with
        Tikhonov damping `damp` added (G*v + damp*v), as a list matching
        `parameters`.
        """
        # J*v: directional derivative of the linear output along `v`.
        jac_vec = T.Rop(self._s, parameters, v)
        # H*(J*v): Hessian of the cost w.r.t. the linear output, applied
        # to J*v. `consider_constant` stops gradients flowing into J*v.
        hess_jac_vec = T.grad(
            T.sum(T.grad(cost, self._s) * jac_vec),
            self._s,
            consider_constant=[jac_vec],
        )
        # J^T*(H*J*v): pull the product back into parameter space.
        gauss_newton_vec = T.grad(
            T.sum(hess_jac_vec * self._s),
            parameters,
            consider_constant=[hess_jac_vec, jac_vec],
        )
        # Tikhonov damping: add damp * v component-wise.
        return [gv + damp * vi for gv, vi in zip(gauss_newton_vec, v)]
def run(num_features, num_timesteps, batch_size=1):
    """Build a tiny RNN in Theano, compute a cloned cost and Gauss-Newton
    matrix-vector products, compile and execute the resulting function.

    Parameters
    ----------
    num_features : int
        Dimensionality of each input frame.
    num_timesteps : int
        Sequence length consumed by `theano.scan`.
    batch_size : int, optional
        When 1, inputs/targets are matrices; otherwise tensor3s with a
        batch axis in the middle (time, batch, features).
    """
    # Determine shapes of inputs and targets depending on the batch size.
    if batch_size == 1:
        inputs_size = (num_timesteps, num_features)
        targets_size = (num_timesteps, 1)
    else:
        inputs_size = (num_timesteps, batch_size, num_features)
        targets_size = (num_timesteps, batch_size, 1)
    # Make inputs and targets shared variables (random data is fine here;
    # this script only reproduces a graph-optimization issue).
    inputs = theano.shared(np.random.uniform(size=inputs_size).astype(floatX), borrow=True)
    targets = theano.shared(np.random.uniform(size=targets_size).astype(floatX), borrow=True)
    # Create symbolic inputs and targets variables.
    x = T.matrix('inputs') if batch_size == 1 else T.tensor3('inputs')
    # BUG FIX: the tensor3 branch was mistakenly named 'inputs'
    # (copy-paste from the line above); the targets variable is now
    # consistently named 'targets' in both branches.
    t = T.matrix('targets') if batch_size == 1 else T.tensor3('targets')
    # Create a set of parameters for a simple RNN (10 hidden units).
    W_xh = theano.shared(0.01 * np.random.uniform(size=(num_features, 10)).astype(floatX), borrow=True)
    W_hh = theano.shared(0.01 * np.random.uniform(size=(10, 10)).astype(floatX), borrow=True)
    W_hy = theano.shared(0.01 * np.random.uniform(size=(10, 1)).astype(floatX), borrow=True)
    b_h = theano.shared(np.zeros(10).astype(floatX), borrow=True)
    b_y = theano.shared(np.zeros(1).astype(floatX), borrow=True)
    params = [W_xh, W_hh, W_hy, b_h, b_y]

    # Recurrent transition: plain tanh RNN step.
    def step(x_t, h_tm1):
        h = T.tanh(T.dot(h_tm1, W_hh) + T.dot(x_t, W_xh) + b_h)
        return h

    # Build recurrent graph; the initial hidden state matches the batching.
    h_0 = T.alloc(0.0, 10) if batch_size == 1 else T.alloc(0.0, batch_size, 10)
    h, updates = theano.scan(step,
                             sequences=[x],
                             outputs_info=[h_0])
    # Network output (linear readout).
    y = T.dot(h, W_hy) + b_y
    # Create Gauss-Newton-Matrix object. Not really of any use here, but I
    # need it for Hessian-Free optimization.
    gn = GaussNewtonMatrix(y)
    # Compute MSE.
    cost = ((t - y)**2).sum(axis=1).mean()
    # Compute the cost at some other point in the parameter space. Not really
    # of any use here, but this is how I do it during certain iterations of CG
    # in the HF algorithm. There, it's in fact `pi + current update proposal`.
    # For simplicity, I just multiply by 2 here.
    # ! NOTE: If you comment out the next line and drop `cost_` from the
    # compiled function's outputs below, it works for both cases. !
    cost_ = theano.clone(cost, replace=dict([(pi, 2*pi) for pi in params]))
    # Compute Gauss-Newton-Matrix times some vector `v` which is `p` in CG,
    # but for simplicity, I just take the parameters vector because it's
    # already there.
    Gv = gn(v=params, cost=cost, parameters=params, damp=T.constant(1.0))
    # Compile Theano function, substituting the shared data for the
    # symbolic inputs/targets via `givens`.
    f = theano.function([], [cost_] + Gv, givens={x: inputs, t: targets})
    # Execute.
    f()
if __name__ == '__main__':
    # batch_size > 1 represents the data as tensor3 objects — this case
    # runs fine.
    run(100, 10, batch_size=5)
    # batch_size == 1 represents the data as matrix objects — this case
    # triggers:
    # ERROR (theano.gof.opt): Optimization failure due to:
    #     remove_constants_and_unused_inputs_scan
    run(100, 10, batch_size=1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment