@f0k · Last active June 18, 2018
PCEN layer for Lasagne
# -*- coding: utf-8 -*-
"""
PCEN Layer for Lasagne.

Author: Jan Schlüter, OFAI
"""
import numpy as np
import theano
import theano.tensor as T
import lasagne


class PCENLayer(lasagne.layers.Layer):
    """
    Trainable PCEN (Per-Channel Energy Normalization) layer:

    .. math::
        Y = (\\frac{X}{(\\epsilon + M)^\\alpha} + \\delta)^r - \\delta^r

        M_t = (1 - s) M_{t - 1} + s X_t

    Assumes spectrogram input of shape ``(batchsize, channels, time, bands)``.
    Implements an automatic gain control through the division by :math:`M`, an
    IIR filter estimating the local magnitude, followed by root compression.
    As proposed in [1]_, all parameters are trainable, and learned separately
    per frequency band. In contrast to the paper, the smoother :math:`M` is
    learned by backpropagating through the recurrence relation to tune
    :math:`s`, not by mixing a set of predefined smoothers.

    Parameters
    ----------
    incoming : a :class:`Layer` instance or a tuple
        The layer feeding into this layer, or the expected input shape.
        Expects a 4-dimensional input shape of
        ``(batchsize, channels, time, bands)``.
    log_s : Theano shared variable, expression, numpy array, callable or None
        Initial value, expression or initializer for the logarithm of
        :math:`s` per frequency band (i.e., a parameter vector).
    log_alpha : Theano shared variable, expression, numpy array, callable or None
        Initial value, expression or initializer for the logarithm of
        :math:`\\alpha` per frequency band (i.e., a parameter vector).
    log_delta : Theano shared variable, expression, numpy array, callable or None
        Initial value, expression or initializer for the logarithm of
        :math:`\\delta` per frequency band (i.e., a parameter vector).
    log_r : Theano shared variable, expression, numpy array, callable or None
        Initial value, expression or initializer for the logarithm of
        :math:`r` per frequency band (i.e., a parameter vector).
    eps : scalar
        Small constant :math:`\\epsilon` added to the smoother :math:`M`
        before dividing by it, to avoid numerical problems.
    init_smoother_from_data : bool (default: True)
        Whether to initialize the smoother :math:`M` from the first input
        frame (the default, recommended) rather than from zero.
    **kwargs
        Any additional keyword arguments are passed to the :class:`Layer`
        superclass.

    Notes
    -----
    To enforce full independence of input scale, fix :math:`\\alpha = 1` by
    passing ``log_alpha=theano.tensor.constant(0).dimshuffle('x')``.

    References
    ----------
    .. [1] Y. Wang, P. Getreuer, T. Hughes, R. F. Lyon, R. A. Saurous (2016):
       Trainable Frontend For Robust and Far-Field Keyword Spotting.
       https://arxiv.org/abs/1607.05666
    """
    def __init__(self, incoming,
                 log_s=lasagne.init.Constant(np.log(0.025)),
                 log_alpha=lasagne.init.Constant(0),
                 log_delta=lasagne.init.Constant(0),
                 log_r=lasagne.init.Constant(0),
                 eps=1e-6, init_smoother_from_data=True, **kwargs):
        super(PCENLayer, self).__init__(incoming, **kwargs)
        num_bands = self.input_shape[-1]
        self.log_s = self.add_param(log_s, shape=(num_bands,),
                                    name='log_s', regularizable=False)
        self.log_alpha = self.add_param(log_alpha, shape=(num_bands,),
                                        name='log_alpha', regularizable=False)
        self.log_delta = self.add_param(log_delta, shape=(num_bands,),
                                        name='log_delta', regularizable=False)
        self.log_r = self.add_param(log_r, shape=(num_bands,),
                                    name='log_r', regularizable=False)
        self.eps = eps
        self.init_smoother_from_data = init_smoother_from_data

    def get_output_for(self, input, **kwargs):
        def smooth_step(current_in, previous_out, s):
            one = T.constant(1)
            return [(one - s) * previous_out + s * current_in]

        init = input[:, :, 0]  # initialize the filter with the first frame
        if not self.init_smoother_from_data:
            init = T.zeros_like(init)  # initialize with zeros instead
        s = T.exp(self.log_s).dimshuffle('x', 'x', 0)
        smoother = theano.scan(fn=smooth_step,
                               sequences=[input.transpose(2, 0, 1, 3)],
                               non_sequences=[s],
                               outputs_info=[init],
                               strict=True)[0].transpose(1, 2, 0, 3)
        alpha = T.exp(self.log_alpha)
        delta = T.exp(self.log_delta)
        r = T.exp(self.log_r)
        # stable reformulation due to Vincent Lostanlen; original formula was:
        # return (input / (self.eps + smoother)**alpha + delta)**r - delta**r
        smoother = T.exp(-alpha * (T.log(self.eps) +
                                   T.log1p(smoother / self.eps)))
        return (input * smoother + delta)**r - delta**r
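
For orientation, here is a minimal usage sketch (not part of the gist): it assumes a mel spectrogram input of shape (batchsize, 1, frames, bands) with 80 bands, and the surrounding layers and variable names are illustrative only.

spect = T.tensor4('spectrogram')  # hypothetical input: (batchsize, 1, frames, bands)
net = lasagne.layers.InputLayer((None, 1, None, 80), input_var=spect)
net = PCENLayer(net)  # trainable per-band PCEN frontend
net = lasagne.layers.Conv2DLayer(net, num_filters=32, filter_size=3)
output = lasagne.layers.get_output(net)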
@lostanlen

Hi @f0k, it seems that you're not ensuring that s remains below 1 (i.e., log_s < 0), which may cause numerical difficulties.
One workaround is to parametrize in terms of the log of the time constant associated with s.
You can recover s with this formula:
s = (np.sqrt(1 + 4 * t_frames**2) - 1) / (2 * t_frames**2)
where t_frames is measured in discrete "hops". A good initialization for log(t_frames) is 3.0, which gives t_frames = e^3 ≈ 20, i.e. s ≈ 0.05.
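
A minimal sketch of how this reparameterization could look in the layer above (the name log_t_frames and its initializer are illustrative, not part of the gist):

self.log_t_frames = self.add_param(lasagne.init.Constant(3.0), shape=(num_bands,),
                                   name='log_t_frames', regularizable=False)
...
t_frames = T.exp(self.log_t_frames)
# s = (sqrt(1 + 4 T^2) - 1) / (2 T^2) lies in (0, 1) for any t_frames > 0
s = ((T.sqrt(1 + 4 * t_frames**2) - 1)
     / (2 * t_frames**2)).dimshuffle('x', 'x', 0)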

@f0k (Author) commented Jun 18, 2018

it seems that you're not ensuring that s remains below 1 (i.e., log_s < 0), which may cause numerical difficulties.

Yes, indeed, we don't guarantee it remains within (0,1). However, in my experiments, s stayed comfortably far below 1 during training (for my ISMIR 2018 paper, I initialized log_s to log(0.025) and it learned s between 0.02 and 0.2). Did you observe a different behavior?

Thanks for your suggested workaround. A simpler solution might be to parameterize s in terms of a sigmoid:

self.logit_s = self.add_param(...)
...
s = T.nnet.sigmoid(self.logit_s).dimshuffle('x', 'x', 0)
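
To keep the previous default of s = 0.025 at initialization, one could (illustratively) set the new parameter to the corresponding logit:

logit_s=lasagne.init.Constant(np.log(0.025 / (1 - 0.025)))  # logit(0.025) ≈ -3.66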
