import numpy as np
import theano
import theano.tensor as T


# activation functions
d_afunc = { 'linear': lambda Y: Y,
            'sigmoid': T.nnet.sigmoid,
            'ReLu': lambda Y: T.switch( Y > 0, Y, 0 ) }


def randomstreams( seed ):

    return T.shared_randomstreams.RandomStreams( seed = seed )


########## input layer ##########

class InputLayer( object ):

    def __init__( self, D, dropout = 1.0 ):

        self.Din = D
        self.Nunit = D
        self.dropout = dropout


    def Top_output( self, X ):

        if self.dropout < 1.0:
            return X * self.dropout
        else:
            return X


    def Top_generateMask( self, rng ):

        return rng.uniform( ( self.Nunit, ) ) <= self.dropout


    def Top_outputMasked( self, X, mask ):

        return X * mask
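
# Note on the dropout convention used by InputLayer ( explanatory comment added here,
# not part of the original code ): during training, Top_generateMask draws a binary
# keep mask with keep probability `dropout` and Top_outputMasked multiplies the input
# by it; at test time, Top_output instead scales the input by the keep probability,
# so that expected activations match between training and test, as in the standard
# ( non-inverted ) dropout formulation.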


########## hidden layers ##########

class Layer( object ):

    def __init__( self, Din, Nunit, afunc, withBias = True, Wini = 0.01,
                  dropout = 1.0 ):

        self.Din = Din
        self.Nunit = Nunit
        self.afunc = afunc
        self.withBias = withBias
        self.dropout = dropout

        # making theano shared variables for weights & biases
        floatX = theano.config.floatX
        W = Wini * np.random.standard_normal( ( Nunit, Din ) )
        self.W = theano.shared( np.asarray( W, dtype = floatX ) )
        self.dW = theano.shared( np.zeros( ( Nunit, Din ), dtype = floatX ) )
        if self.withBias:
            self.b = theano.shared( np.zeros( Nunit, dtype = floatX ) )
            self.db = theano.shared( np.zeros( Nunit, dtype = floatX ) )

        # theano functions
        self.setWeight = self.Tfunc_setWeight()


    def Tfunc_setWeight( self ):

        W = T.matrix()
        if self.withBias:
            b = T.vector()
            inList = [ W, b ]
            upList = [ ( self.W, W ), ( self.b, b ) ]
        else:
            inList = [ W ]
            upList = [ ( self.W, W ) ]

        return theano.function( inList, None, updates = upList )


    def getWeight( self ):

        W = self.W.get_value()
        if self.withBias:
            b = self.b.get_value()
            return [ W, b ]
        else:
            return W


    def Top_outputRaw( self, X ):

        Y = T.dot( X, self.W.T )
        if self.withBias:
            Y += self.b
        Z = d_afunc[self.afunc]( Y )

        return Y, Z


    def Top_output( self, X ):

        Y, Z = self.Top_outputRaw( X )
        if self.dropout < 1.0:
            Z *= self.dropout

        return Y, Z


    def Top_generateMask( self, rng ):

        return rng.uniform( ( self.Nunit, ) ) <= self.dropout


    def Top_outputMasked( self, X, mask ):

        Y, Z = self.Top_outputRaw( X )

        return Y, Z * mask


    def T_update( self, cost, eta, mu, lam ):

        # gradient descent update with momentum ( mu ) and L2 weight decay ( lam )
        gradW = T.grad( cost, self.W )
        dWnew = -eta * ( gradW + lam * self.W ) + mu * self.dW
        Wnew = self.W + dWnew
        upList = [ ( self.W, Wnew ), ( self.dW, dWnew ) ]
        if self.withBias:
            gradb = T.grad( cost, self.b )
            # no weight decay for bias
            dbnew = -eta * gradb + mu * self.db
            bnew = self.b + dbnew
            upList += [ ( self.b, bnew ), ( self.db, dbnew ) ]

        return upList


    def T_updateMasked( self, cost, eta, mu, lam, mask ):

        # same update as T_update, but applied only to the rows of W ( and the
        # elements of b ) whose output unit survived dropout, as given by mask
        M = T.shape_padright( mask )
        gradW = T.grad( cost, self.W )
        dWnewOn = -eta * ( gradW + lam * self.W ) + mu * self.dW
        dWnew = T.switch( M, dWnewOn, self.dW )
        Wnew = T.switch( M, self.W + dWnew, self.W )
        upList = [ ( self.W, Wnew ), ( self.dW, dWnew ) ]

        if self.withBias:
            gradb = T.grad( cost, self.b )
            # no weight decay for bias
            dbnewOn = -eta * gradb + mu * self.db
            dbnew = T.switch( mask, dbnewOn, self.db )
            bnew = T.switch( mask, self.b + dbnew, self.b )
            upList += [ ( self.b, bnew ), ( self.db, dbnew ) ]

        return upList


    def T_updateMasked2( self, cost, eta, mu, lam, maskI, maskO ):

        # same update as T_update, but applied only to the weights whose input and
        # output units both survived dropout ( maskI and maskO respectively )
        M = T.outer( maskO, maskI )
        gradW = T.grad( cost, self.W )
        dWnewOn = -eta * ( gradW + lam * self.W ) + mu * self.dW
        dWnew = T.switch( M, dWnewOn, self.dW )
        Wnew = T.switch( M, self.W + dWnew, self.W )
        upList = [ ( self.W, Wnew ), ( self.dW, dWnew ) ]

        if self.withBias:
            gradb = T.grad( cost, self.b )
            # no weight decay for bias
            dbnewOn = -eta * gradb + mu * self.db
            dbnew = T.switch( maskO, dbnewOn, self.db )
            bnew = T.switch( maskO, self.b + dbnew, self.b )
            upList += [ ( self.b, bnew ), ( self.db, dbnew ) ]

        return upList


########## MLP ##########

class MLP( object ):

    def __init__( self, Layers, rng = None ):

        # layers - list of an InputLayer followed by Layer instances
        self.Layers = Layers
        assert isinstance( Layers[0], InputLayer )
        dropout = np.empty( len( Layers ) )
        for i in range( len( dropout ) ):
            dropout[i] = Layers[i].dropout
        self.withDropout = np.prod( dropout ) < 1.0

        # random number generator
        if rng is None:
            self.rng = randomstreams( 0 )
        else:
            self.rng = rng

        # theano functions
        self.output = self.Tfunc_output()
        self.cost = self.Tfunc_cost()
        self.train = self.Tfunc_train()


    # theano op for output computation ( for test )
    def Top_output( self, X ):

        # input layer
        layer = self.Layers[0]
        Zprev = layer.Top_output( X )

        # hidden layers
        for layer in self.Layers[1:]:
            Y, Z = layer.Top_output( Zprev )
            Zprev = Z

        # output
        Zsoftmax = T.nnet.softmax( Zprev )

        return Zsoftmax


    # theano function for output computation ( for test )
    def Tfunc_output( self ):

        X = T.matrix()    # N x D
        Z = self.Top_output( X )

        return theano.function( [ X ], Z )


    # theano op for cost computation ( error term )
    def Top_cost( self, Z, lab ):

        cost = T.nnet.categorical_crossentropy( Z, lab )

        return T.mean( cost )


    # theano function for cost computation
    def Tfunc_cost( self ):

        Z = T.matrix()      # N x K
        lab = T.ivector()   # N-dim

        return theano.function( [ Z, lab ], self.Top_cost( Z, lab ) )


    # theano function for gradient descent learning
    def Tfunc_train( self ):

        X = T.matrix( 'X' )          # N x D
        lab = T.ivector( 'lab' )     # N-dim
        eta = T.scalar( 'eta' )
        mu = T.scalar( 'mu' )
        lam = T.scalar( 'lambda' )

        '''
        if self.withDropout:
            maskList = []
        '''

        # input layer
        layer = self.Layers[0]
        if self.withDropout:
            mask = layer.Top_generateMask( self.rng )
            Zprev = layer.Top_outputMasked( X, mask )
            #maskList.append( mask )
        else:
            Zprev = layer.Top_output( X )

        # hidden layers
        for layer in self.Layers[1:]:
            if self.withDropout:
                mask = layer.Top_generateMask( self.rng )
                Y, Z = layer.Top_outputMasked( Zprev, mask )
                #maskList.append( mask )
            else:
                Y, Z = layer.Top_output( Zprev )
            Zprev = Z

        # output & cost
        Z = T.nnet.softmax( Zprev )
        cost = self.Top_cost( Z, lab )

        # updates list
        updatesList = []
        for i in range( len( self.Layers ) ):
            layer = self.Layers[i]
            if not isinstance( layer, InputLayer ):
                '''
                if self.withDropout:
                    maskI, maskO = maskList[i-1], maskList[i]
                    #updatesList += layer.T_updateMasked( cost, eta, mu, lam, maskO )
                    updatesList += layer.T_updateMasked2( cost, eta, mu, lam, maskI, maskO )
                else:
                    updatesList += layer.T_update( cost, eta, mu, lam )
                '''
                updatesList += layer.T_update( cost, eta, mu, lam )

        return theano.function( [ X, lab, eta, mu, lam ], [ Z, cost ], updates = updatesList )
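

# A minimal usage sketch ( illustrative addition, not part of the original module ):
# build an MLP from an InputLayer and two hidden Layer instances, run one training
# step on random dummy data, and evaluate the test-time output.  The dimensions and
# hyperparameters below are arbitrary placeholders.
if __name__ == '__main__':

    floatX = theano.config.floatX
    np.random.seed( 0 )

    N, D, K = 100, 20, 3                                   # samples, input dim, classes
    X = np.random.standard_normal( ( N, D ) ).astype( floatX )
    lab = np.random.randint( 0, K, N ).astype( np.int32 )  # ivector inputs expect int32

    layers = [ InputLayer( D ),
               Layer( D, 50, 'ReLu' ),
               Layer( 50, K, 'linear' ) ]                  # softmax is applied inside MLP
    mlp = MLP( layers )

    # one gradient descent step: learning rate, momentum, weight decay
    eta, mu, lam = 0.1, 0.9, 1e-4
    Z, cost = mlp.train( X, lab, eta, mu, lam )
    print( 'cost = %f' % cost )

    # test-time forward pass ( softmax outputs ) and accuracy
    Z = mlp.output( X )
    print( 'accuracy = %f' % np.mean( np.argmax( Z, axis = 1 ) == lab ) )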