Gist takatakamanbou/e1ca2b82632d0f7877e6: Theano implementations of a simple CNN and MLP for MNIST (Python 2).

########## convnet0211.py : convolution / pooling / full-connection layers and the CNN class ##########

import numpy as np
import theano
import theano.tensor as T
import theano.tensor.signal.downsample as Tsd

import nnet0211 as nnet

########## Convolution Layer ##########

class ConvLayer( object ):

    def __init__( self, Xdim, Wdim, afunc, withBias, Wini = 0.01 ):

        # dimension of the input
        Xnch, Xrow, Xcol = Xdim
        self.Xshape = Xdim

        # dimension of the convolution filters
        Wnch, Wrow, Wcol = Wdim
        self.Wshape = ( Wnch, Xnch, Wrow, Wcol )

        # dimension of the output
        Yrow, Ycol = Xrow - Wrow + 1, Xcol - Wcol + 1
        self.Yshape = ( Wnch, Yrow, Ycol )
        self.Dout = Wnch * Yrow * Ycol

        # activation function of the layer
        self.afunc = nnet.d_afunc[afunc]
        self.withBias = withBias

        # theano shared variables
        self.W = theano.shared( nnet.randomN( self.Wshape, Wini ) )
        self.dW = theano.shared( np.zeros( self.Wshape ) )
        if withBias:
            self.b = theano.shared( np.zeros( Wnch ) )
            self.db = theano.shared( np.zeros( Wnch ) )

    def output( self, X ):

        # X: Ndat x Xshape,  Y: Ndat x Yshape
        Xs = ( None, self.Xshape[0], self.Xshape[1], self.Xshape[2] )
        Ws = self.Wshape
        Y = T.nnet.conv.conv2d( X, self.W, image_shape = Xs, filter_shape = Ws )
        if self.withBias:
            b = self.b.dimshuffle( 'x', 0, 'x', 'x' )  # 1 x nch x 1 x 1
            Y += b
        Z = self.afunc( Y )

        return Y, Z  # Ndat x Yshape
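
# a worked shape example (illustration, assuming the MNIST-sized 1 x 28 x 28
# input used by the accompanying training script): ConvLayer does a "valid"
# convolution, so Yrow = Xrow - Wrow + 1 and Ycol = Xcol - Wcol + 1.
#
#   conv = ConvLayer( ( 1, 28, 28 ), ( 16, 5, 5 ), 'linear', withBias = False )
#   conv.Wshape   # ( 16, 1, 5, 5 )
#   conv.Yshape   # ( 16, 24, 24 )
#   conv.Dout     # 16 * 24 * 24 = 9216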

########## Pooling Layer ##########

class PoolLayer( object ):

    def __init__( self, Xdim, ds, afunc, withBias ):

        # dimension of the input
        Xnch, Xrow, Xcol = Xdim
        self.Xshape = Xdim

        # parameters of the pooling layer
        self.ds = ds

        # assuming ignore_border = False
        Yrow = int( np.ceil( float( Xrow ) / ds[0] ) )
        Ycol = int( np.ceil( float( Xcol ) / ds[1] ) )
        self.Yshape = ( Xnch, Yrow, Ycol )
        self.Dout = Xnch * Yrow * Ycol

        # activation function of the layer
        self.afunc = nnet.d_afunc[afunc]
        self.withBias = withBias

        # theano shared variables
        if withBias:
            self.b = theano.shared( np.zeros( Xnch ) )
            self.db = theano.shared( np.zeros( Xnch ) )

    def output( self, X ):

        # X: Ndat x Xshape
        Y = Tsd.max_pool_2d( X, self.ds )  # Ndat x Yshape
        if self.withBias:
            b = self.b.dimshuffle( 'x', 0, 'x', 'x' )  # 1 x nch x 1 x 1
            Y += b
        Z = self.afunc( Y )

        return Y, Z
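
# a worked shape example (illustration): with ignore_border = False the pooled
# size is rounded up, so a 16 x 24 x 24 input with ds = ( 4, 4 ) gives
# Yshape = ( 16, 6, 6 ) and Dout = 576, while a 16 x 2 x 2 input with the same
# ds still gives Yshape = ( 16, 1, 1 ).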

########## Full-Connection Layer ##########

class FullLayer( nnet.Layer ):

    def __init__( self, Din, Nunit, afunc, withBias = True, Wini = 0.01,
                  T4toMat = False ):

        super( FullLayer, self ).__init__( Din, Nunit, afunc, withBias, Wini )
        self.T4toMat = T4toMat

    def super_output( self, X ):

        return super( FullLayer, self ).output( X )

    def output( self, X ):

        if self.T4toMat:
            return self.super_output( X.reshape( ( X.shape[0], -1 ) ) )
        else:
            return self.super_output( X )
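
# note: with T4toMat = True the 4-d output of a ConvLayer / PoolLayer
# ( Ndat x nch x row x col ) is flattened to ( Ndat, nch * row * col ) before
# the ordinary fully-connected computation, so Din of this layer should equal
# the Dout of the preceding layer.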

########## Convolutional Neural Net ##########

class CNN( object ):

    def __init__( self, Layers ):

        # layers - list of Layer instances
        self.Layers = Layers

        # theano functions
        self.output = self._Tfunc_output()
        self.cost = self._Tfunc_cost()
        self.train = self._Tfunc_train()

    ### theano function for output computation
    #
    def _Tfunc_output( self ):

        X = T.tensor4()  # Ndat x Xnch x Xrow x Xcol
        Y, Z = nnet._T_output( self.Layers, X )

        return theano.function( [ X ], [ Y, Z ] )

    ### theano function for cost computation
    #
    def _Tfunc_cost( self ):

        Z = T.dmatrix()  # N x K
        t = T.dmatrix()  # N x K
        cost = nnet._T_cost( Z, t )

        return theano.function( [ Z, t ], cost )

    ### theano function for gradient descent learning
    #
    def _Tfunc_train( self ):

        X = T.tensor4( 'X' )
        t = T.dmatrix( 't' )
        eta = T.dscalar( 'eta' )
        mu = T.dscalar( 'mu' )
        lam = T.dscalar( 'lambda' )

        Y, Z = nnet._T_output( self.Layers, X )
        cost = T.mean( nnet._T_cost( Z, t ) )

        updatesList = []
        for layer in self.Layers:

            # PoolLayer doesn't have W & dW
            if not isinstance( layer, PoolLayer ):
                gradW = T.grad( cost, layer.W )
                #dWnew = -eta * gradW + mu * layer.dW
                dWnew = -eta * ( gradW + lam * layer.W ) + mu * layer.dW
                Wnew = layer.W + dWnew
                updatesList.append( ( layer.W, Wnew ) )
                updatesList.append( ( layer.dW, dWnew ) )

            if layer.withBias:
                gradb = T.grad( cost, layer.b )
                # no weight decay for bias
                dbnew = -eta * gradb + mu * layer.db
                bnew = layer.b + dbnew
                updatesList.append( ( layer.b, bnew ) )
                updatesList.append( ( layer.db, dbnew ) )

        return theano.function( [ X, t, eta, mu, lam ], cost, updates = updatesList )
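
# a minimal usage sketch (illustration only; it assumes the MNIST-sized
# 1 x 28 x 28 inputs used by the accompanying training script) showing that a
# small Conv-Pool-Softmax net compiles and runs on random data:
if __name__ == '__main__':

    np.random.seed( 0 )

    L1 = ConvLayer( ( 1, 28, 28 ), ( 16, 5, 5 ), 'linear', withBias = False )
    L2 = PoolLayer( L1.Yshape, ( 4, 4 ), 'ReLu', withBias = True )
    L3 = FullLayer( L2.Dout, 10, 'softmax', withBias = True, T4toMat = True )
    cnn = CNN( [ L1, L2, L3 ] )

    X = np.random.random_sample( ( 5, 1, 28, 28 ) )            # 5 random "images"
    t = np.zeros( ( 5, 10 ), dtype = bool )
    t[np.arange( 5 ), np.random.randint( 0, 10, 5 )] = True    # random 1-of-K targets

    Y, Z = cnn.output( X )
    print Z.shape                             # ( 5, 10 )
    print cnn.train( X, t, 0.01, 0.9, 0.0 )   # mean cross-entropy of this batch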

########## training script: CNN on MNIST (uses convnet0211 and mnist0117) ##########

import numpy as np
import scipy as sp

import mnist0117 as mnist
import convnet0211 as convnet

def gendat( LT ):

    mn = mnist.MNIST( LT )
    label = mn.getLabel()
    N = label.shape[0]
    K = 10

    # pixel values scaled to [0,1] (float literal avoids Python 2 integer division)
    tmp = mn.getImage() / 255.0
    X = tmp.reshape( ( tmp.shape[0], 1, tmp.shape[1], tmp.shape[2] ) )

    t = np.zeros( ( N, K ), dtype = bool )
    for ik in range( K ):
        t[label == ik, ik] = True

    return X, label, t
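
# e.g. for MNIST, gendat( 'L' ) returns X of shape ( N, 1, 28, 28 ) with values
# in [0,1], the integer labels, and a boolean 1-of-K target matrix t in which
# a sample with label 3 has t[i] == [ F F F T F F F F F F ].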

def errorrate( mlp, X, t, label ):

    Y, Z = mlp.output( X )
    mnLL = np.mean( mlp.cost( Z, t ) )
    er = np.mean( label != np.argmax( Z, axis = 1 ) )

    return mnLL, er

# Conv-Pool-Softmax
def CPS( Xnch, Xrow, Xcol, K ):

    Xdim = ( Xnch, Xrow, Xcol )
    W1dim = ( 16, 5, 5 )
    ds1 = ( 4, 4 )

    L1conv = convnet.ConvLayer( Xdim, W1dim, 'linear', withBias = False )
    #L1pool = convnet.PoolLayer( L1conv.Yshape, ds1, 'linear', withBias = False )
    L1pool = convnet.PoolLayer( L1conv.Yshape, ds1, 'ReLu', withBias = True )
    H1 = L1pool.Dout
    L2 = convnet.FullLayer( H1, K, 'softmax', withBias = True, T4toMat = True )
    cnn = convnet.CNN( [ L1conv, L1pool, L2 ] )

    print '### Conv-Pool-Softmax Xdim:', Xdim
    print '# W1dim:', W1dim, ' ds1:', ds1, ' H1:', H1

    return cnn
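
# shape trace for CPS with a 1 x 28 x 28 MNIST input (illustration):
#   conv ( 16, 5, 5 ) : 1 x 28 x 28   ->  16 x 24 x 24
#   pool ( 4, 4 )     : 16 x 24 x 24  ->  16 x 6 x 6    ( H1 = 576 )
#   full / softmax    : 576           ->  K = 10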

# Conv-Pool-Conv-Pool-Softmax
def CPCPS( Xnch, Xrow, Xcol, K ):

    Xdim = ( Xnch, Xrow, Xcol )
    W1dim = ( 16, 5, 5 )
    ds1 = ( 4, 4 )
    #ds1 = ( 2, 2 )
    W2dim = ( 16, 5, 5 )
    ds2 = ( 4, 4 )
    #ds2 = ( 2, 2 )

    L1conv = convnet.ConvLayer( Xdim, W1dim, 'linear', withBias = False )
    #L1pool = convnet.PoolLayer( L1conv.Yshape, ds1, 'linear', withBias = False )
    L1pool = convnet.PoolLayer( L1conv.Yshape, ds1, 'ReLu', withBias = True )
    H1 = L1pool.Dout
    L2conv = convnet.ConvLayer( L1pool.Yshape, W2dim, 'linear', withBias = False )
    #L2pool = convnet.PoolLayer( L2conv.Yshape, ds2, 'linear', withBias = False )
    L2pool = convnet.PoolLayer( L2conv.Yshape, ds2, 'ReLu', withBias = True )
    H2 = L2pool.Dout
    L3 = convnet.FullLayer( H2, K, 'softmax', withBias = True, T4toMat = True )
    cnn = convnet.CNN( [ L1conv, L1pool, L2conv, L2pool, L3 ] )

    print '### Conv-Pool-Conv-Pool-Softmax Xdim:', Xdim
    print '# W1dim:', W1dim, ' ds1:', ds1, ' H1:', H1
    print '# W2dim:', W2dim, ' ds2:', ds2, ' H2:', H2

    return cnn
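
# shape trace for CPCPS with a 1 x 28 x 28 MNIST input (illustration):
#   conv ( 16, 5, 5 ) : 1 x 28 x 28   ->  16 x 24 x 24
#   pool ( 4, 4 )     : 16 x 24 x 24  ->  16 x 6 x 6    ( H1 = 576 )
#   conv ( 16, 5, 5 ) : 16 x 6 x 6    ->  16 x 2 x 2
#   pool ( 4, 4 )     : 16 x 2 x 2    ->  16 x 1 x 1    ( H2 = 16, ceil rounding )
#   full / softmax    : 16            ->  K = 10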

# Conv-Pool-ReLu-Softmax
def CPRS( Xnch, Xrow, Xcol, K ):

    Xdim = ( Xnch, Xrow, Xcol )
    W1dim = ( 16, 5, 5 )
    #W1dim = ( 64, 5, 5 )
    ds1 = ( 4, 4 )

    L1conv = convnet.ConvLayer( Xdim, W1dim, 'linear', withBias = False )
    #L1pool = convnet.PoolLayer( L1conv.Yshape, ds1, 'linear', withBias = False )
    L1pool = convnet.PoolLayer( L1conv.Yshape, ds1, 'ReLu', withBias = True )
    H1 = L1pool.Dout
    H2 = 400
    L2 = convnet.FullLayer( H1, H2, 'ReLu', withBias = True, T4toMat = True )
    L3 = convnet.FullLayer( H2, K, 'softmax', withBias = True, T4toMat = False )
    cnn = convnet.CNN( [ L1conv, L1pool, L2, L3 ] )

    print '### Conv-Pool-ReLu-Softmax Xdim:', Xdim
    print '# W1dim:', W1dim, ' ds1:', ds1, ' H1:', H1
    print '# H2:', H2

    return cnn
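
# shape trace for CPRS with a 1 x 28 x 28 MNIST input (illustration):
#   conv ( 16, 5, 5 ) : 1 x 28 x 28   ->  16 x 24 x 24
#   pool ( 4, 4 )     : 16 x 24 x 24  ->  16 x 6 x 6    ( H1 = 576 )
#   full / ReLu       : 576           ->  H2 = 400
#   full / softmax    : 400           ->  K = 10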

if __name__ == "__main__":

    np.random.seed( 0 )

    ##### setting the training data & the validation data
    #
    X, label, t = gendat( 'L' )
    xm = np.mean( X, axis = 0 )
    X -= xm
    XL, labelL, tL = X[:50000], label[:50000], t[:50000]
    XV, labelV, tV = X[50000:], label[50000:], t[50000:]
    NL, Xnch, Xrow, Xcol = XL.shape
    NV, Xnch, Xrow, Xcol = XV.shape
    K = t.shape[1]
    Xdim = ( Xrow, Xcol )

    ##### mini batch indices for stochastic gradient descent
    #
    idx = np.random.permutation( NL )
    batchsize = 100
    nbatch = NL / batchsize
    assert( NL % batchsize == 0 )
    idxB = np.zeros( ( nbatch, NL ), dtype = bool )
    for ib in range( nbatch ):
        idxB[ib, idx.reshape( ( nbatch, batchsize ))[ib, :]] = True
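
    # e.g. with NL = 50000 and batchsize = 100, idxB is a 500 x 50000 boolean
    # matrix; row ib selects the 100 samples of mini-batch ib, so XL[idxB[ib, :]]
    # has shape ( 100, 1, 28, 28 ).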

    ##### training
    #
    #cnn = CPS( Xnch, Xrow, Xcol, K )
    cnn = CPCPS( Xnch, Xrow, Xcol, K )
    #cnn = CPRS( Xnch, Xrow, Xcol, K )

    eta, mu, lam = 0.05, 0.9, 0.0
    nepoch = 50

    print '### training: NL = ', NL, ' NV = ', NV, ' K = ', K, ' batchsize = ', batchsize

    for i in range( nepoch ):

        # printing error rates etc.
        if i % 10 == 0:
            mnLLL, erL = errorrate( cnn, XL, tL, labelL )
            mnLLV, erV = errorrate( cnn, XV, tV, labelV )
            print '%d %.4f %.2f %.4f %.2f' % ( i, mnLLL, erL * 100, mnLLV, erV * 100 )

        # training (selecting each batch in random order)
        for ib in np.random.permutation( nbatch ):
            ii = idxB[ib, :]
            cnn.train( XL[ii], tL[ii], eta, mu, lam )

    i = nepoch
    mnLLL, erL = errorrate( cnn, XL, tL, labelL )
    mnLLV, erV = errorrate( cnn, XV, tV, labelV )
    print '%d %.4f %.2f %.4f %.2f' % ( i, mnLLL, erL * 100, mnLLV, erV * 100 )

    ##### setting the test data
    #
    XT, labelT, tT = gendat( 'T' )
    XT -= xm
    NT, Nstack, Xrow, Xcol = XT.shape
    print '# NT = ', NT

    mnLLT, erT = errorrate( cnn, XT, tT, labelT )
    print '%d %.4f %.2f %.4f %.2f %.4f %.2f' % ( i, mnLLL, erL * 100, mnLLV, erV * 100, mnLLT, erT * 100 )

########## training script: MLP on MNIST (uses nnet0211 and mnist0117) ##########

import numpy as np
import scipy as sp

import mnist0117 as mnist
import nnet0211 as nnet

def gendat( LT ):

    mn = mnist.MNIST( LT )
    label = mn.getLabel()
    N = label.shape[0]
    K = 10

    # pixel values scaled to [0,1] (float literal avoids Python 2 integer division)
    X = mn.getImage().reshape( ( N, -1 ) ) / 255.0

    t = np.zeros( ( N, K ), dtype = bool )
    for ik in range( K ):
        t[label == ik, ik] = True

    return X, label, t

def errorrate( mlp, X, t, label ):

    Y, Z = mlp.output( X )
    mnLL = np.mean( mlp.cost( Z, t ) )
    er = np.mean( label != np.argmax( Z, axis = 1 ) )

    return mnLL, er

def MLP2( D, H1, K ):

    print '### 2-layer MLP: D =', D, ' H =', H1, ' K =', K
    L1 = nnet.Layer( D, H1, 'ReLu', withBias = True, Wini = 0.01 )
    L2 = nnet.Layer( H1, K, 'softmax', withBias = True, Wini = 0.01 )
    mlp = nnet.MLP( [ L1, L2 ] )

    return mlp


def MLP3( D, H1, H2, K ):

    print '### 3-layer MLP: D =', D, ' H1 =', H1, ' H2 =', H2, ' K =', K
    L1 = nnet.Layer( D, H1, 'ReLu', withBias = True, Wini = 0.01 )
    L2 = nnet.Layer( H1, H2, 'ReLu', withBias = True, Wini = 0.01 )
    L3 = nnet.Layer( H2, K, 'softmax', withBias = True, Wini = 0.01 )
    mlp = nnet.MLP( [ L1, L2, L3 ] )

    return mlp

if __name__ == "__main__":

    np.random.seed( 0 )

    ##### setting the training data & the validation data
    #
    X, label, t = gendat( 'L' )
    xm = np.mean( X, axis = 0 )
    X -= xm
    XL, labelL, tL = X[:50000], label[:50000], t[:50000]
    XV, labelV, tV = X[50000:], label[50000:], t[50000:]
    NL, D = XL.shape
    NV, D = XV.shape
    K = t.shape[1]

    ##### mini batch indices for stochastic gradient descent
    #
    idx = np.random.permutation( NL )
    batchsize = 100
    nbatch = NL / batchsize
    assert( NL % batchsize == 0 )
    idxB = np.zeros( ( nbatch, NL ), dtype = bool )
    for ib in range( nbatch ):
        idxB[ib, idx.reshape( ( nbatch, batchsize ))[ib, :]] = True

    ##### training
    #
    #mlp = MLP2( D, 500, K )
    #mlp = MLP3( D, 500, 1000, K )
    mlp = MLP3( D, 1000, 500, K )

    eta = 0.1
    mu = 0.9
    lam = 0.00001
    nepoch = 50

    print '### training: NL = ', NL, 'NV = ', NV, ' D = ', D, ' K = ', K, ' batchsize = ', batchsize

    for i in range( nepoch ):

        # printing error rates etc.
        if i % 10 == 0:
            mnLLL, erL = errorrate( mlp, XL, tL, labelL )
            mnLLV, erV = errorrate( mlp, XV, tV, labelV )
            print '%d | %.4f %.2f | %.4f %.2f' % ( i, mnLLL, erL * 100, mnLLV, erV * 100 )

        # training (selecting each batch in random order)
        for ib in np.random.permutation( nbatch ):
            ii = idxB[ib, :]
            mlp.train( XL[ii], tL[ii], eta, mu, lam )

    i = nepoch
    mnLLL, erL = errorrate( mlp, XL, tL, labelL )
    mnLLV, erV = errorrate( mlp, XV, tV, labelV )
    print '%d | %.4f %.2f | %.4f %.2f' % ( i, mnLLL, erL * 100, mnLLV, erV * 100 )

    ##### setting the test data
    #
    XT, labelT, tT = gendat( 'T' )
    XT -= xm
    NT, D = XT.shape
    print '# NT = ', NT

    mnLLT, erT = errorrate( mlp, XT, tT, labelT )
    print '%d | %.4f %.2f | %.4f %.2f | %.4f %.2f' % ( i, mnLLL, erL * 100, mnLLV, erV * 100, mnLLT, erT * 100 )

########## nnet0211.py : basic Layer / MLP classes used by the scripts above ##########

import numpy as np
import theano
import theano.tensor as T


# activation functions
d_afunc = { 'linear':  lambda Y: Y,
            'sigmoid': T.nnet.sigmoid,
            'softmax': T.nnet.softmax,
            'ReLu':    lambda Y: T.switch( Y > 0, Y, 0 ) }
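
# note: 'ReLu' is the rectified linear unit max( Y, 0 ), written here with
# T.switch; T.maximum( Y, 0 ) would be an equivalent formulation.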

### uniform random numbers for weight initialization
#
def randomU( shape, a ):

    # uniform in [ -a, a )
    return 2 * a * ( np.random.random_sample( shape ) - 0.5 )


### Gaussian random numbers for weight initialization
#
def randomN( shape, sig ):

    # N( 0, sig^2 ), i.e. sig is the standard deviation
    return sig * np.random.standard_normal( shape )

########## Layer ##########

class Layer( object ):

    def __init__( self, Din, Nunit, afunc, withBias = True, Wini = 0.01 ):

        self.Din = Din
        self.Nunit = Nunit
        self.afunc = d_afunc[afunc]
        self.withBias = withBias

        # theano shared variables for weights & biases
        self.W = theano.shared( randomN( ( Nunit, Din ), Wini ) )
        self.dW = theano.shared( np.zeros( ( Nunit, Din ) ) )
        if withBias:
            self.b = theano.shared( np.zeros( Nunit ) )
            self.db = theano.shared( np.zeros( Nunit ) )

    def output( self, X ):

        if self.withBias:
            Y = T.dot( X, self.W.T ) + self.b  # Ndat x Nunit
        else:
            Y = T.dot( X, self.W.T )
        Z = self.afunc( Y )

        return Y, Z
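
# a worked shape example (illustration): Layer( 784, 500, 'ReLu' ) holds W of
# shape ( 500, 784 ) and b of shape ( 500, ); for X of shape ( Ndat, 784 ),
# output() returns Y = X W^T + b and Z = ReLu( Y ), both of shape ( Ndat, 500 ).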

########## MLP ##########

class MLP( object ):

    def __init__( self, Layers ):

        # layers - list of Layer instances
        self.Layers = Layers

        # theano functions
        self.output = self._Tfunc_output()
        self.cost = self._Tfunc_cost()
        self.train = self._Tfunc_train()

    ### theano function for output computation
    #
    def _Tfunc_output( self ):

        X = T.dmatrix()  # N x D
        Y, Z = _T_output( self.Layers, X )

        return theano.function( [ X ], [ Y, Z ] )

    ### theano function for cost computation
    #
    def _Tfunc_cost( self ):

        Z = T.dmatrix()  # N x K
        t = T.dmatrix()  # N x K
        cost = _T_cost( Z, t )

        return theano.function( [ Z, t ], cost )

    ### theano function for gradient descent learning
    #
    def _Tfunc_train( self ):

        X = T.dmatrix( 'X' )  # N x D
        t = T.dmatrix( 't' )  # N x K
        eta = T.dscalar( 'eta' )
        mu = T.dscalar( 'mu' )
        lam = T.dscalar( 'lambda' )

        Y, Z = _T_output( self.Layers, X )
        cost = T.mean( _T_cost( Z, t ) )

        updatesList = []
        for layer in self.Layers:

            gradW = T.grad( cost, layer.W )
            #dWnew = -eta * gradW + mu * layer.dW
            dWnew = -eta * ( gradW + lam * layer.W ) + mu * layer.dW
            Wnew = layer.W + dWnew
            updatesList.append( ( layer.W, Wnew ) )
            updatesList.append( ( layer.dW, dWnew ) )

            if layer.withBias:
                gradb = T.grad( cost, layer.b )
                # no weight decay for bias
                dbnew = -eta * gradb + mu * layer.db
                bnew = layer.b + dbnew
                updatesList.append( ( layer.b, bnew ) )
                updatesList.append( ( layer.db, dbnew ) )

        return theano.function( [ X, t, eta, mu, lam ], cost, updates = updatesList )
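
# the update rule implemented in _Tfunc_train above (and in the CNN version) is
# gradient descent with momentum and weight decay, the decay being applied to
# W only:
#   dW_new = -eta * ( dcost/dW + lam * W ) + mu * dW_old ,  W_new = W + dW_new
#   db_new = -eta *   dcost/db             + mu * db_old ,  b_new = b + db_new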

def _T_output( Layers, X ):

    Zprev = X
    for layer in Layers:
        Y, Z = layer.output( Zprev )
        Zprev = Z

    return Y, Z


def _T_cost( Z, t ):

    return T.nnet.categorical_crossentropy( Z, t )
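
# _T_cost is the per-sample categorical cross-entropy: for a 1-of-K target t
# and softmax output Z it equals -sum_k t[k] * log Z[k], the negative
# log-probability of the correct class.  A NumPy sketch of the same quantity
# (illustration only):
#
#   def xent( Z, t ):
#       return -np.sum( t * np.log( Z ), axis = 1 )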