@takatakamanbou
Last active August 29, 2015 14:14
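### ------------------------------------------------------------------
### convnet0207.py -- convolution + max-pooling layer built on Theano
### (filename inferred from "import convnet0207 as convnet" below)
### ------------------------------------------------------------------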
import numpy as np
import theano
import theano.tensor as T
import theano.tensor.signal.downsample as Tsd

import nnet0207 as nnet


class Layer():

    # afunc: activation function (see nnet)
    # Xdim:  dimension of the input image ( Xrow, Xcol )
    # Xnch:  number of the input channels
    # Wdim:  dimension of the convolution filters ( Wrow, Wcol )
    # Wnch:  number of the filter channels
    # ds:    downsampling scale for max-pooling ( ds_vertical, ds_horizontal )
    # Wini_range: parameter for weight initialization (see nnet)
    #
    def __init__( self, afunc, Xdim, Xnch, Wdim, Wnch, ds, Wini_range ):

        # parameters of the input
        Xrow, Xcol = Xdim
        Xshape = ( Xnch, Xrow, Xcol )
        self.Xshape = Xshape

        # parameters of the convolution layer
        Wrow, Wcol = Wdim
        Wshape = ( Wnch, Xnch, Wrow, Wcol )
        self.Wshape = Wshape
        self.ds = ds
        Yrow, Ycol = Xrow - Wrow + 1, Xcol - Wcol + 1
        Yshape = ( Wnch, Yrow, Ycol )
        self.Yshape = Yshape

        # parameters of the pooling layer
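        # max_pool_2d is called below with its default ignore_border
        # (False in this Theano version, presumably), so the pooled map
        # has ceil( Yrow / ds[0] ) x ceil( Ycol / ds[1] ) cells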
        Zrow = int( np.ceil( float( Yrow ) / ds[0] ) )
        Zcol = int( np.ceil( float( Ycol ) / ds[1] ) )
        Zshape = ( Wnch, Zrow, Zcol )
        self.Zshape = Zshape
        self.Dout = Wnch * Zrow * Zcol

        # theano shared variables
        self.W = theano.shared( nnet.random( Wshape, Wini_range ) )
        self.dW = theano.shared( np.zeros( Wshape ) )

        # activation function of the layer
        self.afunc = nnet.d_afunc[afunc]

    def output( self, X ):

        # X: Ndat x Xnch x Xrow x Xcol
        Xshape = ( None, self.Xshape[0], self.Xshape[1], self.Xshape[2] )
        Wshape = self.Wshape
        Yconv = T.nnet.conv.conv2d( X, self.W, image_shape = Xshape, filter_shape = Wshape )  # Ndat x Wnch x Yrow x Ycol
        Ypool = Tsd.max_pool_2d( Yconv, self.ds )  # Ndat x Wnch x Zrow x Zcol
        if self.afunc == 'linear':
            Z = Ypool
        else:
            Z = self.afunc( Ypool )

        return Z
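### ------------------------------------------------------------------
### convnet_2layer0207.py -- one conv/pool layer + softmax output layer
### (filename inferred from "import convnet_2layer0207" in the MNIST
###  script below)
### ------------------------------------------------------------------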
import numpy as np
import theano
import theano.tensor as T

import nnet0207 as nnet
import convnet0207 as convnet


class MLP():

    # Xdim: ( Xrow, Xcol )   W1dim: ( W1row, W1col )
    # ds1:  ( ds_v, ds_h ) downscale factor
    #
    def __init__( self, Xdim, Xnch, W1dim, W1nch, ds1, K ):

        # layers
        self.L1 = convnet.Layer( 'linear', Xdim, Xnch, W1dim, W1nch, ds1, 0.1 )
        #self.L1 = convnet.Layer( 'ReLu', Xdim, Xnch, W1dim, W1nch, ds1, 0.1 )
        self.H = self.L1.Dout
        self.L2 = nnet.Layer( 'softmax', self.H, K, 0.1 )

        # theano functions
        self.output = self._Tfunc_output()
        self.cost = self._Tfunc_cost()
        self.train = self._Tfunc_train()

    ### theano function for output computation
    #
    def _Tfunc_output( self ):
        X = T.tensor4()  # Ndat x Xnch x Xrow x Xcol
        Y, Z = _T_output( self.L1, self.L2, X )
        return theano.function( [ X ], [ Y, Z ] )

    ### theano function for cost computation
    #
    def _Tfunc_cost( self ):
        Z = T.dmatrix()  # N x K
        t = T.dmatrix()  # N x K
        cost = _T_cost( Z, t )
        return theano.function( [ Z, t ], cost )

    ### theano function for gradient descent learning
    #
    def _Tfunc_train( self ):
        W1, dW1 = self.L1.W, self.L1.dW
        W2, dW2, b2, db2 = self.L2.W, self.L2.dW, self.L2.b, self.L2.db
        X = T.tensor4( 'X' )
        t = T.dmatrix( 't' )  # N x K
        eta = T.dscalar( 'eta' )
        mu = T.dscalar( 'mu' )
        Y2, Z2 = _T_output( self.L1, self.L2, X )
        cost = T.mean( _T_cost( Z2, t ) )
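        # gradient descent with momentum:
        #   dW_new = -eta * dcost/dW + mu * dW_old,   W_new = W + dW_new
        # (the other *_Tfunc_train methods in this gist use the same update)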
        gradW1, gradW2, gradb2 = T.grad( cost, [ W1, W2, b2 ] )
        dW1_new = -eta * gradW1 + mu * dW1
        dW2_new = -eta * gradW2 + mu * dW2
        db2_new = -eta * gradb2 + mu * db2
        W1_new = W1 + dW1_new
        W2_new = W2 + dW2_new
        b2_new = b2 + db2_new
        updatesList = [
            ( W1, W1_new ), ( dW1, dW1_new ),
            ( W2, W2_new ), ( b2, b2_new ), ( dW2, dW2_new ), ( db2, db2_new ) ]
        return theano.function( [ X, t, eta, mu ], cost, updates = updatesList )


def _T_output( L1, L2, X ):
    Z1 = L1.output( X )
    Y2, Z2 = L2.output( Z1.reshape( ( Z1.shape[0], -1 ) ) )
    return Y2, Z2


def _T_cost( Z, t ):
    return T.nnet.categorical_crossentropy( Z, t )
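### ------------------------------------------------------------------
### convnet_3layer0207.py -- two conv/pool layers + softmax output layer
### (filename inferred from "import convnet_3layer0207" in the MNIST
###  script below)
### ------------------------------------------------------------------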
import numpy as np
import theano
import theano.tensor as T

import nnet0207 as nnet
import convnet0207 as convnet


class MLP():

    # Xdim:  ( Xrow, Xcol )
    # W1dim: ( Wrow, Wcol )   ds1: ( ds_v, ds_h ) downscale factor
    # W2dim: ( Wrow, Wcol )   ds2: ( ds_v, ds_h ) downscale factor
    #
    def __init__( self, Xdim, Xnch, W1dim, W1nch, ds1, W2dim, W2nch, ds2, K ):

        # layers
        self.L1 = convnet.Layer( 'linear', Xdim, Xnch, W1dim, W1nch, ds1, 0.1 )
        Z1nch = self.L1.Zshape[0]
        Z1dim = self.L1.Zshape[1:]
        self.H1 = self.L1.Dout
        self.L2 = convnet.Layer( 'linear', Z1dim, Z1nch, W2dim, W2nch, ds2, 0.1 )
        self.H2 = self.L2.Dout
        self.L3 = nnet.Layer( 'softmax', self.H2, K, 0.1 )

        # theano functions
        self.output = self._Tfunc_output()
        self.cost = self._Tfunc_cost()
        self.train = self._Tfunc_train()

    ### theano function for output computation
    #
    def _Tfunc_output( self ):
        X = T.tensor4()  # Ndat x Xnch x Xrow x Xcol
        Y, Z = _T_output( self.L1, self.L2, self.L3, X )
        return theano.function( [ X ], [ Y, Z ] )

    ### theano function for cost computation
    #
    def _Tfunc_cost( self ):
        Z = T.dmatrix()  # N x K
        t = T.dmatrix()  # N x K
        cost = _T_cost( Z, t )
        return theano.function( [ Z, t ], cost )

    ### theano function for gradient descent learning
    #
    def _Tfunc_train( self ):
        W1, dW1 = self.L1.W, self.L1.dW
        W2, dW2 = self.L2.W, self.L2.dW
        W3, dW3, b3, db3 = self.L3.W, self.L3.dW, self.L3.b, self.L3.db
        X = T.tensor4( 'X' )
        t = T.dmatrix( 't' )  # N x K
        eta = T.dscalar( 'eta' )
        mu = T.dscalar( 'mu' )
        Y3, Z3 = _T_output( self.L1, self.L2, self.L3, X )
        cost = T.mean( _T_cost( Z3, t ) )
        gradW1, gradW2, gradW3, gradb3 = T.grad( cost, [ W1, W2, W3, b3 ] )
        dW1_new = -eta * gradW1 + mu * dW1
        dW2_new = -eta * gradW2 + mu * dW2
        dW3_new = -eta * gradW3 + mu * dW3
        db3_new = -eta * gradb3 + mu * db3
        W1_new = W1 + dW1_new
        W2_new = W2 + dW2_new
        W3_new = W3 + dW3_new
        b3_new = b3 + db3_new
        updatesList = [
            ( W1, W1_new ), ( dW1, dW1_new ),
            ( W2, W2_new ), ( dW2, dW2_new ),
            ( W3, W3_new ), ( b3, b3_new ), ( dW3, dW3_new ), ( db3, db3_new ) ]
        return theano.function( [ X, t, eta, mu ], cost, updates = updatesList )


def _T_output( L1, L2, L3, X ):
    Z1 = L1.output( X )
    Z2 = L2.output( Z1 )
    Y3, Z3 = L3.output( Z2.reshape( ( Z2.shape[0], -1 ) ) )
    return Y3, Z3


def _T_cost( Z, t ):
    return T.nnet.categorical_crossentropy( Z, t )
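### ------------------------------------------------------------------
### mlp_2layer0207.py -- fully-connected ReLu hidden layer + softmax
### (filename inferred from "import mlp_2layer0207" in the MNIST
###  script below)
### ------------------------------------------------------------------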
import numpy as np
import theano
import theano.tensor as T

import nnet0207 as nnet


class MLP():

    def __init__( self, D, H, K ):

        # layers
        self.L1 = nnet.Layer( 'ReLu', D, H, 0.1 )
        self.L2 = nnet.Layer( 'softmax', H, K, 0.1 )

        # theano functions
        self.output = self._Tfunc_output()
        self.cost = self._Tfunc_cost()
        self.train = self._Tfunc_train()

    ### theano function for output computation
    #
    def _Tfunc_output( self ):
        X = T.dmatrix()  # N x D
        Y, Z = _T_output( self.L1, self.L2, X )
        return theano.function( [ X ], [ Y, Z ] )

    ### theano function for cost computation
    #
    def _Tfunc_cost( self ):
        Z = T.dmatrix()  # N x K
        t = T.dmatrix()  # N x K
        cost = _T_cost( Z, t )
        return theano.function( [ Z, t ], cost )

    ### theano function for gradient descent learning
    #
    def _Tfunc_train( self ):
        W1, dW1, b1, db1 = self.L1.W, self.L1.dW, self.L1.b, self.L1.db
        W2, dW2, b2, db2 = self.L2.W, self.L2.dW, self.L2.b, self.L2.db
        X = T.dmatrix( 'X' )  # N x D
        t = T.dmatrix( 't' )  # N x K
        eta = T.dscalar( 'eta' )
        mu = T.dscalar( 'mu' )
        Y2, Z2 = _T_output( self.L1, self.L2, X )
        cost = T.mean( _T_cost( Z2, t ) )
        gradW1, gradb1, gradW2, gradb2 = T.grad( cost, [ W1, b1, W2, b2 ] )
        dW1_new = -eta * gradW1 + mu * dW1
        db1_new = -eta * gradb1 + mu * db1
        dW2_new = -eta * gradW2 + mu * dW2
        db2_new = -eta * gradb2 + mu * db2
        W1_new = W1 + dW1_new
        b1_new = b1 + db1_new
        W2_new = W2 + dW2_new
        b2_new = b2 + db2_new
        updatesList = [
            ( W1, W1_new ), ( b1, b1_new ), ( dW1, dW1_new ), ( db1, db1_new ),
            ( W2, W2_new ), ( b2, b2_new ), ( dW2, dW2_new ), ( db2, db2_new ) ]
        return theano.function( [ X, t, eta, mu ], cost, updates = updatesList )


def _T_output( L1, L2, X ):
    Y1, Z1 = L1.output( X )
    Y2, Z2 = L2.output( Z1 )
    return Y2, Z2


def _T_cost( Z, t ):
    return T.nnet.categorical_crossentropy( Z, t )
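### ------------------------------------------------------------------
### mlp_3layer0207.py -- two fully-connected ReLu hidden layers + softmax
### (filename inferred from "import mlp_3layer0207" in the MNIST
###  script below)
### ------------------------------------------------------------------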
import numpy as np
import theano
import theano.tensor as T

import nnet0207 as nnet


class MLP():

    def __init__( self, D, H1, H2, K ):

        # layers
        self.L1 = nnet.Layer( 'ReLu', D, H1, 0.1 )
        self.L2 = nnet.Layer( 'ReLu', H1, H2, 0.1 )
        self.L3 = nnet.Layer( 'softmax', H2, K, 0.1 )

        # theano functions
        self.output = self._Tfunc_output()
        self.cost = self._Tfunc_cost()
        self.train = self._Tfunc_train()

    ### theano function for output computation
    #
    def _Tfunc_output( self ):
        X = T.dmatrix()  # N x D
        Y, Z = _T_output( self.L1, self.L2, self.L3, X )
        return theano.function( [ X ], [ Y, Z ] )

    ### theano function for cost computation
    #
    def _Tfunc_cost( self ):
        Z = T.dmatrix()  # N x K
        t = T.dmatrix()  # N x K
        cost = _T_cost( Z, t )
        return theano.function( [ Z, t ], cost )

    ### theano function for gradient descent learning
    #
    def _Tfunc_train( self ):
        W1, dW1, b1, db1 = self.L1.W, self.L1.dW, self.L1.b, self.L1.db
        W2, dW2, b2, db2 = self.L2.W, self.L2.dW, self.L2.b, self.L2.db
        W3, dW3, b3, db3 = self.L3.W, self.L3.dW, self.L3.b, self.L3.db
        X = T.dmatrix( 'X' )  # N x D
        t = T.dmatrix( 't' )  # N x K
        eta = T.dscalar( 'eta' )
        mu = T.dscalar( 'mu' )
        Y3, Z3 = _T_output( self.L1, self.L2, self.L3, X )
        cost = T.mean( _T_cost( Z3, t ) )
        grad = T.grad( cost, [ W1, b1, W2, b2, W3, b3 ] )
        gradW1, gradb1, gradW2, gradb2, gradW3, gradb3 = grad
        dW1_new = -eta * gradW1 + mu * dW1
        db1_new = -eta * gradb1 + mu * db1
        dW2_new = -eta * gradW2 + mu * dW2
        db2_new = -eta * gradb2 + mu * db2
        dW3_new = -eta * gradW3 + mu * dW3
        db3_new = -eta * gradb3 + mu * db3
        W1_new = W1 + dW1_new
        b1_new = b1 + db1_new
        W2_new = W2 + dW2_new
        b2_new = b2 + db2_new
        W3_new = W3 + dW3_new
        b3_new = b3 + db3_new
        updatesList = [
            ( W1, W1_new ), ( b1, b1_new ), ( dW1, dW1_new ), ( db1, db1_new ),
            ( W2, W2_new ), ( b2, b2_new ), ( dW2, dW2_new ), ( db2, db2_new ),
            ( W3, W3_new ), ( b3, b3_new ), ( dW3, dW3_new ), ( db3, db3_new ),
        ]
        return theano.function( [ X, t, eta, mu ], cost, updates = updatesList )


def _T_output( L1, L2, L3, X ):
    Y1, Z1 = L1.output( X )
    Y2, Z2 = L2.output( Z1 )
    Y3, Z3 = L3.output( Z2 )
    return Y3, Z3


def _T_cost( Z, t ):
    return T.nnet.categorical_crossentropy( Z, t )
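### ------------------------------------------------------------------
### MNIST training script for the convnets above (original filename
### not shown in this gist view)
### ------------------------------------------------------------------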
import numpy as np
import scipy as sp

import mnist0117 as mnist
import convnet_2layer0207 as convnet_2layer
import convnet_3layer0207 as convnet_3layer


def gendat( LT ):

    mn = mnist.MNIST( LT )
    label = mn.getLabel()
    N = label.shape[0]
    K = 10
    tmp = mn.getImage() / 255.0  # => in [0,1]
    X = tmp.reshape( ( tmp.shape[0], 1, tmp.shape[1], tmp.shape[2] ) )
    t = np.zeros( ( N, K ), dtype = bool )
    for ik in range( K ):
        t[label == ik, ik] = True

    return X, label, t


def errorrate( mlp, X, t, label ):

    Y, Z = mlp.output( X )
    mnLL = np.mean( mlp.cost( Z, t ) )
    er = np.mean( label != np.argmax( Z, axis = 1 ) )

    return mnLL, er


if __name__ == "__main__":

    np.random.seed( 0 )

    ##### setting the training data & the validation data
    #
    X, label, t = gendat( 'L' )
    XL, labelL, tL = X[:50000], label[:50000], t[:50000]
    XV, labelV, tV = X[50000:], label[50000:], t[50000:]
    NL, Xnch, Xrow, Xcol = XL.shape
    NV, Xnch, Xrow, Xcol = XV.shape
    K = t.shape[1]
    Xdim = ( Xrow, Xcol )

    ##### mini-batch indices for stochastic gradient descent
    #
    idx = np.random.permutation( NL )
    batchsize = 100
    nbatch = NL / batchsize
    assert( NL % batchsize == 0 )
    idxB = np.zeros( ( nbatch, NL ), dtype = bool )
    for ib in range( nbatch ):
        idxB[ib, idx.reshape( ( nbatch, batchsize ) )[ib, :]] = True

    ##### training
    #
    W1dim, W1nch, ds1 = ( 5, 5 ), 16, ( 4, 4 )
    W2dim, W2nch, ds2 = None, None, None
    #W2dim, W2nch, ds2 = ( 5, 5 ), 16, ( 4, 4 )
    eta, mu = 0.05, 0.8
    nepoch = 50

    if W2dim is None:
        mlp = convnet_2layer.MLP( Xdim, Xnch, W1dim, W1nch, ds1, K )
        print '### 2-layer convnet'
        print '# Xdim:', Xdim, ' Xnch:', Xnch, ' W1dim:', W1dim, ' W1nch:', W1nch, ' ds1:', ds1, ' H:', mlp.H
    else:
        mlp = convnet_3layer.MLP( Xdim, Xnch, W1dim, W1nch, ds1, W2dim, W2nch, ds2, K )
        print '### 3-layer convnet'
        print '# Xdim:', Xdim, ' Xnch:', Xnch, ' W1dim:', W1dim, ' W1nch:', W1nch, ' ds1:', ds1, ' H1:', mlp.H1
        print '# W2dim:', W2dim, ' W2nch:', W2nch, ' ds2:', ds2, ' H2:', mlp.H2

    print '### training: NL = ', NL, ' NV = ', NV, ' K = ', K, ' batchsize = ', batchsize

    for i in range( nepoch ):
        # printing error rates etc.
        if i % 10 == 0:
            mnLLL, erL = errorrate( mlp, XL, tL, labelL )
            mnLLV, erV = errorrate( mlp, XV, tV, labelV )
            print i, mnLLL, erL * 100, erV * 100
        # training (selecting each batch in random order)
        for ib in np.random.permutation( nbatch ):
            ii = idxB[ib, :]
            mlp.train( XL[ii], tL[ii], eta, mu )

    i = nepoch
    mnLLL, erL = errorrate( mlp, XL, tL, labelL )
    mnLLV, erV = errorrate( mlp, XV, tV, labelV )
    print i, mnLLL, erL * 100, erV * 100

    ##### setting the test data
    #
    XT, labelT, tT = gendat( 'T' )
    NT, Nstack, Xrow, Xcol = XT.shape
    print '# NT = ', NT
    mnLLT, erT = errorrate( mlp, XT, tT, labelT )
    print i, mnLLT, erL * 100, erV * 100, erT * 100
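### ------------------------------------------------------------------
### MNIST training script for the fully-connected MLPs above (original
### filename not shown in this gist view)
### ------------------------------------------------------------------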
import numpy as np
import scipy as sp

import mnist0117 as mnist


def gendat( LT ):

    mn = mnist.MNIST( LT )
    label = mn.getLabel()
    N = label.shape[0]
    K = 10
    X = mn.getImage().reshape( ( N, -1 ) ) / 255.0  # => in [0,1]
    t = np.zeros( ( N, K ), dtype = bool )
    for ik in range( K ):
        t[label == ik, ik] = True

    return X, label, t


def errorrate( mlp, X, t, label ):

    Y, Z = mlp.output( X )
    mnLL = np.mean( mlp.cost( Z, t ) )
    er = np.mean( label != np.argmax( Z, axis = 1 ) )

    return mnLL, er


if __name__ == "__main__":

    import mlp_2layer0207 as mlp_2layer
    import mlp_3layer0207 as mlp_3layer

    np.random.seed( 0 )

    ##### setting the training data & the validation data
    #
    X, label, t = gendat( 'L' )
    XL, labelL, tL = X[:50000], label[:50000], t[:50000]
    XV, labelV, tV = X[50000:], label[50000:], t[50000:]
    NL, D = XL.shape
    NV, D = XV.shape
    K = t.shape[1]

    ##### mini-batch indices for stochastic gradient descent
    #
    idx = np.random.permutation( NL )
    batchsize = 1000
    nbatch = NL / batchsize
    assert( NL % batchsize == 0 )
    idxB = np.zeros( ( nbatch, NL ), dtype = bool )
    for ib in range( nbatch ):
        idxB[ib, idx.reshape( ( nbatch, batchsize ) )[ib, :]] = True

    ##### training
    #
    #H1, H2 = 500, 0
    H1, H2 = 500, 1000
    #H1, H2 = 1000, 500
    eta = 0.5
    mu = 0.8
    nepoch = 20

    if H2 <= 0:
        mlp = mlp_2layer.MLP( D, H1, K )
        print '### 2-layer MLP: D = ', D, ' H = ', H1, ' K = ', K
    else:
        mlp = mlp_3layer.MLP( D, H1, H2, K )
        print '### 3-layer MLP: D = ', D, ' H1 = ', H1, ' H2 = ', H2, ' K = ', K

    print '### training: NL = ', NL, ' NV = ', NV, ' D = ', D, ' K = ', K, ' batchsize = ', batchsize

    for i in range( nepoch ):
        # printing error rates etc.
        if i % 10 == 0:
            mnLLL, erL = errorrate( mlp, XL, tL, labelL )
            mnLLV, erV = errorrate( mlp, XV, tV, labelV )
            print i, mnLLL, erL * 100, erV * 100
        # training (selecting each batch in random order)
        for ib in np.random.permutation( nbatch ):
            ii = idxB[ib, :]
            mlp.train( XL[ii], tL[ii], eta, mu )

    i = nepoch
    mnLLL, erL = errorrate( mlp, XL, tL, labelL )
    mnLLV, erV = errorrate( mlp, XV, tV, labelV )
    print i, mnLLL, erL * 100, erV * 100

    ##### setting the test data
    #
    XT, labelT, tT = gendat( 'T' )
    NT, D = XT.shape
    print '# NT = ', NT
    mnLLT, erT = errorrate( mlp, XT, tT, labelT )
    print i, mnLLT, erL * 100, erV * 100, erT * 100
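### ------------------------------------------------------------------
### nnet0207.py -- fully-connected layer, activation functions, and
### weight initialization
### (filename inferred from "import nnet0207 as nnet" above)
### ------------------------------------------------------------------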
import numpy as np
import theano
import theano.tensor as T


d_afunc = { 'linear':  'linear',
            'sigmoid': T.nnet.sigmoid,
            'softmax': T.nnet.softmax,
            'ReLu':    lambda Y: T.switch( Y > 0, Y, 0 ) }


class Layer():

    def __init__( self, afunc, Din, Nunit, Wini_range ):

        self.Din = Din
        self.Nunit = Nunit

        # theano shared variables for weights & biases
        self.W = theano.shared( random( ( Nunit, Din ), Wini_range ) )
        self.b = theano.shared( random( Nunit, Wini_range ) )
        self.dW = theano.shared( np.zeros( ( Nunit, Din ) ) )
        self.db = theano.shared( np.zeros( Nunit ) )

        # activation function of the layer
        self.afunc = d_afunc[afunc]

    def output( self, X ):

        Y = T.dot( X, self.W.T ) + self.b  # Ndat x Nunit
        if self.afunc == 'linear':
            Z = Y
        else:
            Z = self.afunc( Y )

        return Y, Z


### random numbers for weight initialization
#
def random( shape, r ):
    # [ -r/2, r/2 )
    return r * ( np.random.random_sample( shape ) - 0.5 )


### Rectified Linear activation function
#
def relu( Y ):
    return T.switch( Y > 0, Y, 0 )