Gist takatakamanbou/5d227d3f13edf74bcce9: MNIST classification with Theano (Python 2), covering logistic regression and 2-/3-layer MLPs trained by mini-batch gradient descent with momentum.

########## driver script: logistic regression on MNIST ##########
import numpy as np
import scipy as sp

import mnist0117 as mnist


def gendat( LT ):

    mn = mnist.MNIST( LT )
    label = mn.getLabel()
    N = label.shape[0]
    K = 10
    X = mn.getImage().reshape( ( N, -1 ) ) / 255.0   # scale pixel values to [0, 1] (255.0 forces float division)
    t = np.zeros( ( N, K ), dtype = bool )           # 1-of-K coded targets
    for ik in range( K ):
        t[label == ik, ik] = True

    return X, label, t
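# For example, with labels [2, 0, 1] and K = 10, the loop above sets
#   t[0, 2] = t[1, 0] = t[2, 1] = True
# and leaves everything else False, i.e. each row of t is the 1-of-K
# (one-hot) coding of the corresponding label.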
def errorrate( lg, X, t, label ):

    Y, Z = lg.output( X )
    mnLL = np.mean( lg.negLL( Z, t ) )                   # mean negative log-likelihood
    er = np.mean( label != np.argmax( Z, axis = 1 ) )    # classification error rate
    return mnLL, er


if __name__ == "__main__":

    import logreg_theano0130 as logreg

    np.random.seed( 0 )

    ##### setting the training data & the validation data
    #
    X, label, t = gendat( 'L' )
    XL, labelL, tL = X[:50000], label[:50000], t[:50000]
    XV, labelV, tV = X[50000:], label[50000:], t[50000:]
    NL, D = XL.shape
    NV, D = XV.shape
    K = t.shape[1]

    ##### mini batch indices for stochastic gradient descent
    #
    idx = np.random.permutation( NL )
    batchsize = 1000
    assert NL % batchsize == 0
    nbatch = NL // batchsize
    idxB = np.zeros( ( nbatch, NL ), dtype = bool )
    for ib in range( nbatch ):
        idxB[ib, idx.reshape( ( nbatch, batchsize ) )[ib, :]] = True
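    # Each row of idxB is a boolean mask over all NL samples selecting one
    # shuffled mini batch.  Toy illustration with NL = 6, batchsize = 3 and
    # a hypothetical permutation idx = [5, 2, 1, 3, 0, 4]:
    #   idx.reshape( ( 2, 3 ) ) -> [[5, 2, 1], [3, 0, 4]]
    #   idxB[0] selects samples {1, 2, 5}, idxB[1] selects {0, 3, 4}.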
    print '### training: NL =', NL, ' NV =', NV, ' D =', D, ' K =', K, ' batchsize =', batchsize

    ##### training
    #
    eta = 0.5      # learning rate
    mu = 0.8       # momentum coefficient
    nepoch = 100

    lg = logreg.LogisticRegression( D, K )
    for i in range( nepoch ):
        # printing error rates etc.
        if i % 10 == 0:
            mnLLL, erL = errorrate( lg, XL, tL, labelL )
            mnLLV, erV = errorrate( lg, XV, tV, labelV )
            print i, mnLLL, erL * 100, erV * 100
        # training (selecting each batch in random order)
        for ib in np.random.permutation( nbatch ):
            ii = idxB[ib, :]
            lg.train( XL[ii], tL[ii], eta, mu )

    i = nepoch
    mnLLL, erL = errorrate( lg, XL, tL, labelL )
    mnLLV, erV = errorrate( lg, XV, tV, labelV )
    print i, mnLLL, erL * 100, erV * 100

    ##### setting the test data
    #
    XT, labelT, tT = gendat( 'T' )
    NT, D = XT.shape
    print '# NT =', NT
    mnLLT, erT = errorrate( lg, XT, tT, labelT )
    print i, mnLLT, erL * 100, erV * 100, erT * 100
########## logreg_theano0130.py ##########
import numpy as np

import theano
import theano.tensor as T

import nnet0130 as nnet


class LogisticRegression():

    def __init__( self, D, K ):

        # shared variables (weights, biases, and their previous updates for momentum)
        self.W = theano.shared( nnet.random( ( K, D ), 0.1 ) )
        self.b = theano.shared( nnet.random( K, 0.1 ) )
        self.dW = theano.shared( np.zeros( ( K, D ) ) )
        self.db = theano.shared( np.zeros( K ) )

        # theano functions
        self.output = self._Tfunc_output()
        self.negLL = self._Tfunc_negLL()
        self.train = self._Tfunc_train()

    ### output
    #
    def _Tfunc_output( self ):
        X = T.dmatrix( 'X' )    # N x D
        Y, Z = nnet.T_softmax( self.W, self.b, X )
        return theano.function( [ X ], [ Y, Z ] )

    ### negative log-likelihood
    #
    def _Tfunc_negLL( self ):
        Z = T.dmatrix()    # N x K
        t = T.dmatrix()    # N x K
        LL = nnet.T_negLL( Z, t )
        return theano.function( [ Z, t ], LL )

    ### train
    #
    def _Tfunc_train( self ):
        W, b, dW, db = self.W, self.b, self.dW, self.db
        X = T.dmatrix( 'X' )      # N x D
        t = T.dmatrix( 't' )      # N x K
        eta = T.dscalar( 'eta' )  # learning rate
        mu = T.dscalar( 'mu' )    # momentum coefficient
        Y, Z = nnet.T_softmax( W, b, X )
        cost = T.mean( nnet.T_negLL( Z, t ) )
        gradW, gradb = T.grad( cost, [ W, b ] )
        # gradient descent with momentum:
        #   dW <- -eta * grad(cost, W) + mu * dW,  W <- W + dW  (likewise for b)
        dW_new = -eta * gradW + mu * dW
        db_new = -eta * gradb + mu * db
        W_new = W + dW_new
        b_new = b + db_new
        updatesList = [ ( W, W_new ), ( b, b_new ), ( dW, dW_new ), ( db, db_new ) ]
        return theano.function( [ X, t, eta, mu ], cost, updates = updatesList )
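A minimal usage sketch for this class (not part of the gist; assumes Theano and nnet0130 are importable, with MNIST-like sizes D = 784, K = 10 and dummy data):

    import numpy as np
    import logreg_theano0130 as logreg

    lg = logreg.LogisticRegression( 784, 10 )
    X = np.random.random( ( 100, 784 ) )                  # dummy batch of 100 inputs
    t = np.eye( 10 )[np.random.randint( 0, 10, 100 )]     # dummy one-hot targets
    print lg.train( X, t, 0.5, 0.8 )                      # one momentum-SGD step; prints the batch cost
    Y, Z = lg.output( X )                                 # Z: 100 x 10 softmax outputs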
########## mlp_2layer0130.py ##########
import numpy as np

import theano
import theano.tensor as T

import nnet0130 as nnet


class MLP():

    def __init__( self, D, H, K ):

        # shared variables for the 1st layer (sigmoid or relu)
        self.W1 = theano.shared( nnet.random( ( H, D ), 0.1 ) )
        self.b1 = theano.shared( nnet.random( H, 0.1 ) )
        self.dW1 = theano.shared( np.zeros( ( H, D ) ) )
        self.db1 = theano.shared( np.zeros( H ) )

        # shared variables for the 2nd layer (softmax)
        self.W2 = theano.shared( nnet.random( ( K, H ), 0.1 ) )
        self.b2 = theano.shared( nnet.random( K, 0.1 ) )
        self.dW2 = theano.shared( np.zeros( ( K, H ) ) )
        self.db2 = theano.shared( np.zeros( K ) )

        # theano functions
        self.output = self._Tfunc_output()
        self.negLL = self._Tfunc_negLL()
        self.train = self._Tfunc_train()

    ### output
    #
    def _Tfunc_output( self ):
        X = T.dmatrix()    # N x D
        Y, Z = _T_output( self.W1, self.b1, self.W2, self.b2, X )
        return theano.function( [ X ], [ Y, Z ] )

    ### negative log-likelihood
    #
    def _Tfunc_negLL( self ):
        Z = T.dmatrix()    # N x K
        t = T.dmatrix()    # N x K
        LL = nnet.T_negLL( Z, t )
        return theano.function( [ Z, t ], LL )

    ### train
    #
    def _Tfunc_train( self ):
        W1, dW1, b1, db1 = self.W1, self.dW1, self.b1, self.db1
        W2, dW2, b2, db2 = self.W2, self.dW2, self.b2, self.db2
        X = T.dmatrix( 'X' )      # N x D
        t = T.dmatrix( 't' )      # N x K
        eta = T.dscalar( 'eta' )  # learning rate
        mu = T.dscalar( 'mu' )    # momentum coefficient
        Y2, Z2 = _T_output( W1, b1, W2, b2, X )
        cost = T.mean( nnet.T_negLL( Z2, t ) )
        gradW1, gradb1, gradW2, gradb2 = T.grad( cost, [ W1, b1, W2, b2 ] )
        # the same momentum update as in logreg_theano0130, applied per layer
        dW1_new = -eta * gradW1 + mu * dW1
        db1_new = -eta * gradb1 + mu * db1
        dW2_new = -eta * gradW2 + mu * dW2
        db2_new = -eta * gradb2 + mu * db2
        W1_new = W1 + dW1_new
        b1_new = b1 + db1_new
        W2_new = W2 + dW2_new
        b2_new = b2 + db2_new
        updatesList = [
            ( W1, W1_new ), ( b1, b1_new ), ( dW1, dW1_new ), ( db1, db1_new ),
            ( W2, W2_new ), ( b2, b2_new ), ( dW2, dW2_new ), ( db2, db2_new ) ]
        return theano.function( [ X, t, eta, mu ], cost, updates = updatesList )


def _T_output( W1, b1, W2, b2, X ):
    #Y1, Z1 = nnet.T_sigmoid( W1, b1, X )    # uncomment for a sigmoid hidden layer
    Y1, Z1 = nnet.T_relu( W1, b1, X )
    Y2, Z2 = nnet.T_softmax( W2, b2, Z1 )
    return Y2, Z2
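The forward pass in _T_output composes a ReLU hidden layer with a softmax output; the shapes flow as follows (N is the batch size):

    # X  : N x D  --( W1: H x D, b1: H )-->  Z1 : N x H   (ReLU activations)
    # Z1 : N x H  --( W2: K x H, b2: K )-->  Z2 : N x K   (softmax; each row sums to 1)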
########## mlp_3layer0130.py ##########
import numpy as np

import theano
import theano.tensor as T

import nnet0130 as nnet


class MLP():

    def __init__( self, D, H1, H2, K ):

        # shared variables for the 1st layer (sigmoid or relu)
        self.W1 = theano.shared( nnet.random( ( H1, D ), 0.1 ) )
        self.b1 = theano.shared( nnet.random( H1, 0.1 ) )
        self.dW1 = theano.shared( np.zeros( ( H1, D ) ) )
        self.db1 = theano.shared( np.zeros( H1 ) )

        # shared variables for the 2nd layer (sigmoid or relu)
        self.W2 = theano.shared( nnet.random( ( H2, H1 ), 0.1 ) )
        self.b2 = theano.shared( nnet.random( H2, 0.1 ) )
        self.dW2 = theano.shared( np.zeros( ( H2, H1 ) ) )
        self.db2 = theano.shared( np.zeros( H2 ) )

        # shared variables for the 3rd layer (softmax)
        self.W3 = theano.shared( nnet.random( ( K, H2 ), 0.1 ) )
        self.b3 = theano.shared( nnet.random( K, 0.1 ) )
        self.dW3 = theano.shared( np.zeros( ( K, H2 ) ) )
        self.db3 = theano.shared( np.zeros( K ) )

        # theano functions
        self.output = self._Tfunc_output()
        self.negLL = self._Tfunc_negLL()
        self.train = self._Tfunc_train()

    ### output
    #
    def _Tfunc_output( self ):
        X = T.dmatrix()    # N x D
        Y, Z = _T_output( self.W1, self.b1, self.W2, self.b2, self.W3, self.b3, X )
        return theano.function( [ X ], [ Y, Z ] )

    ### negative log-likelihood
    #
    def _Tfunc_negLL( self ):
        Z = T.dmatrix()    # N x K
        t = T.dmatrix()    # N x K
        LL = nnet.T_negLL( Z, t )
        return theano.function( [ Z, t ], LL )

    ### train
    #
    def _Tfunc_train( self ):
        W1, dW1, b1, db1 = self.W1, self.dW1, self.b1, self.db1
        W2, dW2, b2, db2 = self.W2, self.dW2, self.b2, self.db2
        W3, dW3, b3, db3 = self.W3, self.dW3, self.b3, self.db3
        X = T.dmatrix( 'X' )      # N x D
        t = T.dmatrix( 't' )      # N x K
        eta = T.dscalar( 'eta' )  # learning rate
        mu = T.dscalar( 'mu' )    # momentum coefficient
        Y3, Z3 = _T_output( W1, b1, W2, b2, W3, b3, X )
        cost = T.mean( nnet.T_negLL( Z3, t ) )
        grad = T.grad( cost, [ W1, b1, W2, b2, W3, b3 ] )
        gradW1, gradb1, gradW2, gradb2, gradW3, gradb3 = grad
        # the same momentum update as in logreg_theano0130, applied per layer
        dW1_new = -eta * gradW1 + mu * dW1
        db1_new = -eta * gradb1 + mu * db1
        dW2_new = -eta * gradW2 + mu * dW2
        db2_new = -eta * gradb2 + mu * db2
        dW3_new = -eta * gradW3 + mu * dW3
        db3_new = -eta * gradb3 + mu * db3
        W1_new = W1 + dW1_new
        b1_new = b1 + db1_new
        W2_new = W2 + dW2_new
        b2_new = b2 + db2_new
        W3_new = W3 + dW3_new
        b3_new = b3 + db3_new
        updatesList = [
            ( W1, W1_new ), ( b1, b1_new ), ( dW1, dW1_new ), ( db1, db1_new ),
            ( W2, W2_new ), ( b2, b2_new ), ( dW2, dW2_new ), ( db2, db2_new ),
            ( W3, W3_new ), ( b3, b3_new ), ( dW3, dW3_new ), ( db3, db3_new ),
        ]
        return theano.function( [ X, t, eta, mu ], cost, updates = updatesList )


def _T_output( W1, b1, W2, b2, W3, b3, X ):
    #Y1, Z1 = nnet.T_sigmoid( W1, b1, X )    # uncomment for a sigmoid 1st hidden layer
    Y1, Z1 = nnet.T_relu( W1, b1, X )
    Y2, Z2 = nnet.T_relu( W2, b2, Z1 )
    Y3, Z3 = nnet.T_softmax( W3, b3, Z2 )
    return Y3, Z3
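A construction-and-training sketch with the sizes the driver below uses (dummy data, not part of the gist; assumes Theano and nnet0130 are importable):

    import numpy as np
    import mlp_3layer0130 as mlp_3layer

    mlp = mlp_3layer.MLP( 784, 500, 1000, 10 )            # D, H1, H2, K
    X = np.random.random( ( 1000, 784 ) )                 # one dummy mini batch
    t = np.eye( 10 )[np.random.randint( 0, 10, 1000 )]    # dummy one-hot targets
    print mlp.train( X, t, 0.5, 0.8 )                     # one update; prints the batch cost
    Y3, Z3 = mlp.output( X )
    pred = np.argmax( Z3, axis = 1 )                      # predicted labels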
########## driver script: MLP on MNIST ##########
import numpy as np
import scipy as sp

import mnist0117 as mnist


def gendat( LT ):

    mn = mnist.MNIST( LT )
    label = mn.getLabel()
    N = label.shape[0]
    K = 10
    X = mn.getImage().reshape( ( N, -1 ) ) / 255.0   # scale pixel values to [0, 1] (255.0 forces float division)
    t = np.zeros( ( N, K ), dtype = bool )           # 1-of-K coded targets
    for ik in range( K ):
        t[label == ik, ik] = True

    return X, label, t


def errorrate( mlp, X, t, label ):

    Y, Z = mlp.output( X )
    mnLL = np.mean( mlp.negLL( Z, t ) )                  # mean negative log-likelihood
    er = np.mean( label != np.argmax( Z, axis = 1 ) )    # classification error rate
    return mnLL, er


if __name__ == "__main__":

    import mlp_2layer0130 as mlp_2layer
    import mlp_3layer0130 as mlp_3layer

    np.random.seed( 0 )

    ##### setting the training data & the validation data
    #
    X, label, t = gendat( 'L' )
    XL, labelL, tL = X[:50000], label[:50000], t[:50000]
    XV, labelV, tV = X[50000:], label[50000:], t[50000:]
    NL, D = XL.shape
    NV, D = XV.shape
    K = t.shape[1]

    ##### mini batch indices for stochastic gradient descent
    #
    idx = np.random.permutation( NL )
    batchsize = 1000
    assert NL % batchsize == 0
    nbatch = NL // batchsize
    idxB = np.zeros( ( nbatch, NL ), dtype = bool )
    for ib in range( nbatch ):
        idxB[ib, idx.reshape( ( nbatch, batchsize ) )[ib, :]] = True

    ##### training
    #
    # setting H2 <= 0 selects the 2-layer MLP, otherwise the 3-layer one
    #H1, H2 = 500, 0
    H1, H2 = 500, 1000
    eta = 0.5      # learning rate
    mu = 0.8       # momentum coefficient
    nepoch = 20

    if H2 <= 0:
        mlp = mlp_2layer.MLP( D, H1, K )
        print '### 2-layer MLP: D =', D, ' H =', H1, ' K =', K
    else:
        mlp = mlp_3layer.MLP( D, H1, H2, K )
        print '### 3-layer MLP: D =', D, ' H1 =', H1, ' H2 =', H2, ' K =', K
    print '### training: NL =', NL, ' NV =', NV, ' D =', D, ' K =', K, ' batchsize =', batchsize

    for i in range( nepoch ):
        # printing error rates etc.
        if i % 10 == 0:
            mnLLL, erL = errorrate( mlp, XL, tL, labelL )
            mnLLV, erV = errorrate( mlp, XV, tV, labelV )
            print i, mnLLL, erL * 100, erV * 100
        # training (selecting each batch in random order)
        for ib in np.random.permutation( nbatch ):
            ii = idxB[ib, :]
            mlp.train( XL[ii], tL[ii], eta, mu )

    i = nepoch
    mnLLL, erL = errorrate( mlp, XL, tL, labelL )
    mnLLV, erV = errorrate( mlp, XV, tV, labelV )
    print i, mnLLL, erL * 100, erV * 100

    ##### setting the test data
    #
    XT, labelT, tT = gendat( 'T' )
    NT, D = XT.shape
    print '# NT =', NT
    mnLLT, erT = errorrate( mlp, XT, tT, labelT )
    print i, mnLLT, erL * 100, erV * 100, erT * 100
########## nnet0130.py ##########
import numpy as np

import theano
import theano.tensor as T


### random numbers for weight initialization
#
def random( shape, r ):
    # uniform on [ -r/2, r/2 )
    return r * ( np.random.random_sample( shape ) - 0.5 )


### softmax layer
#
def T_softmax( W, b, X ):
    Y = T.dot( X, W.T ) + b    # Ndat x Dout
    Z = T.nnet.softmax( Y )
    return Y, Z


### sigmoid layer
#
def T_sigmoid( W, b, X ):
    Y = T.dot( X, W.T ) + b    # Ndat x Dout
    Z = T.nnet.sigmoid( Y )
    return Y, Z


### rectified linear layer
#
def T_relu( W, b, X ):
    Y = T.dot( X, W.T ) + b    # Ndat x Dout
    Z = T.switch( Y > 0, Y, 0 )    # elementwise max( Y, 0 )
    return Y, Z


### negative log-likelihood
#
def T_negLL( Z, t ):
    # per-sample cross-entropy between softmax output Z and 1-of-K targets t
    return T.nnet.categorical_crossentropy( Z, t )
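A quick self-check of these helpers (a sketch, not part of the gist; requires Theano):

    import numpy as np
    import theano
    import theano.tensor as T
    import nnet0130 as nnet

    W = theano.shared( nnet.random( ( 10, 784 ), 0.1 ) )
    b = theano.shared( nnet.random( 10, 0.1 ) )
    X = T.dmatrix( 'X' )
    Y, Z = nnet.T_softmax( W, b, X )
    f = theano.function( [ X ], Z )
    print f( np.random.random( ( 5, 784 ) ) ).sum( axis = 1 )   # each row sums to 1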