Adult dataset (from the UCI Machine Learning Repository) classification: 1. Logistic Regression, 2. MLP (Multi-Layer Perceptron) model.
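Both scripts read the raw data files from the working directory. A minimal fetch helper, not part of the original gist (the URLs assume the UCI repository's usual layout; adjust if the files have moved):

# fetch_adult.py -- hypothetical helper sketch, not part of the original gist
try:
    from urllib.request import urlretrieve   # Python 3
except ImportError:
    from urllib import urlretrieve           # Python 2

BASE = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/'
for fname in ('adult.data', 'adult.test'):
    urlretrieve(BASE + fname, fname)   # save next to the scripts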
#
# adult_LR_classifier.py   date. 10/17/2015
# SGD (Stochastic Gradient Descent) version
# considers 3 features of the dataset
#
import numpy as np
import pandas as pd
import timeit
import theano
import theano.tensor as T
def load_data():
    def to_float(i):
        return float(i)

    def is_rich(labelstr):
        # encode the label: 1.0 for '>50K', 0.0 for '<=50K', -1.0 as error code
        if '>50K' in labelstr:
            res = 1.0
        elif '<=50K' in labelstr:
            res = 0.0
        else:
            res = -1.0
        return res

    def to_fami_size(fami_str):
        # map the 'relationship' feature to a rough family-size score
        fami_str = fami_str.strip()
        if fami_str in ('Wife', 'Husband'):
            res = 2.0
        elif fami_str == 'Own-child':
            res = 4.0
        elif fami_str == 'Other-relative':
            res = 2.0
        elif fami_str in ('Not-in-family', 'Unmarried'):
            res = 1.0
        else:
            res = 0.0
        return res

    colnames = ['age', 'wc', 'dmy1', 'educ', 'edu_num', 'marital', 'occup',
                'relat', 'race', 'sex', 'cap_g', 'cap_l', 'hrs', 'native',
                'incom']
    mydf = pd.read_csv('adult.data', header=None, names=colnames)
    mydf.dropna(inplace=True)
    xmat = np.column_stack((mydf['edu_num'].apply(to_float).values,
                            mydf['relat'].apply(to_fami_size).values,
                            mydf['hrs'].apply(to_float).values))
    ymat = mydf['incom'].apply(is_rich).values

    return xmat, ymat   # shapes: xmat [m, 3], ymat [m,]
def setup_data(xmat, ymat):
    # store the data into 'shared' variables to be accessible by Theano
    def shared_dataset(xm, ym, borrow=True):
        shared_x = theano.shared(np.asarray(xm, dtype=theano.config.floatX),
                                 borrow=borrow)
        shared_y = theano.shared(np.asarray(ym, dtype=theano.config.floatX),
                                 borrow=borrow)
        return shared_x, shared_y

    def data_shuffle(xm, ym, siz):
        # return copies of (xm, ym) shuffled by a common random permutation
        idv = np.arange(siz)
        np.random.shuffle(idv)
        x_new = xm[idv]
        y_new = ym[idv]
        return x_new, y_new

    total_len = ymat.shape[0]
    n_features = int(np.size(xmat) / total_len)
    # Random Shuffle
    xmat, ymat = data_shuffle(xmat, ymat, total_len)
    # 70/30 train/test split
    train_len = int(total_len * 0.7)
    test_len = total_len - train_len
    xtr, ytr = shared_dataset(
        (xmat[:train_len]).reshape(train_len, n_features),
        ymat[:train_len])
    xte, yte = shared_dataset(
        (xmat[train_len:]).reshape(test_len, n_features),
        ymat[train_len:])
    rval = [(xtr, ytr), (xte, yte)]
    return rval
if __name__ == "__main__":
    np.random.seed(20151017)
    xmat, ymat = load_data()
    datasets = setup_data(xmat, ymat)
    xtr, ytr = datasets[0]   # xtr is a [m, 3] matrix, ytr a length-m vector
    xte, yte = datasets[1]   # likewise for the test split

    # Declare Theano symbolic variables
    xtr_nrow, xtr_ncol = (xtr.get_value()).shape
    index = T.lscalar()       # index to a [mini]batch
    learning_rate = T.scalar()
    x = T.matrix('x')
    y = T.vector('y')
    w = theano.shared(np.zeros(xtr_ncol), name='w')   # w, b <- all zero
    b = theano.shared(0., name='b')

    print(' Initial model: ')
    wi = w.get_value()
    bi = b.get_value()
    np.set_printoptions(precision=4)
    print('w : ', wi, 'b : ', bi)

    # Construct the Theano expression graph
    myp = T.nnet.sigmoid(T.dot(x, w) + b)
    prediction = myp > 0.5                      # from the Theano tutorial
    xent = T.nnet.binary_crossentropy(myp, y)
    cost = xent.mean() + 0.01 * (w ** 2).sum()  # cross-entropy + L2 regularization
    gw, gb = T.grad(cost, [w, b])
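    # For reference, the objective minimized above is
    #   J(w, b) = (1/m) * sum_i [ -y_i*log(p_i) - (1 - y_i)*log(1 - p_i) ]
    #             + 0.01 * ||w||^2,   with p_i = sigmoid(w . x_i + b);
    # T.grad derives dJ/dw and dJ/db from this graph symbolically.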
    #############################################
    batch_size = 50
    #############################################

    # Compile
    train_model = theano.function(
        inputs=[index, learning_rate],
        outputs=[cost, prediction],
        updates=((w, w - learning_rate * gw), (b, b - learning_rate * gb)),
        givens=[(x, xtr[index * batch_size:(index + 1) * batch_size]),
                (y, ytr[index * batch_size:(index + 1) * batch_size])],
        allow_input_downcast=True
    )
    predict = theano.function(
        inputs=[],
        outputs=prediction,
        givens=[(x, xte)],
        allow_input_downcast=True
    )
    # Train (Optimization)
    start_time = timeit.default_timer()
    n_epochs = 50
    epoch = 0
    lrate_base = 0.01
    lrate_coef = 20
    n_train_batches = int(ytr.get_value(borrow=True).shape[0] / batch_size)

    while epoch < n_epochs:
        epoch += 1
        for mini_batch_index in range(n_train_batches):
            # decay the learning rate as training progresses
            l_rate = lrate_base * lrate_coef / (epoch + lrate_coef)
            cost_j, pred = train_model(mini_batch_index, l_rate)
        print('epoch[%3d] : cost =%f' % (epoch, cost_j))

    # Print result
    print('\n Final model: ')
    wf = w.get_value()
    bf = b.get_value()
    np.set_printoptions(precision=4)
    print('w : ', wf, 'b : ', bf)
    print('Elapsed time: %10.3f [s]' % (timeit.default_timer() - start_time))

    # Test-set accuracy: fraction of predictions that match the labels
    mypred = (predict()).flatten()
    iv_yte = (yte.get_value()).astype(int)
    accu = float((mypred == iv_yte).sum()) / iv_yte.shape[0]
    print('accuracy = %12.4f' % accu)
#
# adult_data_classifier.py   date. 2/4/2016
#
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import pandas as pd
import timeit
import theano
import theano.tensor as T
def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)
def load_data(filename='adult.data'):
    '''
    Load the "Adult" data set. Each record has 14 features and 1 label;
    the parenthesized features below are not used by this script.
    Features:
      1.   age: continuous.
      (2.) workclass: Private, Self-emp-not-inc, Self-emp-inc, ...
      3.   fnlwgt: continuous.
      4.   education: Bachelors, Some-college, 11th, HS-grad, Prof-school,
           Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th,
           Doctorate, 5th-6th, Preschool.
      5.   education-num: continuous.
      6.   marital-status: Married-civ-spouse, Divorced, Never-married,
           Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
      (7.) occupation: Tech-support, Craft-repair, Other-service, ...
      8.   relationship: Wife, Own-child, Husband, Not-in-family,
           Other-relative, Unmarried.
      9.   race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
      10.  sex: Female, Male.
      11.  capital-gain: continuous.
      12.  capital-loss: continuous.
      13.  hours-per-week: continuous.
      (14.) native-country: United-States, Cambodia, England, ...
    Label:
      >50K, <=50K.
    '''
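    # For reference, a typical raw record (the first row of adult.data)
    # looks like this -- quoted from the UCI file, verify against your copy:
    #   39, State-gov, 77516, Bachelors, 13, Never-married, Adm-clerical,
    #   Not-in-family, White, Male, 2174, 0, 40, United-States, <=50K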
    def to_float(i):
        # to process continuous data, including integers
        return float(i)

    def is_rich(labelstr):
        # to process "Label"; substring tests are used because the labels
        # in adult.test carry a trailing period ('>50K.', '<=50K.')
        if '>50K' in labelstr:
            res = 1.0
        elif '<=50K' in labelstr:
            res = 0.0
        else:
            res = -1.   # set error code
        return res

    def edu_type(edu_str):
        # to process the 'education' feature
        edu_type_names = ['Bachelors', 'Some-college', '11th', 'HS-grad',
            'Prof-school', 'Assoc-acdm', 'Assoc-voc', '9th', '7th-8th',
            '12th', 'Masters', '1st-4th', '10th', 'Doctorate', '5th-6th',
            'Preschool']
        edu_str = edu_str.strip()
        try:
            res = edu_type_names.index(edu_str)
        except ValueError:
            res = -1
        return float(res)

    def marital_status(mar_str):
        # to process the 'marital-status' feature
        mar_type_names = ['Married-civ-spouse', 'Divorced', 'Never-married',
                          'Separated', 'Widowed', 'Married-spouse-absent',
                          'Married-AF-spouse']
        mar_str = mar_str.strip()
        try:
            res = mar_type_names.index(mar_str)
        except ValueError:
            res = -1
        return float(res)

    def to_fami_size(fami_str):
        # to process the 'relationship' feature
        fami_str = fami_str.strip()
        if fami_str in ('Wife', 'Husband'):
            res = 2.0
        elif fami_str == 'Own-child':
            res = 4.0
        elif fami_str == 'Other-relative':
            res = 2.0
        elif fami_str in ('Not-in-family', 'Unmarried'):
            res = 1.0
        else:
            res = -1.
        return res

    def race_type(race_str):
        # to process the 'race' feature
        race_names = ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo',
                      'Other', 'Black']
        race_str = race_str.strip()
        try:
            res = race_names.index(race_str)
        except ValueError:
            res = -1
        return float(res)

    def sex_type(sex_str):
        # to process the 'sex' feature
        sex_names = ['Female', 'Male']
        sex_str = sex_str.strip()
        try:
            res = sex_names.index(sex_str)
        except ValueError:
            res = -1
        return float(res)
    colnames = ['age', 'wc', 'flnwgt', 'educ', 'edu_num', 'marital', 'occup',
                'relat', 'race', 'sex', 'cap_g', 'cap_l', 'hrs', 'native',
                'incom']
    mydf = pd.read_csv(filename, header=None, names=colnames)
    mydf.dropna(inplace=True)
    mydf['adclass'] = mydf['incom'].apply(is_rich)
    ymat = mydf['adclass'].values

    # map each of the 14 raw columns onto one of 11 feature columns;
    # -1 marks the columns that are skipped (wc, occup, native)
    xmat = np.zeros((len(ymat), 11))
    reindex_key = [0, -1, 1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, -1]
    continuous_feat_list = ['age', 'flnwgt', 'edu_num', 'cap_g',
                            'cap_l', 'hrs']
    for i in range(len(colnames) - 1):
        ikey = reindex_key[i]
        if colnames[i] == 'wc':
            pass
        elif colnames[i] == 'educ':
            xmat[:, ikey] = mydf['educ'].apply(edu_type)
        elif colnames[i] == 'marital':
            xmat[:, ikey] = mydf['marital'].apply(marital_status)
        elif colnames[i] == 'occup':
            pass
        elif colnames[i] == 'relat':
            xmat[:, ikey] = mydf['relat'].apply(to_fami_size)
        elif colnames[i] == 'race':
            xmat[:, ikey] = mydf['race'].apply(race_type)
        elif colnames[i] == 'sex':
            xmat[:, ikey] = mydf['sex'].apply(sex_type)
        elif colnames[i] == 'native':
            pass
        elif colnames[i] in continuous_feat_list:
            xmat[:, ikey] = mydf[colnames[i]].values

    return xmat, ymat
def setup_data(xmat, ymat):
    # store the data into 'shared' variables to be accessible by Theano
    def shared_dataset(xm, ym, borrow=True):
        shared_x = theano.shared(np.asarray(xm, dtype=theano.config.floatX),
                                 borrow=borrow)
        shared_y = theano.shared(np.asarray(ym, dtype=theano.config.floatX),
                                 borrow=borrow)
        return shared_x, shared_y

    def data_shuffle(xm, ym, siz):
        # return copies of (xm, ym) shuffled by a common random permutation
        idv = np.arange(siz)
        np.random.shuffle(idv)
        x_new = xm[idv]
        y_new = ym[idv]
        return x_new, y_new

    total_len = ymat.shape[0]
    n_features = int(np.size(xmat) / total_len)
    # Random Shuffle
    xmat, ymat = data_shuffle(xmat, ymat, total_len)
    xret, yret = shared_dataset(xmat.reshape((total_len, n_features)), ymat)

    return xret, yret
# Hidden Layer
class HiddenLayer(object):
    def __init__(self, input, n_in, n_out):
        self.input = input
        # small random initial weights, zero biases
        w_h = theano.shared(floatX(np.random.standard_normal([n_in, n_out]))
                            * 0.05)
        b_h = theano.shared(floatX(np.zeros(n_out)))
        self.w = w_h
        self.b = b_h
        self.params = [self.w, self.b]

    def output(self):
        linarg = T.dot(self.input, self.w) + self.b
        return T.nnet.sigmoid(linarg)

# Read-out Layer
class ReadOutLayerBin(object):
    def __init__(self, input, n_in, n_out):
        self.input = input
        w_o = theano.shared(floatX(np.random.standard_normal([n_in, n_out]))
                            * 0.05)
        b_o = theano.shared(floatX(np.zeros(n_out)))
        self.w = w_o
        self.b = b_o
        self.params = [self.w, self.b]

    def output(self):
        linarg = T.dot(self.input, self.w) + self.b
        return T.nnet.sigmoid(linarg)
# Optimizers - GradientDescent, AdaGrad
class Optimizer(object):
    def __init__(self, params, learning_rate=0.01):
        self.lr = learning_rate
        self.params = params

    def minimize(self, loss):
        self.gradparams = [T.grad(loss, param) for param in self.params]

    def update_learning_rate(self, learning_rate):
        self.lr = learning_rate

class GradientDescentOptimizer(Optimizer):
    def __init__(self, params, learning_rate=0.01):
        super(GradientDescentOptimizer, self).__init__(params, learning_rate)

    def minimize(self, loss):
        super(GradientDescentOptimizer, self).minimize(loss)
        updates = [
            (param_i, param_i - self.lr * grad_i)
            for param_i, grad_i in zip(self.params, self.gradparams)
        ]
        return updates

    def update_learning_rate(self, l_rate):
        super(GradientDescentOptimizer, self).update_learning_rate(l_rate)
        updates = [
            (param_i, param_i - self.lr * grad_i)
            for param_i, grad_i in zip(self.params, self.gradparams)
        ]
        return updates
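# The header comment above also names AdaGrad, but the original gist defines
# only GradientDescentOptimizer. A minimal AdaGrad sketch in the same style
# (hypothetical addition, not part of the original code): each parameter gets
# an accumulator of squared gradients that scales its update, with 'eps'
# guarding against division by zero.
class AdaGradOptimizer(Optimizer):
    def __init__(self, params, learning_rate=0.01, eps=1e-6):
        super(AdaGradOptimizer, self).__init__(params, learning_rate)
        self.eps = eps
        # one accumulator per parameter, same shape, initialized to zero
        self.accums = [theano.shared(floatX(np.zeros(p.get_value().shape)))
                       for p in self.params]

    def minimize(self, loss):
        super(AdaGradOptimizer, self).minimize(loss)
        updates = []
        for param_i, grad_i, acc_i in zip(self.params, self.gradparams,
                                          self.accums):
            acc_new = acc_i + grad_i ** 2
            updates.append((acc_i, acc_new))
            updates.append((param_i,
                            param_i
                            - self.lr * grad_i / T.sqrt(acc_new + self.eps)))
        return updates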
if __name__ == '__main__':
    np.random.seed(seed=20160204)
    trX, trY = load_data('adult.data')
    teX, teY = load_data('adult.test')
    trXs, trYs = setup_data(trX, trY)
    teXs, teYs = setup_data(teX, teY)

    # Declare Theano symbolic variables
    index = T.lscalar()   # index to a [mini]batch
    x = T.matrix('x')
    y_ = T.vector('y')

    # Define MLP network structure: 11 -> 22 -> 20 -> 1, sigmoid activations
    h_layer1 = HiddenLayer(input=x, n_in=11, n_out=22)
    h_layer2 = HiddenLayer(input=h_layer1.output(), n_in=22, n_out=20)
    o_layer = ReadOutLayerBin(input=h_layer2.output(), n_in=20, n_out=1)
    params = h_layer1.params + h_layer2.params + o_layer.params

    # Cost Function basic term
    hypo = (o_layer.output()).flatten()
    prediction = hypo > 0.5
    iy_ = T.cast(y_, dtype='int32')
    accur = T.mean(T.eq(prediction, iy_))
    xent = -y_ * T.log(hypo) - (1 - y_) * T.log(1 - hypo)
    # Regularization terms (weight decay)
    L2_sqr = ((h_layer1.w ** 2).sum()
              + (h_layer2.w ** 2).sum()
              + (o_layer.w ** 2).sum())
    cost = xent.mean() + 0.01 * L2_sqr

    # Train
    myoptimizer = GradientDescentOptimizer(params, learning_rate=0.01)
    one_update = myoptimizer.minimize(cost)

    #############################################
    batch_size = 50
    #############################################
    # Compile
    train_model = theano.function(
        inputs=[index],
        outputs=[cost, accur],
        updates=one_update,
        givens=[(x, trXs[index * batch_size:(index + 1) * batch_size]),
                (y_, trYs[index * batch_size:(index + 1) * batch_size])],
        allow_input_downcast=True
    )
    accuracy = theano.function(
        inputs=[],
        outputs=accur,
        givens=[(x, teXs), (y_, teYs)],
        allow_input_downcast=True
    )
    # Train (Optimization)
    start_time = timeit.default_timer()
    n_epochs = 50
    epoch = 0
    n_train_batches = int(trY.shape[0] / batch_size)

    while epoch < n_epochs:
        epoch += 1
        for mini_batch_index in range(n_train_batches):
            # use a separate name for the minibatch accuracy so the symbolic
            # 'accur' compiled into the functions above is not overwritten
            cost_j, accur_j = train_model(mini_batch_index)
        print('epoch[%3d] : cost =%8.4f' % (epoch, cost_j))

    elapsed_time = timeit.default_timer() - start_time
    print('Elapsed time: %10.3f [s]' % elapsed_time)
    last_accur = accuracy()
    print('Accuracy = %10.3f ' % last_accur)