My attempt at an ANN for predicting product category in the Otto Kaggle competition
"""
With a 93/120/9 feedforward network with a sigmoid hidden layer, fixing the
activations on the output layer (clipping them to the interval [0, 1] and
normalizing them to sum to one), and training for 5 epochs, I only get a
score of 1.28.
"""
import pandas as pd
import os
from pybrain.tools.shortcuts import buildNetwork
from pybrain.supervised.trainers import BackpropTrainer
#from pybrain.structure import TanhLayer
from pybrain.structure import SigmoidLayer
from pybrain.datasets import SupervisedDataSet
import numpy as np
#from sklearn.ensemble import RandomForestClassifier  # not used below
#from sklearn import cross_validation                 # not used below
os.chdir('/Users/russellrichie/otto')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv').drop('id',axis=1)
print train.head()   # eyeball the first and last few rows
print train.tail()
# appears data are *not* randomized, so shuffle them to help ANN training
# (or at least supposedly help; not sure I've seen a difference in my
# testing). Note that reindex returns a new DataFrame rather than shuffling
# in place, so the result has to be assigned back.
train = train.reindex(np.random.permutation(train.index))
trainX = train.drop(['id','target'], axis=1)
trainY = train['target']
# Check whether there are any missing values (there aren't)
if not trainX.isnull().values.any():
    print "there are no missing values!"
# see freqs of labels (most freq is not quite 10x more freq than least freq);
# plotting like this assumes an interactive/pylab session, so a plain script
# would also need matplotlib.pyplot.show()
trainY.value_counts().plot(kind='bar')
"""
Build and train a feedforward neural network!!!!
"""
# bias in input function; hidden layer activation function is sigmoid
net = buildNetwork(93, 120, 9, bias=True, hiddenclass=SigmoidLayer)
#net = buildNetwork(93, 120, 9, bias=True, hiddenclass=TanhLayer)
#net = buildNetwork(93, 200, 9, bias=False, hiddenclass=SigmoidLayer)
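# An alternative worth trying (a sketch, not benchmarked here): PyBrain also
# ships a SoftmaxLayer, and passing it as outclass should make the outputs
# non-negative and sum to one directly, which would make the manual
# clip-and-renormalize step at the bottom of this script unnecessary:
#from pybrain.structure import SoftmaxLayer
#net = buildNetwork(93, 120, 9, bias=True, hiddenclass=SigmoidLayer, outclass=SoftmaxLayer)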
trainYvectorized = pd.get_dummies(trainY, prefix= 'target')
trainDs = SupervisedDataSet(93, 9)
for index, row in trainX.iterrows():
    trainDs.addSample(row, trainYvectorized.ix[index])
trainer = BackpropTrainer(net, trainDs)
#trainer.trainUntilConvergence()
epochNumb = 5 # two epochs was better than one, but three was not better than two...
for epochInd in range(epochNumb):
    print "current epoch number is {}".format(epochInd)
    trainer.train()
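# A quick sanity check before submitting (a sketch, not part of the original
# run): Kaggle scores this competition with multiclass log loss, which
# sklearn's log_loss can compute locally on the training set. Training error
# will look optimistic relative to the leaderboard, but it catches gross
# problems. get_dummies and log_loss both order classes alphabetically, so
# the probability columns should line up.
from sklearn.metrics import log_loss
trainPreds = np.array([net.activate(row) for _, row in trainX.iterrows()])
trainPreds = trainPreds.clip(1e-15, 1)  # clip away from 0 to avoid log(0) and zero-sum rows
trainPreds = trainPreds / trainPreds.sum(axis=1)[:, np.newaxis]
print "training log loss is {}".format(log_loss(trainY, trainPreds))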
# now try to classify test set, and see what kind of error we get on kaggle
# there must be a way to just directly make a dataset instance from a
# df/array-like, but i haven't figured it out....
#testDs = ClassificationDataSet(93)
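# One possible shortcut (a sketch I haven't verified here): PyBrain datasets
# expose setField, which takes a whole array at once, so the per-row
# addSample loop used for the training set above could probably be replaced
# with something like:
#trainDs = SupervisedDataSet(93, 9)
#trainDs.setField('input', trainX.values)
#trainDs.setField('target', trainYvectorized.values)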
predictions = np.zeros(shape=(len(test),9))
for index, row in test.iterrows():
    #testDs.addSample(row)
    predictions[index,:] = net.activate(row)
# now, since the competition requires I output probabilities, we need to make
# them all between 0 and 1, and make each input's corresponding outputs sum to
# 1.
predictions = predictions.clip(0,1)
for index, prediction in enumerate(predictions):
    predictions[index] = prediction / sum(prediction)
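# To actually submit, the probabilities need to be paired with the test ids
# and written to CSV. A sketch, assuming the usual Otto submission format of
# an 'id' column plus one 'Class_n' column per category (check the exact
# header against Kaggle's sampleSubmission.csv; the output filename here is
# just a placeholder):
testIds = pd.read_csv('test.csv')['id']
submission = pd.DataFrame(predictions, columns=['Class_{}'.format(n) for n in range(1, 10)])
submission.insert(0, 'id', testIds)
submission.to_csv('otto_ann_submission.csv', index=False)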