@stormxuwz
Created September 20, 2016 04:03
neural network — a single-hidden-layer feed-forward network implemented from scratch with NumPy and trained on the MNIST digits (loaded via chainer) using per-sample stochastic gradient descent.

from __future__ import division
from __future__ import print_function

import numpy as np
import chainer  # only used to load the MNIST data in this script
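
# Architecture (as configured below): input vector -> one hidden layer
# (sigmoid or ReLU) -> softmax output over the k classes. Training uses
# per-sample stochastic gradient descent; each epoch visits every training
# sample exactly once in a random order.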


def softmax(z):
    # z is a vector; subtract the max before exponentiating for numerical stability
    z = z - np.max(z)
    return np.exp(z) / np.sum(np.exp(z))


def sigmoid(x):
    # x can be a vector
    return 1.0 / (1.0 + np.exp(-x))


def sigmoid_gradient(x):
    # x can be a vector
    return sigmoid(x) * (1 - sigmoid(x))


def ReLU(x):
    # x can be a vector
    return np.maximum(x, 0)


def ReLU_gradient(x):
    # x can be a vector
    return 1.0 * (x > 0)


class myNeuralNetwork(object):
    def __init__(self, unitsPerHiddenLayer=[3], outputFunc="softmax", hiddenLayerFunc="ReLU", learningRate=0.1, epochs=3):
        '''
        Currently, only a single hidden layer is implemented.
        '''
        self.hiddenLayerNum = len(unitsPerHiddenLayer)
        self.unitsPerHiddenLayer = unitsPerHiddenLayer

        if outputFunc == "softmax":
            self.outputFunc = softmax
        else:
            raise ValueError("not implemented")

        if hiddenLayerFunc == "sigmoid":
            self.hiddenLayerFunc = sigmoid
            self.hiddenLayerFunc_gradient = sigmoid_gradient
        elif hiddenLayerFunc == "ReLU":
            self.hiddenLayerFunc = ReLU
            self.hiddenLayerFunc_gradient = ReLU_gradient
        else:
            raise ValueError("not implemented")

        self.LR = learningRate
        self.epochs = epochs

        self.w = []  # w[0], b[0] map the input a[0] to z[0]; w[1], b[1] map a[1] to z[1]
        self.b = []
        self.a = []  # a[0] is the input data; a[1] is the output of the first hidden layer
        self.z = []  # z[0] is the pre-activation input of the first hidden layer

    def fit(self, X, Y):
        d = X.shape[1]   # data feature dimension
        n = X.shape[0]   # number of data samples
        k = len(set(Y))  # number of classes
        # print(d, n, k)

        # initialize the parameters
        # input layer
        self.a.append(np.zeros(d))

        # hidden layers
        for i in range(self.hiddenLayerNum):
            L = self.unitsPerHiddenLayer[i]
            self.w.append((np.random.rand(L, d) - 0.5) / 3.0)  # -0.5 shifts the initial weights into [-0.5, 0.5]; /3.0 shrinks them further
            self.b.append((np.random.rand(L) - 0.5) / 3.0)
            self.z.append(np.zeros(L))
            self.a.append(np.zeros(L))

        # output layer
        self.w.append((np.random.rand(k, L) - 0.5) / 3.0)
        self.b.append((np.random.rand(k) - 0.5) / 3.0)
        self.z.append(np.zeros(k))

        # start fitting using SGD
        for e in range(self.epochs):
            # print("epochs:", e)
            sampleIndex = np.random.choice(n, n, replace=False)  # visit every sample once per epoch, in random order
            for j in sampleIndex:
                f = self.feedForward(X[j])
                delta = self.backPropagation(f, X[j], Y[j])
                # update the parameters (delta is the gradient of the log-likelihood, so it is added)
                for i in range(self.hiddenLayerNum + 1):  # +1 to also update the output layer
                    self.w[i] += self.LR * delta["w"][i]
                    self.b[i] += self.LR * delta["b"][i]

    def feedForward(self, x):
        # feed-forward pass for a single sample x
        self.a[0] = x.T
        for i in range(self.hiddenLayerNum):
            self.z[i] = np.dot(self.w[i], self.a[i]) + self.b[i]  # w[0] shape: (L, d); a[0] shape: (d,) for the first hidden layer
            self.a[i + 1] = self.hiddenLayerFunc(self.z[i])
        self.z[-1] = np.dot(self.w[-1], self.a[-1]) + self.b[-1]  # hidden layer output to the output layer
        output = self.outputFunc(self.z[-1])  # probability of each class
        return output

    def backPropagation(self, f, X, y):
        '''
        Back-propagation step used by stochastic gradient descent.
        Currently only supports a single hidden layer.
        # f is the softmax output for sample X, shape (k,)
        # X is the sample (feature vector)
        # y is the integer label, in 0, 1, ..., k-1
        '''
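        # With softmax output f and cross-entropy loss J = -log f[y]:
        #   dJ/dz2 = f - onehot(y)             (z2 = pre-activation of the output layer)
        #   dJ/dW2 = (dJ/dz2) a1^T
        #   dJ/dz1 = W2^T (dJ/dz2) * g'(z1)    (g is the hidden activation)
        #   dJ/dW1 = (dJ/dz1) x^T
        # delta3 below is onehot(y) - f, i.e. the negative of dJ/dz2, which is why
        # fit() *adds* LR * delta to the weights (gradient ascent on the log-likelihood).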
        delta3 = -f  # f is the predicted probability vector, shape (k,)
        delta3[y] = delta3[y] + 1
        deltaW2 = np.dot(delta3.reshape(-1, 1), self.a[1].reshape(1, -1))  # shape (k, 1) dot shape (1, L)
        deltab2 = delta3
        # print("delta3.shape", delta3.shape)
        # print("deltaW2", deltaW2.shape)
        # print(self.z[0])
        # print(self.w[1])
        # print(np.dot(self.w[1], delta3))
        delta2 = np.dot(self.w[1].T, delta3) * self.hiddenLayerFunc_gradient(self.z[0])  # shape (L,)
        # print("delta2.shape", delta2.shape)
        deltaW1 = np.dot(delta2.reshape(-1, 1), X.reshape(1, -1))
        deltab1 = delta2
        return {"w": [deltaW1, deltaW2], "b": [deltab1, deltab2]}

    def predict(self, X):
        return np.array([np.argmax(self.feedForward(sample)) for sample in X])

    def performanceEval(self, testX, testY):
        return sum(self.predict(testX) == testY) / len(testY)
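

# Minimal usage sketch (assuming trainX/testX have shape (n_samples, n_features)
# and trainY/testY hold integer labels 0..k-1); the test() helper below does the
# same thing with MNIST:
#   nn = myNeuralNetwork(unitsPerHiddenLayer=[50], hiddenLayerFunc="sigmoid",
#                        learningRate=0.01, epochs=5)
#   nn.fit(X=trainX, Y=trainY)
#   accuracy = nn.performanceEval(testX, testY)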

def test(uN, f, LR, epochs):
    print("start")
    nn = myNeuralNetwork(unitsPerHiddenLayer=[uN], hiddenLayerFunc=f, learningRate=LR, epochs=epochs)
    train, test = chainer.datasets.get_mnist()
    # train[i][0] is the image, train[i][1] is the label
    trainX = np.array([sample[0].flatten() for sample in train])
    trainY = np.array([sample[1] for sample in train])
    testX = np.array([sample[0].flatten() for sample in test])
    testY = np.array([sample[1] for sample in test])
    nn.fit(X=trainX, Y=trainY)
    print("hidden units: %d, hidden layer func: %s, learning rate: %f, epochs: %d, accuracy: %f"
          % (uN, f, LR, epochs, nn.performanceEval(testX, testY)))


def main():
    # test(50, "sigmoid", 0.07, 1)
    # test(50, "sigmoid", 0.07, 5)
    # test(100, "sigmoid", 0.07, 1)
    # test(100, "sigmoid", 0.07, 5)
    # test(200, "sigmoid", 0.07, 1)
    # test(200, "sigmoid", 0.07, 5)
    # test(50, "ReLU", 0.07, 1)
    # test(50, "ReLU", 0.07, 5)
    # test(100, "ReLU", 0.07, 1)
    # test(100, "ReLU", 0.07, 5)
    # test(200, "ReLU", 0.07, 1)
    # test(200, "ReLU", 0.07, 5)
    # test(50, "sigmoid", 0.05, 5)
    # test(50, "sigmoid", 0.1, 5)
    # test(50, "sigmoid", 0.5, 5)
    # test(50, "ReLU", 0.05, 5)
    # test(50, "ReLU", 0.1, 5)
    # test(50, "ReLU", 0.5, 5)
    # test(50, "ReLU", 0.01, 5)
    test(50, "sigmoid", 0.01, 5)
    test(10, "sigmoid", 0.01, 5)


if __name__ == '__main__':
    main()