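# Train a small feedforward neural network to tell real k-letter English words
# (read from words.txt) apart from strings of random letters.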
from NeuralNetwork import NeuralNetwork
from random import shuffle, choice

k = 6  # length of the word
test_data_training_size = 100
hidden_layers_sizes = [10, 10]
epochs = 15
eta = 0.25

letters = [chr(ordinal) for ordinal in xrange(ord("a"), ord("z") + 1)]
random_letter = lambda: choice(letters)
random_letters = lambda k: [random_letter() for n in xrange(k)]
ordinals = lambda word: [ord(letter) - ord('a') for letter in word]
deordinalize = lambda o: "".join([chr(ordinal + ord('a')) for ordinal in o])
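# round-trip example: ordinals("cab") == [2, 0, 1] and deordinalize([2, 0, 1]) == "cab"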
with open("words.txt", "r") as words_file: | |
# each word has a carriage return "\r\n" at the end, so a word that appears to be k-long is | |
# actually k+2 long, so the code will take care of that for us | |
words = [word[0:6] for word in words_file if len(word) == k+2] | |
shuffle(words) | |
net = NeuralNetwork([k] + hidden_layers_sizes + [1])

# train the neural network
# don't train with the last test_data_training_size words
n_real_words = len(words) - test_data_training_size
training_data = []
training_data += [(ordinals(word), 1) for word in words[0:-test_data_training_size]]  # real words
training_data += [(ordinals(random_letters(k)), 0) for n in xrange(n_real_words)]
shuffle(training_data)
net.train(training_data, epochs, eta)

testing_data = []
testing_data += [(ordinals(word), 1) for word in words[-test_data_training_size:]]
testing_data += [(ordinals(random_letters(k)), 0) for n in xrange(test_data_training_size)]
shuffle(testing_data)

total_correct = 0
for word, exists in testing_data:
    if exists == round(net.feedforward(word)[0]):
        total_correct += 1
print total_correct, "correct out of", 2 * test_data_training_size
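
# A small usage sketch (not part of the original gist): score one extra candidate
# string with the trained network; "planet" is just an illustrative k-letter example.
candidate = "planet"
score = net.feedforward(ordinals(candidate))[0]
print candidate, "classified as", "real" if round(score) == 1 else "fake"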
# NeuralNetwork.py
import numpy as np
from random import shuffle
from math import sqrt

sigmoid = lambda z: 1 / (1 + np.exp(-z))
sigmoidprime = lambda z: sigmoid(z) * (1 - sigmoid(z))  # derivative of the sigmoid function
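# e.g. sigmoid(0.0) == 0.5 and sigmoidprime(0.0) == 0.25; both apply elementwise to numpy arrays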
# split array into consecutive chunks of length n
def chunk(array, n):
    chunks = []
    for i in xrange(0, len(array), n):
        chunks.append(array[i:i+n])
    return chunks
class NeuralNetwork(object):
    # sizes is the size of each layer in the network
    def __init__(self, sizes):
        self.sizes = sizes
        self.biases = [np.random.randn(y) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]

    # letters is a list of letters that form a word
    def feedforward(self, letters):
        a = letters
        for b, w in zip(self.biases, self.weights):
            # because w is a matrix, np.dot(w, a) is equivalent to matrix multiplication
            a = sigmoid(np.dot(w, a) + b)
        return a
    # applies gradient descent on the entire data set at once because it's small
    # epochs is the number of times to apply gradient descent
    # words is a list of tuples of the form (x, y) where x is the word or fake word and y is
    # 0 if it is fake and 1 if it is real
    # eta is the learning rate
    def train(self, words, epochs, eta):
        eta = float(eta)  # make sure the updates below use float division
        for n in xrange(epochs):
            print "Epoch", n, "starting..."
            nabla_b = [np.zeros(b.shape) for b in self.biases]   # grad. of cost w/ respect to bias
            nabla_w = [np.zeros(w.shape) for w in self.weights]  # grad. of cost w/ respect to weight
            # calculate and add the necessary changes to bias and weight for each individual word
            for word, exists in words:
                delta_nabla_b, delta_nabla_w = self.backprop(word, exists)
                nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
                nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
            self.weights = [w - (eta / len(words)) * nw for w, nw in zip(self.weights, nabla_w)]
            self.biases = [b - (eta / len(words)) * nb for b, nb in zip(self.biases, nabla_b)]
            print "Epoch completed"
    def backprop(self, x, y):
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        activation = x     # the activation of the input layer is the input itself
        activations = [x]  # store the activations layer by layer
        zs = []            # store the z (weighted input) vectors layer by layer
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation) + b  # find the new weighted input
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # now go back
        # let's walk through this step by step
        # backpropagation is all about delta, the partial derivative of C with respect to z[l][j]
        # z[l][j] is the weighted input (w[l][j] dot a + b) of the jth neuron of the lth layer
        # a[l][j] = sigmoid(w[l][j] dot a + b)
        # a[l][j] = sigmoid(z[l][j])
        # we will now use (BP1) to find delta
        # we will refer to the partial derivative of C with respect to a[l][j] as dC/da
        # dC/dz = dC/da * da/dz
        # delta = dC/da * d/dz(sigmoid(z))
        # delta = dC/da * sigmoid'(z)
        # we can calculate dC/da with self.dCda and we already have the sigmoidprime function
        # now we can calculate delta for the output layer
        dCda = self.dCda(activations[-1], y)
        dadz = sigmoidprime(zs[-1])
        delta = dCda * dadz
        # if b is the bias of a given neuron with weighted input z, then
        # dz/db = 1
        # dC/db = dC/dz * dz/db
        # dC/db = dC/dz * 1
        # dC/db = dC/dz
        # dC/db = delta (i.e. BP3)
        # apply this for all the neurons in a given layer to get nabla_b, the gradient of cost with
        # respect to the biases of the neurons in the current layer, the output layer
        nabla_b[-1] = delta
        # we can now calculate the gradient of cost with respect to the weights of the output layer
        # self.weights[l][j][k] is the weight of the connection from neuron k to neuron j, but error
        # (delta) propagates backwards through the network, so the backward pass below uses the
        # weight from j to k, found by transposing self.weights[l] (i.e. np.transpose(self.weights[l]))
        # because we're finding the gradient of the cost with respect to the weights of the
        # connections going into the output layer, -1, we need to fill in nabla_w[-1]
        # nabla_w[l][j][k] = a[l-1][k] * delta[l][j]  (i.e. BP4)
        # where a[l-1][k] is the activation of the kth neuron in the previous layer
        # this is just the outer product of delta with the previous layer's activations
        nabla_w[-1] = np.outer(delta, activations[-2])
        # propagate the error back through the remaining layers (BP2)
        for l in xrange(2, len(self.sizes)):
            z = zs[-l]
            zp = sigmoidprime(z)  # sigmoid'(z)
            delta = np.dot(np.transpose(self.weights[-l+1]), delta) * zp
            nabla_b[-l] = delta
            nabla_w[-l] = np.outer(delta, activations[-l-1])
        return (nabla_b, nabla_w)
    # mean quadratic cost over a list of (x, y) test pairs
    def evaluate(self, test_data):
        return sum((y - self.feedforward(x)) ** 2 for x, y in test_data) / (len(test_data) * 2)

    # derivative of the quadratic cost C = (a - y)^2 / 2 with respect to the output activations
    def dCda(self, output_activations, y):
        return output_activations - y
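
if __name__ == "__main__":
    # Quick illustrative self-test, not part of the original gist: fit the AND
    # function with a tiny 2-3-1 network. The layer sizes, epoch count, and
    # learning rate here are arbitrary choices for the sketch (note that train()
    # prints one line per epoch).
    net = NeuralNetwork([2, 3, 1])
    data = [([0, 0], 0), ([0, 1], 0), ([1, 0], 0), ([1, 1], 1)]
    net.train(data, 200, 3.0)
    for x, y in data:
        print x, "->", net.feedforward(x)[0], "(expected", y, ")"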