|
# -*- coding: utf-8 -*- |
|
"""Neural Network class for Assignment 3 |
|
|
|
This module implements the class MLP, a three-layer neural network

(two tanh hidden layers and a softmax output) for solving the two-spiral problem.
|
|
|
__author__ = "Lester James V. Miranda" |
|
__email__ = "lester.miranda@toki.waseda.jp" |
|
""" |
|
import numpy as np |
|
|
|
class MLP(object): |
|
"""This is a three-layer neural network for solving the two-spiral problem |
|
for the Neural Networks Class Spring 2017. The network has one hidden layer, |
|
and has a tanh activation function after the first fully-connected net. Thus, |
|
|
|
input_layer ---- hidden_layer x 2 ---- output_layer |
|
[tanh] [softmax] |
|
|
|
    To use this class, simply initialize the model and train it:

        model = MLP()  # assuming you are using the default parameters

        model.train(X, y)

        pred = model.predict(X)
|
|
|
""" |
|
def __init__(self, n_inputs=2, n_hidden=20, n_classes=2, std=1e-4): |
|
"""Initializes the parameters of the neural network. |
|
Here, we are initializing the weights into small values, whereas |
|
the biases are initialized to zero. |
|
|
|
        Inputs:

        - n_inputs: dimensionality of the input.

        - n_hidden: nb. of nodes in each hidden layer.

        - n_classes: nb. of classes in the output layer.

        - std: standard deviation of the normal distribution used to

          sample the initial weights.
|
|
|
""" |
|
|
|
# Initialize the parameters |
|
self.params = {} |
|
|
|
# First layer weights and biases |
|
self.params['W1'] = std * np.random.randn(n_inputs, n_hidden) |
|
        self.params['b1'] = np.zeros(n_hidden)
|
|
|
# Second layer weights and biases |
|
self.params['W2'] = std * np.random.randn(n_hidden, n_hidden) |
|
        self.params['b2'] = np.zeros(n_hidden)
|
|
|
# Output layer weights and biases |
|
self.params['W3'] = std * np.random.randn(n_hidden, n_classes) |
|
        self.params['b3'] = np.zeros(n_classes)
|
|
|
# Initialize the velocities |
|
self.velocity = {} |
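        # The velocities hold the running momentum term for each parameter;

        # starting them at zero makes the first update plain gradient descent.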
|
|
|
# First layer velocity |
|
self.velocity['W1'] = np.zeros((n_inputs, n_hidden)) |
|
self.velocity['b1'] = np.zeros(n_hidden) |
|
|
|
        # Second layer velocity
|
self.velocity['W2'] = np.zeros((n_hidden, n_hidden)) |
|
self.velocity['b2'] = np.zeros(n_hidden) |
|
|
|
        # Output layer velocity
|
self.velocity['W3'] = np.zeros((n_hidden, n_classes)) |
|
self.velocity['b3'] = np.zeros(n_classes) |
|
|
|
|
|
|
|
def loss(self, X, y=None, reg=0.0): |
|
""" |
|
        Compute the loss and gradients for the three-layer fully-connected

        neural network.
|
|
|
        Inputs:

        - X: input data of shape (n_examples, n_features).

        - y: vector of training labels.

        - reg: regularization strength.



        Returns:

        - If y is None, returns the score (logits) matrix.

        - If y is not None, returns a tuple of:

            loss: computed loss (data loss plus regularization loss).

            grads: dictionary containing the gradients of all parameters.
|
|
|
""" |
|
|
|
W1, b1 = self.params['W1'], self.params['b1'] |
|
W2, b2 = self.params['W2'], self.params['b2'] |
|
W3, b3 = self.params['W3'], self.params['b3'] |
|
N, D = X.shape |
|
|
|
#----------------- Forward propagation ------------------ |
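        # Shapes: X is (N, D); z1 and a1 are (N, n_hidden); z2 and a2 are

        # (N, n_hidden); z3 (the logits) is (N, n_classes).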
|
|
|
z1 = X.dot(W1) + b1 # First layer pre-activation |
|
a1 = np.tanh(z1) # First layer activation (using tanh) |
|
z2 = a1.dot(W2) + b2 # Second layer pre-activation |
|
a2 = np.tanh(z2) # Second layer activation (using tanh) |
|
z3 = a2.dot(W3) + b3 # Third layer pre-activation |
|
logits = z3 # Keep logits |
|
|
|
if y is None: |
|
return logits |
|
|
|
#------------ Perform softmax cross-entropy ------------ |
|
|
|
        # Compute the softmax probabilities

        exp_scores = np.exp(logits - np.max(logits, axis=1, keepdims=True))  # shift logits for numerical stability

        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
|
|
|
        # Compute the cross-entropy loss

        correct_logprobs = -np.log(probs[range(N), y])

        data_loss = np.sum(correct_logprobs) / N
|
reg_loss = 0.5 * reg * (np.sum(W1 * W1) + np.sum(W2 * W2) + np.sum(W3 * W3)) |
|
loss = data_loss + reg_loss |
|
|
|
#------------------- Backpropagation -------------------- |
|
grads = {} |
|
|
|
# Compute the logits gradients |
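        # For softmax with cross-entropy, the gradient w.r.t. the logits has

        # the closed form dL/dz3 = (probs - one_hot(y)) / N, implemented below

        # by subtracting 1 at the true-class entries and dividing by N.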
|
dlogits = probs |
|
        dlogits[range(N), y] -= 1
|
dlogits /= N |
|
|
|
# Propagate the loss back to the output layer |
|
grads['W3'] = np.dot(a2.T, dlogits) |
|
grads['b3'] = np.sum(dlogits, axis=0) |
|
|
|
# Compute the hidden layer 2 gradient |
|
dhidden_2 = np.multiply(self._tanh_deriv(z2), np.dot(dlogits, W3.T)) |
|
|
|
# Propagate the loss back to hidden layer 2 |
|
grads['W2'] = np.dot(a1.T, dhidden_2) |
|
        grads['b2'] = np.sum(dhidden_2, axis=0)
|
|
|
# Compute the hidden layer 1 gradient |
|
dhidden_1 = np.multiply(self._tanh_deriv(z1), np.dot(dhidden_2, W2.T)) |
|
|
|
# Propagate the loss back to hidden layer 1 |
|
grads['W1'] = np.dot(X.T, dhidden_1) |
|
grads['b1'] = np.sum(dhidden_1, axis=0) |
|
|
|
# Accumulate gradients in reg term |
|
grads['W3'] += reg * W3 |
|
grads['W2'] += reg * W2 |
|
grads['W1'] += reg * W1 |
|
|
|
return loss, grads |
|
|
|
    def train(self, X, y, learning_rate=0.5, mu=0.05, num_iters=50000,

              print_step=1000, reg_param=0.0, verbose=1):
|
"""Trains the created neural network model using the parameters stated. |
|
|
|
        Inputs:

        - X: input data of shape (N, D). Each X[i] is a training sample.

          A single sample must still be 2-D, e.g. np.array([[x1, x2]]).

        - y: vector of training labels.

        - learning_rate: learning rate used in the gradient descent updates.

        - mu: momentum coefficient for the velocity updates.

        - num_iters: number of training iterations.

        - print_step: interval (in iterations) between progress reports.

        - reg_param: regularization strength passed to the loss function.

        - verbose: if >= 1, prints a summary after training; if >= 2, also

          prints progress every print_step iterations.



        Returns:

        - A dict with per-iteration 'loss_history' and 'acc_history' lists.
|
""" |
|
|
|
# Define history list |
|
loss_history = [] |
|
train_acc_history = [] |
|
|
|
for i in range(num_iters): |
|
            # Perform forward propagation and compute the loss and gradients.

            loss, grads = self.loss(X, y, reg_param)
|
|
|
            # Append the current loss to the history list.
|
loss_history.append(loss) |
|
|
|
# Add momentum |
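            # Classical momentum update: v <- mu * v - lr * dW, then W <- W + v.

            # The velocity accumulates past gradients so updates keep moving in

            # consistent directions and oscillations are damped.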
|
self.velocity['W1'] = (self.velocity['W1'] * mu) - learning_rate * grads['W1'] |
|
self.velocity['b1'] = (self.velocity['b1'] * mu) - learning_rate * grads['b1'] |
|
self.velocity['W2'] = (self.velocity['W2'] * mu) - learning_rate * grads['W2'] |
|
self.velocity['b2'] = (self.velocity['b2'] * mu) - learning_rate * grads['b2'] |
|
self.velocity['W3'] = (self.velocity['W3'] * mu) - learning_rate * grads['W3'] |
|
self.velocity['b3'] = (self.velocity['b3'] * mu) - learning_rate * grads['b3'] |
|
|
|
# Adjust the neural network parameters |
|
self.params['W1'] += self.velocity['W1'] |
|
self.params['b1'] += self.velocity['b1'] |
|
self.params['W2'] += self.velocity['W2'] |
|
self.params['b2'] += self.velocity['b2'] |
|
self.params['W3'] += self.velocity['W3'] |
|
self.params['b3'] += self.velocity['b3'] |
|
|
|
# Check accuracy |
|
train_acc = (self.predict(X) == y).mean() |
|
train_acc_history.append(train_acc) |
|
|
|
            if (verbose >= 2) and ((i + 1) % print_step == 0):

                print('Iteration %d / %d: loss %f, acc %f' % (i + 1, num_iters, loss, train_acc))
|
|
|
|
|
        if verbose >= 1:

            print('Done! loss: %f, acc: %f' % (loss, train_acc))
|
|
|
return {'loss_history': loss_history, |
|
'acc_history': train_acc_history} |
|
|
|
def predict(self, X): |
|
"""Use the trained weights of the neural network to determine the class. |
|
The way this works is that it performs a feedforward propagation to compute |
|
for the logits, and use the logits to get the max. |
|
|
|
Inputs: |
|
- X: numpy ndarray of shape (N,D) giving N D-dimensional data points to |
|
          classify.
|
|
|
Returns: |
|
- y_pred: numpy ndarray prediction of shape (N,). |
|
""" |
|
z1 = X.dot(self.params['W1']) + self.params['b1'] |
|
a1 = np.tanh(z1) |
|
z2 = a1.dot(self.params['W2']) + self.params['b2'] |
|
a2 = np.tanh(z2) |
|
z3 = a2.dot(self.params['W3']) + self.params['b3'] |
|
logits = z3 |
|
y_pred = np.argmax(logits, axis=1) |
|
|
|
return y_pred |
|
|
|
    def _tanh_deriv(self, x):

        """Helper function to compute the first derivative of tanh,

        d/dx tanh(x) = 1 - tanh(x)**2.



        Input:

        - x: point at which the derivative is evaluated.
|
""" |
|
return 1.0 - np.tanh(x)**2 |
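

if __name__ == '__main__':

    # Minimal usage sketch, not part of the assignment spec: the spiral

    # generator below is an illustrative assumption, not the official

    # two-spiral data loader.

    np.random.seed(0)

    n_points = 100

    theta = np.sqrt(np.random.rand(n_points)) * 4 * np.pi

    # Two interleaved spirals, the second rotated 180 degrees from the first.

    spiral_a = np.column_stack((theta * np.cos(theta), theta * np.sin(theta)))

    X = np.vstack((spiral_a, -spiral_a)) + 0.1 * np.random.randn(2 * n_points, 2)

    X /= np.abs(X).max()  # scale inputs to [-1, 1] to keep tanh from saturating

    y = np.concatenate((np.zeros(n_points, dtype=int), np.ones(n_points, dtype=int)))

    model = MLP()

    history = model.train(X, y, learning_rate=0.1, num_iters=5000, verbose=1)

    print('Final training accuracy: %f' % (model.predict(X) == y).mean())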