Skip to content

Instantly share code, notes, and snippets.

@sthware
Last active December 22, 2016 07:51
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save sthware/c47824c116e6a61a56d9 to your computer and use it in GitHub Desktop.
Save sthware/c47824c116e6a61a56d9 to your computer and use it in GitHub Desktop.
A partially-refactored, variable-name-expanded, heavily commented version of Britz's "Neural Network from scratch" code.
#!/usr/bin/env python
## NAME
## Neural Network from scratch
##
## DESCRIPTION
## A partially-refactored, variable-name-expanded, heavily commented version
## of Denny Britz's code. Original blog post:
## http://www.wildml.com/2015/09/implementing-a-neural-network-from-scratch/
##
## Designed for maximum readibility and concept comprehension. Output
## is a scatterplot image (png format.)
##
## AUTHOR
## Spencer Hoffman <spencer.hoffman@gmail.com>
##
##
import matplotlib
import matplotlib.pyplot as plotter
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.linear_model
def main():
## The dimensionality of the layers.
layer_dimensions = {
'input': 2,
'output': 2,
'hidden': 3
}
## Distance between points in the output.
POINT_DISTANCE = 0.01
## Padding for the output graphs
ADJUSTMENT_VAL = 0.5
## Gradient descent params
LEARNING_RATE = 0.01
REGULARIZATION_STRENGTH = 0.01
NUM_PASSES = 20000
np.random.seed(0)
## Get some training data.
samples, labels = sklearn.datasets.make_moons(200, noise=0.20)
## Training set size
num_examples = (len(samples))
plotter.scatter(
samples[:,0],
samples[:,1],
s=40,
c=labels,
cmap=plotter.cm.Spectral
)
logistic_regression_classifier = sklearn.linear_model.LogisticRegressionCV()
logistic_regression_classifier.fit(samples, labels)
plot_decision_boundary(
lambda x: logistic_regression_classifier.predict(x),
samples,
labels,
ADJUSTMENT_VAL,
POINT_DISTANCE
)
plotter.savefig('logistic_regression.png')
model = build_model(
layer_dimensions,
num_examples,
samples,
labels,
REGULARIZATION_STRENGTH,
LEARNING_RATE,
NUM_PASSES=NUM_PASSES,
show_loss=True
)
plot_decision_boundary(
lambda x: predict(model, x),
samples,
labels,
ADJUSTMENT_VAL,
POINT_DISTANCE
)
plotter.savefig('final_plot.png')
def calculate_loss(model, samples, labels, num_examples, STRENGTH):
""" Measures our network's learning error
Args:
model: The weights and biases for the network
samples: Training data points
labels: Training data class labels
num_examples: Size of sample set
STRENGTH: The value for regularization to prevent overfitting
Returns:
A floating point value representing our network learning error
"""
probabilities, t = forward_propagate(
model['weights'],
model['biases'],
samples
)
## Calculate the loss using cross-entropy (AKA negative log-likelihood.)
log_probabilities = -np.log(probabilities[range(num_examples), labels])
data_loss = np.sum(log_probabilities)
## Regularize the data loss.
data_loss += (STRENGTH / 2) * (
np.sum(np.square(model['weights']['layer1'])) +
np.sum(np.square(model['weights']['layer2']))
)
return 1.0 / num_examples * data_loss
def plot_decision_boundary(predictor_func, samples, labels, ADJUSTMENT_VAL,
distance):
""" Sets up the plots for the output
Args:
predictor_func: A function we use to get our network's predictions
samples: Training data points
labels: Training data class labels
ADJUSTMENT_VAL: Tweak value for the sample and label values
distance: Space between points in the output
"""
## Set min and max values with some tweaks.
samples_min, samples_max = \
samples[:, 0].min() - ADJUSTMENT_VAL, samples[:, 0].max() + ADJUSTMENT_VAL
labels_min, labels_max = \
samples[:, 1].min() - ADJUSTMENT_VAL, samples[:, 1].max() + ADJUSTMENT_VAL
## Generate a grid of points with some amount of distance between them.
sample_section, label_section = np.meshgrid(
np.arange(samples_min, samples_max, distance),
np.arange(labels_min, labels_max, distance)
)
## Predict the function value for the whole grid.
prediction_value = predictor_func(np.c_[
sample_section.ravel(),
label_section.ravel()
])
## Reshape the prediction array so it displays properly.
prediction_value = prediction_value.reshape(sample_section.shape)
## Draw the decision boundary.
plotter.contourf(
sample_section,
label_section,
prediction_value,
cmap=plotter.cm.Spectral
)
## Draw the points.
plotter.scatter(
samples[:, 0],
samples[:, 1],
c=labels,
cmap=plotter.cm.Spectral
)
def build_model(layer_dimensions, num_examples, samples, labels, STRENGTH,
LEARNING_RATE, NUM_PASSES=20000, show_loss=False):
""" Calculates weights and biases for the vectors/neurons in our network.
Args:
layer_dimensions: number of neurons in each layer
num_examples: Size of sample set
samples: Training data points
labels: Training data class labels
STRENGTH: The value for regularization to prevent overfitting
LEARNING_RATE: AKA step size. Determines how quickly we allow
the network to learn. Low values make convergence happen properly,
but at the cost of speed. High values are faster, but can cause
divergence.
NUM_PASSES: How many times we loop when performing gradient descent
show_loss: Whether or not to show the loss values during the run
Returns:
A dictionary with keys for weights and biases
"""
## Initialize params to random values. We need to learn these.
np.random.seed(0)
## Weights and biases for the vectors/neurons in the hidden layers. We
## set them up to be random values initially.
## Weights describe how important a given input is to its repsective output.
weights = {
'layer1': np.random.randn(layer_dimensions['input'],
layer_dimensions['hidden']) / np.sqrt(layer_dimensions['input']),
'layer2': np.random.randn(
layer_dimensions['hidden'],
layer_dimensions['output']) / np.sqrt(layer_dimensions['hidden'])
}
## Biases represent thresholds for activating a neuron. The higher
## this is, the more likely a neuron is to fire.
biases = {
'layer1': np.zeros((1, layer_dimensions['hidden'])),
'layer2': np.zeros((1, layer_dimensions['output']))
}
## Store the differences between our expectations, and what we've found
## so far. Used to improve our actual weight and bias values.
error_deltas = {
'weights': { 'layer1': None, 'layer2': None },
'biases': { 'layer1': None, 'layer2': None },
'layer2': None,
'layer1': None
}
## Perform backward propagation of errors using gradient descent.
for i in xrange(0, NUM_PASSES):
## First, forward propagate.
current_probabilities, activations = forward_propagate(
weights,
biases,
samples
)
## Continue backpropagation of errors.
## We perform an assignment to make it clear that layer2 starts out
## as the current set of probablities, though we could just set this
## in the call above. We start at the uppermost hidden layer and
## work our way back. Now start calculating the error deltas from
## our hidden layers to help get our weights and biases closer to
## the correct values.
error_deltas['layer2'] = current_probabilities
## We subtract one from the deltas to help us determine how far off
## from the desired value we are. If the values are close to zero (i.e.,
## very small negative numbers), then we are close. If they are
## larger negative numbers, then we are further from said desired value.
error_deltas['layer2'][range(num_examples), labels] -= 1
## The next steps are an implementation of Calculus' chain rule.
error_deltas['weights']['layer2'] = (activations.T).dot(
error_deltas['layer2']
)
error_deltas['biases']['layer2'] = np.sum(
error_deltas['layer2'],
axis=0,
keepdims=True
)
error_deltas['layer1'] = error_deltas['layer2'].dot(
weights['layer2'].T) * (1 - np.power(activations, 2))
error_deltas['weights']['layer1'] = np.dot(
samples.T,
error_deltas['layer1']
)
error_deltas['biases']['layer1'] = np.sum(error_deltas['layer1'], axis=0)
## Regularize weights to combat overfitting. Note that using
## regularization combined with softmax() is effectively using
## Maximum a posteriori estimation (MAP)
error_deltas['weights']['layer2'] += STRENGTH * weights['layer2']
error_deltas['weights']['layer1'] += STRENGTH * weights['layer1']
## Update our weights and biases using the specified learning rate.
weights['layer1'] += -LEARNING_RATE * error_deltas['weights']['layer1']
biases['layer1'] += -LEARNING_RATE * error_deltas['biases']['layer1']
weights['layer2'] += -LEARNING_RATE * error_deltas['weights']['layer2']
biases['layer2'] += -LEARNING_RATE * error_deltas['biases']['layer2']
if show_loss and i % 1000 == 0:
print "Loss after iteration %i: %f" % (i,
calculate_loss(
{'weights': weights, 'biases': biases},
samples,
labels,
num_examples,
STRENGTH
)
)
return {'weights': weights, 'biases': biases}
def predict(model, samples):
""" Predict the highest probabilities. This is the output of the network
Args:
model: The weights and biases for the network
samples: Training data points
Returns: An ndarray of indices indicating the highest current scores.
"""
probabilities, t = forward_propagate(
model['weights'],
model['biases'],
samples
)
return np.argmax(probabilities, axis=1)
def softmax(products):
""" We've already squashed the values in our products vector into the
appropriate range (from -1 to 1, since we're using tanh() as
our non-linearity function.) We now take these values and
create normalized class probabilities. Another way of thinking of
this is that we are performing Maximum Likelihood Estimation (MLE)
Args:
products: Raw scores from our forward propagation step.
Returns: A matrix of floating point numbers representing
normalized class probablities.
"""
## Exponentiation creates un-normalized products.
exponentiated_scores = np.exp(products)
## Division of the sum normalizes them.
return exponentiated_scores / np.sum(
exponentiated_scores,
axis=1,
keepdims=True
)
def forward_propagate(weights, biases, samples):
""" Get the probabilities and activation values of our pattern's
input through the network. Activation values are returned for
times when we need to perform backpropagation.
Args:
weights: The weights for each vector in the layers
biases: The bias values for each neuron in the layers
samples: Training data points
Returns:
A list of probabilities after running through the network,
and activations from the first hidden layer of the network.
"""
## Calculate our inputs.
layer1_products = samples.dot(weights['layer1']) + biases['layer1']
## Call our non-linear activation function; transforms the inputs of a
## layer to its outputs. This is where the neurons of the
## hidden layers do their work.
layer1_activations = np.tanh(layer1_products)
## Now we run the output from the first hidden layer into the second
## hidden layer.
layer2_products = layer1_activations.dot(weights['layer2']) + biases['layer2']
## Finally, we convert the raw scores (which are also activations) we
## receive from the second hidden layer into probabilities using softmax.
probabilities = softmax(layer2_products)
return probabilities, layer1_activations
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment