Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
20150715 Autoencoder
### Autoencoder
###################################
## [1] Parameters & hyperparameters
###################################
# shape of data
N = shape(X)[1] # num observation
D = shape(X)[0] # num features (dimensionality)
## architecture and learning parameters
h = 200 # size of hidden layer
# weight vectors are randomly initialised from a uniform distribution
r = sqrt(6) / sqrt(D + h + 1)
rand = np.random.RandomState()
W1 = np.asarray(rand.uniform(low = -r, high = r, size = (h, D)))
W2 = np.asarray(rand.uniform(low = -r, high = r, size = (D, h)))
# bias vectors are 0s
b1 = np.zeros((h, 1))
b2 = np.zeros((D, 1))
# hyperparameters
_lambda = 0.0003 # regularization strength (1 is max)
rho = 0.1 # sparsity parameters
beta = 0.3
###############################
## [2] Define helper functions
###############################
# sigmoid activation function
def sigmoid(x):
return 1.0 / (1 + np.exp(-x))
# unravel theta vector
def unravel(theta):
W1 = theta[0 : D*h].reshape(h, D)
W2 = theta[D*h : 2*D*h].reshape(D, h)
b1 = theta[2*D*h : 2*D*h+h].reshape(h, 1)
b2 = theta[2*D*h+h : 2*D*h+h+D].reshape(D, 1)
return W1, W2, b1, b2
# ravel theta vector
def _ravel(W1, W2, b1, b2):
return numpy.concatenate((W1.flatten(), W2.flatten(), b1.flatten(), b2.flatten()))
############################################################
## [3] Define backpropagation algorithm to train autoencoder
############################################################
def backpropagate(theta):
W1, W2, b1, b2 = unravel(theta)
# Forward propagate to calculate output of hidden layer then output layer
hidden_layer = sigmoid(np.dot(W1, X) + b1)
output_layer = sigmoid(np.dot(W2, hidden_layer) + b2)
# calculate error of outputs
error = output_layer - X
## calculate sparsity penalty term and cost component
rho_hat = np.sum(hidden_layer, axis = 1) / N # average activation value of hidden layer
cost_sparsity = beta * np.sum(rho * np.log(rho / rho_hat) + (1 - rho) * np.log((1 - rho) / (1 - rho_hat)))
delta_sparsity = beta * (-(rho / rho_hat) + ((1 - rho) / (1 - rho_hat)))
## calculate cost
cost_sq_error = 0.5 * np.sum(np.multiply(error, error)) / N
cost_regularisation = 0.5 * _lambda * (np.sum(np.multiply(W1, W1)) + np.sum(np.multiply(W2, W2)))
cost = cost_sq_error + cost_regularisation + cost_sparsity
## backpropagate error
delta_output = np.multiply(error, np.multiply(output_layer, 1 - output_layer))
delta_hidden = np.multiply(np.dot(numpy.transpose(W2), delta_output) + np.transpose(np.matrix(delta_sparsity)),
np.multiply(hidden_layer, 1 - hidden_layer))
## calculate gradient values
W1_grad = np.dot(delta_hidden, np.transpose(X))
W2_grad = np.dot(delta_output, np.transpose(hidden_layer))
b1_grad = np.sum(delta_hidden, axis = 1)
b2_grad = np.sum(delta_output, axis = 1)
## average and apply regularisation
W1_grad = W1_grad / N + _lambda * W1
W2_grad = W2_grad / N + _lambda * W2
b1_grad = b1_grad / N
b2_grad = b2_grad / N
# Ravel weight and bias vectors into a single vector to return to minimise function
theta_grad = _ravel(np.array(W1_grad), np.array(W2_grad), np.array(b1_grad), np.array(b2_grad))
return [cost, theta_grad]
# plot weights
opt_theta = opt_solution.x
opt_W1,_,_,_ = unravel(opt_theta)
weights = [(opt_W1[i,:]/np.sum(opt_W1[i,:]**2)).reshape(sqrt(D), sqrt(D)) for i in xrange(opt_W1.shape[0])]
for i, w in enumerate(weights):
plt.subplot(20, 15, i + 1)
plt.set_cmap('gray')
plt.axis('off')
plt.imshow(w)
plt.subplots_adjust(hspace=0.05, wspace=0.05)
plt.gcf().set_size_inches(15, 20)
## Prepare Python & IPython notebook
%pylab inline
import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize
import cPickle, gzip
## Prepare dataset
# Import data
# Load the dataset
with gzip.open('mnist.pkl.gz', 'rb') as f:
train_set, valid_set, test_set = cPickle.load(f)
# grab the training data
X = train_set[0]
## pre-processing
## standardise & normalise
X -= np.mean(X) # subtract mean
xstd = np.std(X)
# standardise - divide by SD = scales to [-1,1]
X /= xstd
# rescale from [-1,1] to [0.1, 0.9] which is optimal for sigmoid function
rng = X.max(axis=1, keepdims=True) - X.min(axis=1, keepdims=True)
X = 0.9 - (((0.9 - 0.1) * (X.max(axis=1, keepdims=True) - X)) / rng)
# transpose to align with our design
X = X.T
# combine weights and betas into a single vector for optimisation
theta = _ravel(W1, W2, b1, b2)
# Train!
opt_solution = scipy.optimize.minimize(backpropagate, theta, method = 'L-BFGS-B', jac = True, options = {'maxiter': 1000})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.