standarderror/Autoencoder.py

## Autoencoder.py
### Autoencoder

###################################
## [1] Parameters & hyperparameters
###################################

# shape of data
N = shape(X)[1]  # num observation
D = shape(X)[0]  # num features (dimensionality)

## architecture and learning parameters
h = 200 # size of hidden layer

# weight vectors are randomly initialised from a uniform distribution
r = sqrt(6) / sqrt(D + h + 1)
rand = np.random.RandomState()

W1 = np.asarray(rand.uniform(low = -r, high = r, size = (h, D)))
W2 = np.asarray(rand.uniform(low = -r, high = r, size = (D, h)))

# bias vectors are 0s
b1 = np.zeros((h, 1))
b2 = np.zeros((D, 1))

# hyperparameters
_lambda = 0.0003 # regularization strength (1 is max)
rho = 0.1        # sparsity parameters
beta = 0.3


###############################
## [2] Define helper functions
###############################

# sigmoid activation function
def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))

# unravel theta vector
def unravel(theta):
    W1 = theta[0 : D*h].reshape(h, D)
    W2 = theta[D*h : 2*D*h].reshape(D, h)
    b1 = theta[2*D*h : 2*D*h+h].reshape(h, 1)
    b2 = theta[2*D*h+h : 2*D*h+h+D].reshape(D, 1)
    return W1, W2, b1, b2

# ravel theta vector
def _ravel(W1, W2, b1, b2):
    return numpy.concatenate((W1.flatten(), W2.flatten(), b1.flatten(), b2.flatten()))


############################################################
## [3] Define backpropagation algorithm to train autoencoder
############################################################

def backpropagate(theta):
    W1, W2, b1, b2 = unravel(theta)

    # Forward propagate to calculate output of hidden layer then output layer
    hidden_layer = sigmoid(np.dot(W1, X) + b1)
    output_layer = sigmoid(np.dot(W2, hidden_layer) + b2)

    # calculate error of outputs
    error = output_layer - X

    ## calculate sparsity penalty term and cost component
    rho_hat = np.sum(hidden_layer, axis = 1) / N   # average activation value of hidden layer
    cost_sparsity = beta * np.sum(rho * np.log(rho / rho_hat) + (1 - rho) * np.log((1 - rho) / (1 - rho_hat)))
    delta_sparsity = beta * (-(rho / rho_hat) + ((1 - rho) / (1 - rho_hat)))

    ## calculate cost
    cost_sq_error = 0.5 * np.sum(np.multiply(error, error)) / N
    cost_regularisation = 0.5 * _lambda * (np.sum(np.multiply(W1, W1)) + np.sum(np.multiply(W2, W2)))
    cost = cost_sq_error + cost_regularisation + cost_sparsity


    ## backpropagate error
    delta_output = np.multiply(error, np.multiply(output_layer, 1 - output_layer))
    delta_hidden = np.multiply(np.dot(numpy.transpose(W2), delta_output) + np.transpose(np.matrix(delta_sparsity)),
                             np.multiply(hidden_layer, 1 - hidden_layer))

    ## calculate gradient values
    W1_grad = np.dot(delta_hidden, np.transpose(X))
    W2_grad = np.dot(delta_output, np.transpose(hidden_layer))
    b1_grad = np.sum(delta_hidden, axis = 1)
    b2_grad = np.sum(delta_output, axis = 1)

    ## average and apply regularisation
    W1_grad = W1_grad / N + _lambda * W1
    W2_grad = W2_grad / N + _lambda * W2
    b1_grad = b1_grad / N
    b2_grad = b2_grad / N

    # Ravel weight and bias vectors into a single vector to return to minimise function
    theta_grad = _ravel(np.array(W1_grad), np.array(W2_grad), np.array(b1_grad), np.array(b2_grad))

    return [cost, theta_grad]


## plot weights.py
# plot weights
opt_theta = opt_solution.x
opt_W1,_,_,_  = unravel(opt_theta)
weights = [(opt_W1[i,:]/np.sum(opt_W1[i,:]**2)).reshape(sqrt(D), sqrt(D)) for i in xrange(opt_W1.shape[0])]

for i, w in enumerate(weights):
    plt.subplot(20, 15, i + 1)
    plt.set_cmap('gray')
    plt.axis('off')
    plt.imshow(w)
plt.subplots_adjust(hspace=0.05, wspace=0.05)
plt.gcf().set_size_inches(15, 20)


## preprocessing.py
## Prepare Python & IPython notebook
%pylab inline

import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize
import cPickle, gzip

## Prepare dataset
# Import data

# Load the dataset
with gzip.open('mnist.pkl.gz', 'rb') as f:
    train_set, valid_set, test_set = cPickle.load(f)

# grab the training data
X = train_set[0]

## pre-processing
## standardise & normalise
X -= np.mean(X)   # subtract mean
xstd = np.std(X)
# standardise - divide by SD = scales to [-1,1]
X /= xstd

# rescale from [-1,1] to [0.1, 0.9] which is optimal for sigmoid function
rng = X.max(axis=1, keepdims=True) - X.min(axis=1, keepdims=True)
X = 0.9 - (((0.9 - 0.1) * (X.max(axis=1, keepdims=True) - X)) / rng)

# transpose to align with our design
X = X.T

## train.py
# combine weights and betas into a single vector for optimisation
theta = _ravel(W1, W2, b1, b2)

# Train!
opt_solution = scipy.optimize.minimize(backpropagate, theta, method = 'L-BFGS-B', jac = True, options = {'maxiter': 1000})
	### Autoencoder

	###################################
	## [1] Parameters & hyperparameters
	###################################

	# shape of data
	N = shape(X)[1] # num observation
	D = shape(X)[0] # num features (dimensionality)

	## architecture and learning parameters
	h = 200 # size of hidden layer

	# weight vectors are randomly initialised from a uniform distribution
	r = sqrt(6) / sqrt(D + h + 1)
	rand = np.random.RandomState()

	W1 = np.asarray(rand.uniform(low = -r, high = r, size = (h, D)))
	W2 = np.asarray(rand.uniform(low = -r, high = r, size = (D, h)))

	# bias vectors are 0s
	b1 = np.zeros((h, 1))
	b2 = np.zeros((D, 1))

	# hyperparameters
	_lambda = 0.0003 # regularization strength (1 is max)
	rho = 0.1 # sparsity parameters
	beta = 0.3


	###############################
	## [2] Define helper functions
	###############################

	# sigmoid activation function
	def sigmoid(x):
	return 1.0 / (1 + np.exp(-x))

	# unravel theta vector
	def unravel(theta):
	W1 = theta[0 : D*h].reshape(h, D)
	W2 = theta[Dh : 2D*h].reshape(D, h)
	b1 = theta[2Dh : 2Dh+h].reshape(h, 1)
	b2 = theta[2Dh+h : 2Dh+h+D].reshape(D, 1)
	return W1, W2, b1, b2

	# ravel theta vector
	def _ravel(W1, W2, b1, b2):
	return numpy.concatenate((W1.flatten(), W2.flatten(), b1.flatten(), b2.flatten()))


	############################################################
	## [3] Define backpropagation algorithm to train autoencoder
	############################################################

	def backpropagate(theta):
	W1, W2, b1, b2 = unravel(theta)

	# Forward propagate to calculate output of hidden layer then output layer
	hidden_layer = sigmoid(np.dot(W1, X) + b1)
	output_layer = sigmoid(np.dot(W2, hidden_layer) + b2)

	# calculate error of outputs
	error = output_layer - X

	## calculate sparsity penalty term and cost component
	rho_hat = np.sum(hidden_layer, axis = 1) / N # average activation value of hidden layer
	cost_sparsity = beta * np.sum(rho * np.log(rho / rho_hat) + (1 - rho) * np.log((1 - rho) / (1 - rho_hat)))
	delta_sparsity = beta * (-(rho / rho_hat) + ((1 - rho) / (1 - rho_hat)))

	## calculate cost
	cost_sq_error = 0.5 * np.sum(np.multiply(error, error)) / N
	cost_regularisation = 0.5 * _lambda * (np.sum(np.multiply(W1, W1)) + np.sum(np.multiply(W2, W2)))
	cost = cost_sq_error + cost_regularisation + cost_sparsity


	## backpropagate error
	delta_output = np.multiply(error, np.multiply(output_layer, 1 - output_layer))
	delta_hidden = np.multiply(np.dot(numpy.transpose(W2), delta_output) + np.transpose(np.matrix(delta_sparsity)),
	np.multiply(hidden_layer, 1 - hidden_layer))

	## calculate gradient values
	W1_grad = np.dot(delta_hidden, np.transpose(X))
	W2_grad = np.dot(delta_output, np.transpose(hidden_layer))
	b1_grad = np.sum(delta_hidden, axis = 1)
	b2_grad = np.sum(delta_output, axis = 1)

	## average and apply regularisation
	W1_grad = W1_grad / N + _lambda * W1
	W2_grad = W2_grad / N + _lambda * W2
	b1_grad = b1_grad / N
	b2_grad = b2_grad / N

	# Ravel weight and bias vectors into a single vector to return to minimise function
	theta_grad = _ravel(np.array(W1_grad), np.array(W2_grad), np.array(b1_grad), np.array(b2_grad))

	return [cost, theta_grad]
	# plot weights
	opt_theta = opt_solution.x
	opt_W1,_,_,_ = unravel(opt_theta)
	weights = [(opt_W1[i,:]/np.sum(opt_W1[i,:]**2)).reshape(sqrt(D), sqrt(D)) for i in xrange(opt_W1.shape[0])]

	for i, w in enumerate(weights):
	plt.subplot(20, 15, i + 1)
	plt.set_cmap('gray')
	plt.axis('off')
	plt.imshow(w)
	plt.subplots_adjust(hspace=0.05, wspace=0.05)
	plt.gcf().set_size_inches(15, 20)
	## Prepare Python & IPython notebook
	%pylab inline

	import numpy as np
	import matplotlib.pyplot as plt
	import scipy.optimize
	import cPickle, gzip

	## Prepare dataset
	# Import data

	# Load the dataset
	with gzip.open('mnist.pkl.gz', 'rb') as f:
	train_set, valid_set, test_set = cPickle.load(f)

	# grab the training data
	X = train_set[0]

	## pre-processing
	## standardise & normalise
	X -= np.mean(X) # subtract mean
	xstd = np.std(X)
	# standardise - divide by SD = scales to [-1,1]
	X /= xstd

	# rescale from [-1,1] to [0.1, 0.9] which is optimal for sigmoid function
	rng = X.max(axis=1, keepdims=True) - X.min(axis=1, keepdims=True)
	X = 0.9 - (((0.9 - 0.1) * (X.max(axis=1, keepdims=True) - X)) / rng)

	# transpose to align with our design
	X = X.T
	# combine weights and betas into a single vector for optimisation
	theta = _ravel(W1, W2, b1, b2)

	# Train!
	opt_solution = scipy.optimize.minimize(backpropagate, theta, method = 'L-BFGS-B', jac = True, options = {'maxiter': 1000})