Something like "Forward thinking": https://arxiv.org/pdf/1706.02480.pdf
# Layer-wise training of a neural network with a second-order (Newton) method.
# A new layer is added on each iteration and optimized with Newton's method.
# Also an example of TensorFlow eager execution.
# Combines gradients, the Hessian, and a call to an Optimizer.
# Might contain logical errors though, so think for yourself when adapting this code.
# Newton's method in TensorFlow.
# WARNING! This code is memory and computationally intensive, better run it on a GPU;
# larger dimensionality increases computing time significantly.
# Also, you can probably remove the Hessian fixing line (hess = hess + 0.01*tf.eye(...) below) if you use PCA.
# 'Vanilla' Newton's method is intended to work when the loss function to be optimized is convex.
# A one-layer linear network without activation is convex.
# If the activation function is monotonic, the error surface associated with a single-layer model is convex.
# In other cases, the Hessian will have negative eigenvalues at saddle points and other non-convex places of the surface.
# To fix that, you can try different methods. One approach is to do an eigendecomposition of H and invert (flip the sign of) the negative eigenvalues,
# making H "push out" in those directions, as described in this paper: Identifying and attacking the saddle point problem in high-dimensional non-convex optimization (https://papers.nips.cc/paper/5486-identifying-and-attacking-the-saddle-point-problem-in-high-dimensional-non-convex-optimization.pdf).
# A minimal sketch of that eigenvalue fix is included right after the imports below.
# This example uses a one-layer network with leaky relu activation (the activation is just for fun).
# Notice that for a one-layer network the Hessian matrix has a multidiagonal structure,
# so in theory you can reduce computations by not computing derivatives that do not exist in a single-layer NN.
# See also this simple script:
# https://gist.github.com/guillaume-chevalier/6b01c4e43a123abf8db69fa97532993f
import os
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np
from sklearn import datasets
import random

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # suppress tensorflow verbosity
tf.enable_eager_execution()
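
# Added sketch (not used by the script): the eigendecomposition fix for an
# indefinite Hessian mentioned in the header comments. This is a minimal NumPy
# illustration of the idea from the saddle-point paper: take the symmetric
# eigendecomposition H = V diag(w) V^T and replace negative eigenvalues by their
# absolute values, so the Newton step pushes away from saddle points.
# The function name and the 1e-6 floor are my own choices.
def fix_indefinite_hessian(H):
    w, V = np.linalg.eigh(H)               # symmetric eigendecomposition
    w_fixed = np.maximum(np.abs(w), 1e-6)  # flip negative eigenvalues, floor tiny ones
    return (V * w_fixed).dot(V.T)          # rebuild H with the corrected spectrum
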
####
# 1. Training data, sklearn digits dataset
####
def data():
    # import some data to play with
    digits = datasets.load_digits()
    x = digits.data
    y = digits.target

    # digits are 8x8 images = 64 dimensions; the Hessian would be big, so I reduce dimensionality
    # with PCA compression to dim 10
    from sklearn.decomposition import PCA
    pca = PCA(n_components=10)
    x = pca.fit_transform(x)

    # shuffle data (random is imported at the top of the file)
    ntrain = x.shape[0]
    arrayindices = list(range(ntrain))
    random.shuffle(arrayindices)
    x = x[arrayindices]
    y = y[arrayindices]

    # one-hot encoder
    target_vector = y
    n_labels = x.shape[1]  # 10 classes, equal to the PCA dimension, so the weight matrices are square
    y = np.equal.outer(target_vector, np.arange(n_labels)).astype(np.float)

    n_test = 100
    trainx = x[:-n_test]
    train_labels = y[:-n_test]
    testx = x[-n_test:]
    test_labels = y[-n_test:]
    return trainx, train_labels, testx, test_labels
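
# Added note: the np.equal.outer call inside data() builds the one-hot matrix by
# comparing every label against every class index; a tiny standalone example:
#   np.equal.outer([2, 0], np.arange(3)).astype(np.float)
#   -> [[0., 0., 1.],
#       [1., 0., 0.]]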
# train and test data
train_datax, train_datay, test_datax, test_datay = data()
print("Train data shape: {}, Train labels shape: {}".format(train_datax.shape, train_datay.shape))
print("Test data shape: {}, Test labels shape: {}".format(test_datax.shape, test_datay.shape))
####
# 2. Feed data to tensorflow
####
# Weights (all packed into one vector - this is essential) and bias for the linear model
# weight matrix dimensions are input*output, bias: output
shape_w = [train_datax.shape[1], train_datay.shape[1]]
shape_b = [train_datay.shape[1]]
num_variables = shape_w[0]*shape_w[1] + shape_b[0]

# delta is an increment to the weights vector
stored_delta = tfe.Variable(tf.ones(shape=[num_variables]), dtype=tf.float32, name="d")
# learning rate (needs to be a variable in order to use the optimizer)
alpha = tfe.Variable(1.0, dtype=tf.float32, name="alpha")
# single train pass for eager execution
def model(x, y, weights):
    # needed so gradients work and dtypes match
    x = tf.convert_to_tensor(x, dtype=tf.float32)
    y = tf.convert_to_tensor(y, dtype=tf.float32)

    # function to get the matrices back from the packed weight vector
    def vec_to_variables(vec):
        W = tf.reshape(vec[:shape_w[0]*shape_w[1]], shape_w)
        # spelled out explicitly, just to show how the data is packed
        b = tf.reshape(vec[shape_w[0]*shape_w[1]:(shape_w[0]*shape_w[1] + shape_b[0])], shape_b)
        return W, b

    ####
    # 3. Training code
    ####
    # Making a prediction
    def predict(W, b):
        signal = x
        # propagate through the frozen layers
        for layer in weights[:-1]:
            Wi, bi = vec_to_variables(layer)
            signal = tf.matmul(signal, Wi) + bi  # linear model
            signal = tf.nn.leaky_relu(signal, alpha=0.3)
        # and the new, last active layer
        signal = tf.matmul(signal, W) + b  # linear model
        signal = tf.nn.leaky_relu(signal, alpha=0.3)
        return signal

    # and comparing it to the true output
    def compute_loss(labels, predicts):
        return tf.reduce_mean(0.5 * (labels - predicts)**2)

    def evaluate_accuracy(pred, y):
        correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
        return accuracy

    #with tf.GradientTape(persistent=True) as tape:
    # the current working weights are the last entry in the list of stored weights
    model_vars = weights[-1]

    def func_to_diff(model_vars):
        W, b = vec_to_variables(model_vars)
        pred = predict(W, b)
        loss = compute_loss(y, pred)
        return loss

    loss = func_to_diff(model_vars)
    print("Loss before optimization:", loss.numpy())

    # inefficient, yes
    W, b = vec_to_variables(model_vars)
    pred = predict(W, b)
    print("Accuracy before optimization:", evaluate_accuracy(pred, y).numpy())

    # Model gradients vector
    def gradient_function(model_vars):
        grad_fn = tfe.gradients_function(func_to_diff)
        grads = grad_fn(model_vars)[0]
        return grads
    grads = gradient_function(model_vars)

    # Newton second-order weight update rule. The learning rate is found using line search.
    # https://stackoverflow.com/questions/35266370/tensorflow-compute-hessian-matrix-and-higher-order-derivatives#comment78202919_37666032
    # (from the comment thread above:) Use tf.hessians, which will compute the portion of the Hessian
    # relating to each variable in vars (so long as each variable is a vector). If you want the FULL
    # Hessian (including all pairwise interactions between variables), you'll need to start with a
    # single super-vector containing every variable you care about, then slice from there.
    # A commented graph-mode sketch of that tf.hessians route follows hessian_function below.
    def hessian_function(gradient_function, model_vars):
        hess = []
        for idx, _ in enumerate(model_vars):
            # take a specific gradient component and differentiate it again
            def grad_fi_function(model_vars):
                grads = gradient_function(model_vars)
                return grads[idx]
            grads_fi = tfe.gradients_function(grad_fi_function)
            grads_i = grads_fi(model_vars)[0]
            hess.append(grads_i.numpy())  # the Hessian is symmetric, so row/column order doesn't matter here
        return tf.convert_to_tensor(hess, dtype=tf.float32)
    hess = hessian_function(gradient_function, model_vars)
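    # Added note: the tf.hessians route quoted above is a graph-mode alternative;
    # tf.hessians relies on tf.gradients and does not work under eager execution,
    # so it is only sketched here in comments (assumes TF 1.x graph mode; the
    # names below are illustrative only):
    #   v = tf.Variable(tf.zeros([num_variables]))   # single packed super-vector
    #   loss_v = func_to_diff(v)                     # loss built from the packed vector
    #   full_hess = tf.hessians(loss_v, v)[0]        # full [num_variables, num_variables] Hessian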
    # Hessian fixing: make the Hessian always invertible
    hess = hess + 0.01*tf.eye(tf.shape(hess)[0])
    # compute the inverse Hessian
    inv_hess = tf.matrix_inverse(hess)

    # Compute the increment which updates the weight vector: W1 = W - alpha*H^(-1)*G
    # delta := H^(-1)*G
    delta = tf.transpose(tf.matmul(inv_hess, tf.expand_dims(grads, -1)))[0]

    # perform a line search for the best alpha
    opt = tf.train.AdamOptimizer(learning_rate=1.0)  # SGD explodes for some reason
    for i in range(50):
        # build a function which depends on alpha, with delta fixed
        with tf.GradientTape() as tape:
            candidate_vars = model_vars - alpha*delta
            W1, b1 = vec_to_variables(candidate_vars)
            loss_alpha = compute_loss(y, predict(W1, b1))
        alpha_grad = tape.gradient(loss_alpha, [alpha])[0]
        # line search: find an optimal value for alpha
        opt.apply_gradients([(alpha_grad, alpha)], global_step=tf.train.get_or_create_global_step())

    # alpha has been found; store delta
    stored_delta.assign(delta)
    # update the model weights
    model_vars_new = model_vars - alpha*stored_delta
    # update the last active layer and add a new identity layer to be trained on the next iteration
    identity_layer = tf.concat([tf.reshape(tf.eye(shape_w[0], num_columns=shape_b[0], dtype=tf.float32), [-1]),
                                tf.zeros(shape_b[0], dtype=tf.float32)], axis=0)
    weights_new = tf.concat([weights[:-1], [model_vars_new], [identity_layer]], axis=0)  # pack along the first dim

    # compute the new loss
    W, b = vec_to_variables(model_vars_new)
    pred = predict(W, b)
    loss = compute_loss(y, pred)
    return hess, alpha.numpy(), loss.numpy(), evaluate_accuracy(pred, y).numpy(), weights_new.numpy()
####
# 4. Main loop to fit and evaluate the model
####
weights = tf.convert_to_tensor([tf.ones(shape=[num_variables], dtype=tf.float32)])

for iteration in range(10):
    # set the learning-rate value from which the line search starts
    alpha.assign(1.0)
    # I'm cautious and separate those operations,
    # as I want hess and alpha to be computed first
    hessian, final_alpha, new_loss, accuracy, weights = model(train_datax, train_datay, weights)

    # with the PCA reduction each layer has 110 parameters: a 10x10 weight matrix and 10 biases
    # (it would be 650 = 64*10 + 10 without PCA)
    print("Hessian shape: {}".format(hessian.shape))
    print("Hessian norm: {}".format(np.linalg.norm(hessian)))
    print("Alpha {}: {}".format(iteration, final_alpha))
    print("Loss after iteration {}: {}".format(iteration, new_loss))
    print("Train accuracy: %.2f %% " % (accuracy*100.0))
    print("------------------------")