Something like "Forward thinking": https://arxiv.org/pdf/1706.02480.pdf
# Layer-wise training of a neural network with a second-order (Newton) method.
# A new layer is added on each iteration and optimized with Newton's method.
# Also an example of TensorFlow eager execution.
# Combines gradients, the Hessian, and a call to an Optimizer.
# Might contain logical errors though, so think for yourself when adapting this code.
# Newton's method in TensorFlow.
# WARNING! This code is memory and computationally intensive, better run it on a GPU;
# larger dimensionality increases computing time significantly.
# Also, you can probably remove the Hessian fixing line (hess = hess + 0.01*tf.eye(...) below) if you use PCA.
# 'Vanilla' Newton's method is intended to work when the loss function to be optimized is convex.
# A one-layer linear network without activation is convex.
# If the activation function is monotonic, the error surface associated with a single-layer model is convex.
# In other cases, the Hessian will have negative eigenvalues at saddle points and other non-convex places of the surface.
# To fix that, you can try different methods. One approach is to do an eigendecomposition of H and invert (flip the sign of) the negative eigenvalues,
# making H "push out" in those directions, as described in this paper: Identifying and attacking the saddle point problem in high-dimensional non-convex optimization (https://papers.nips.cc/paper/5486-identifying-and-attacking-the-saddle-point-problem-in-high-dimensional-non-convex-optimization.pdf).
# A minimal sketch of that eigenvalue fix is included right after the imports below.
# This example uses a one-layer network with leaky relu activation (the activation is just for fun).
# Notice that for a one-layer network the Hessian matrix has a multidiagonal structure,
# so in theory you can reduce computations by not computing derivatives that do not exist in a single-layer NN.
# See also this simple script:
# https://gist.github.com/guillaume-chevalier/6b01c4e43a123abf8db69fa97532993f
import os
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np
from sklearn import datasets
import random

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # suppress tensorflow verbosity
tf.enable_eager_execution()
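
# Added sketch (not used by the script): the eigendecomposition fix for an
# indefinite Hessian mentioned in the header comments. This is a minimal NumPy
# illustration of the idea from the saddle-point paper: take the symmetric
# eigendecomposition H = V diag(w) V^T and replace negative eigenvalues by their
# absolute values, so the Newton step pushes away from saddle points.
# The function name and the 1e-6 floor are my own choices.
def fix_indefinite_hessian(H):
    w, V = np.linalg.eigh(H)               # symmetric eigendecomposition
    w_fixed = np.maximum(np.abs(w), 1e-6)  # flip negative eigenvalues, floor tiny ones
    return (V * w_fixed).dot(V.T)          # rebuild H with the corrected spectrum
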
####
# 1. Training data, sklearn digits dataset
####
def data():
    # import some data to play with
    digits = datasets.load_digits()
    x = digits.data
    y = digits.target

    # digits are 8x8 images = 64 dimensions; the Hessian would be big, so I reduce dimensionality
    # with PCA compression to dim 10
    from sklearn.decomposition import PCA
    pca = PCA(n_components=10)
    x = pca.fit_transform(x)

    # shuffle data (random is imported at the top of the file)
    ntrain = x.shape[0]
    arrayindices = list(range(ntrain))
    random.shuffle(arrayindices)
    x = x[arrayindices]
    y = y[arrayindices]

    # one-hot encoder
    target_vector = y
    n_labels = x.shape[1]  # 10 classes, equal to the PCA dimension, so the weight matrices are square
    y = np.equal.outer(target_vector, np.arange(n_labels)).astype(np.float)

    n_test = 100
    trainx = x[:-n_test]
    train_labels = y[:-n_test]
    testx = x[-n_test:]
    test_labels = y[-n_test:]
    return trainx, train_labels, testx, test_labels
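
# Added note: the np.equal.outer call inside data() builds the one-hot matrix by
# comparing every label against every class index; a tiny standalone example:
#   np.equal.outer([2, 0], np.arange(3)).astype(np.float)
#   -> [[0., 0., 1.],
#       [1., 0., 0.]]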
# train and test data
train_datax, train_datay, test_datax, test_datay = data()
print("Train data shape: {}, Train labels shape: {}".format(train_datax.shape, train_datay.shape))
print("Test data shape: {}, Test labels shape: {}".format(test_datax.shape, test_datay.shape))
####
# 2. Feed data to tensorflow
####
# Weights (all packed into one vector - this is essential) and bias for the linear model
# weight matrix dimensions are input*output, bias: output
shape_w = [train_datax.shape[1], train_datay.shape[1]]
shape_b = [train_datay.shape[1]]
num_variables = shape_w[0]*shape_w[1] + shape_b[0]

# delta is an increment to the weights vector
stored_delta = tfe.Variable(tf.ones(shape=[num_variables]), dtype=tf.float32, name="d")
# learning rate (needs to be a variable in order to use the optimizer)
alpha = tfe.Variable(1.0, dtype=tf.float32, name="alpha")
# single train pass for eager execution
def model(x, y, weights):
    # needed so gradients work and dtypes match
    x = tf.convert_to_tensor(x, dtype=tf.float32)
    y = tf.convert_to_tensor(y, dtype=tf.float32)

    # function to get the matrices back from the packed weight vector
    def vec_to_variables(vec):
        W = tf.reshape(vec[:shape_w[0]*shape_w[1]], shape_w)
        # spelled out explicitly, just to show how the data is packed
        b = tf.reshape(vec[shape_w[0]*shape_w[1]:(shape_w[0]*shape_w[1] + shape_b[0])], shape_b)
        return W, b

    ####
    # 3. Training code
    ####
    # Making a prediction
    def predict(W, b):
        signal = x
        # propagate through the frozen layers
        for layer in weights[:-1]:
            Wi, bi = vec_to_variables(layer)
            signal = tf.matmul(signal, Wi) + bi  # linear model
            signal = tf.nn.leaky_relu(signal, alpha=0.3)
        # and the new, last active layer
        signal = tf.matmul(signal, W) + b  # linear model
        signal = tf.nn.leaky_relu(signal, alpha=0.3)
        return signal

    # and comparing it to the true output
    def compute_loss(labels, predicts):
        return tf.reduce_mean(0.5 * (labels - predicts)**2)

    def evaluate_accuracy(pred, y):
        correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
        return accuracy

    #with tf.GradientTape(persistent=True) as tape:
    # the current working weights are the last entry in the list of stored weights
    model_vars = weights[-1]

    def func_to_diff(model_vars):
        W, b = vec_to_variables(model_vars)
        pred = predict(W, b)
        loss = compute_loss(y, pred)
        return loss

    loss = func_to_diff(model_vars)
    print("Loss before optimization:", loss.numpy())

    # inefficient, yes
    W, b = vec_to_variables(model_vars)
    pred = predict(W, b)
    print("Accuracy before optimization:", evaluate_accuracy(pred, y).numpy())

    # Model gradients vector
    def gradient_function(model_vars):
        grad_fn = tfe.gradients_function(func_to_diff)
        grads = grad_fn(model_vars)[0]
        return grads
    grads = gradient_function(model_vars)

    # Newton second-order weight update rule. The learning rate is found using line search.
    # https://stackoverflow.com/questions/35266370/tensorflow-compute-hessian-matrix-and-higher-order-derivatives#comment78202919_37666032
    # (from the comment thread above:) Use tf.hessians, which will compute the portion of the Hessian
    # relating to each variable in vars (so long as each variable is a vector). If you want the FULL
    # Hessian (including all pairwise interactions between variables), you'll need to start with a
    # single super-vector containing every variable you care about, then slice from there.
    # A commented graph-mode sketch of that tf.hessians route follows hessian_function below.
    def hessian_function(gradient_function, model_vars):
        hess = []
        for idx, _ in enumerate(model_vars):
            # take a specific gradient component and differentiate it again
            def grad_fi_function(model_vars):
                grads = gradient_function(model_vars)
                return grads[idx]
            grads_fi = tfe.gradients_function(grad_fi_function)
            grads_i = grads_fi(model_vars)[0]
            hess.append(grads_i.numpy())  # the Hessian is symmetric, so row/column order doesn't matter here
        return tf.convert_to_tensor(hess, dtype=tf.float32)
    hess = hessian_function(gradient_function, model_vars)
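    # Added note: the tf.hessians route quoted above is a graph-mode alternative;
    # tf.hessians relies on tf.gradients and does not work under eager execution,
    # so it is only sketched here in comments (assumes TF 1.x graph mode; the
    # names below are illustrative only):
    #   v = tf.Variable(tf.zeros([num_variables]))   # single packed super-vector
    #   loss_v = func_to_diff(v)                     # loss built from the packed vector
    #   full_hess = tf.hessians(loss_v, v)[0]        # full [num_variables, num_variables] Hessian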
    # Hessian fixing: make the Hessian always invertible
    hess = hess + 0.01*tf.eye(tf.shape(hess)[0])
    # compute the inverse Hessian
    inv_hess = tf.matrix_inverse(hess)

    # Compute the increment which updates the weight vector: W1 = W - alpha*H^(-1)*G
    # delta := H^(-1)*G
    delta = tf.transpose(tf.matmul(inv_hess, tf.expand_dims(grads, -1)))[0]

    # perform a line search for the best alpha
    opt = tf.train.AdamOptimizer(learning_rate=1.0)  # SGD explodes for some reason
    for i in range(50):
        # build a function which depends on alpha, with delta fixed
        with tf.GradientTape() as tape:
            candidate_vars = model_vars - alpha*delta
            W1, b1 = vec_to_variables(candidate_vars)
            loss_alpha = compute_loss(y, predict(W1, b1))
        alpha_grad = tape.gradient(loss_alpha, [alpha])[0]
        # line search: find an optimal value for alpha
        opt.apply_gradients([(alpha_grad, alpha)], global_step=tf.train.get_or_create_global_step())

    # alpha has been found; store delta
    stored_delta.assign(delta)
    # update the model weights
    model_vars_new = model_vars - alpha*stored_delta
    # update the last active layer and add a new identity layer to be trained on the next iteration
    identity_layer = tf.concat([tf.reshape(tf.eye(shape_w[0], num_columns=shape_b[0], dtype=tf.float32), [-1]),
                                tf.zeros(shape_b[0], dtype=tf.float32)], axis=0)
    weights_new = tf.concat([weights[:-1], [model_vars_new], [identity_layer]], axis=0)  # pack along the first dim

    # compute the new loss
    W, b = vec_to_variables(model_vars_new)
    pred = predict(W, b)
    loss = compute_loss(y, pred)
    return hess, alpha.numpy(), loss.numpy(), evaluate_accuracy(pred, y).numpy(), weights_new.numpy()
####
# 4. Main loop to fit and evaluate the model
####
weights = tf.convert_to_tensor([tf.ones(shape=[num_variables], dtype=tf.float32)])

for iteration in range(10):
    # set the learning-rate value from which the line search starts
    alpha.assign(1.0)
    # I'm cautious and separate those operations,
    # as I want hess and alpha to be computed first
    hessian, final_alpha, new_loss, accuracy, weights = model(train_datax, train_datay, weights)

    # with the PCA reduction each layer has 110 parameters: a 10x10 weight matrix and 10 biases
    # (it would be 650 = 64*10 + 10 without PCA)
    print("Hessian shape: {}".format(hessian.shape))
    print("Hessian norm: {}".format(np.linalg.norm(hessian)))
    print("Alpha {}: {}".format(iteration, final_alpha))
    print("Loss after iteration {}: {}".format(iteration, new_loss))
    print("Train accuracy: %.2f %% " % (accuracy*100.0))
    print("------------------------")