Batch and Stochastic Backpropagation implemented in Julia for XOR problem
# - Batch and Stochastic Backpropagation Implementation in Julia based on the book "Pattern Classification" by Richard Duda
# - Neural network with 1 hidden layer applied to XOR classification problem
# Copyright (C) 2015 Eric Aislan Antonelo
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
using Gadfly
using Distributions

d = 2    # input dimension
n_H = 2  # number of hidden units
c = 2    # number of classes
n_O = c == 2 ? 1 : c  # number of output units (one unit suffices for 2 classes)

## hidden layer weights, drawn uniformly from [-1/sqrt(d), 1/sqrt(d)]
w_ = 1 / sqrt(d)
#w_ = 0.1
dist = Uniform(-w_, +w_)
w_H = rand(dist, (d + 1, n_H))  # hidden unit weights, (d+1) x n_H including the bias row

## output layer weights, drawn uniformly from [-1/sqrt(n_H), 1/sqrt(n_H)]
w_ = 1 / sqrt(n_H)
#w_ = 0.1
dist = Uniform(-w_, +w_)
w_O = rand(dist, (n_H + 1, n_O))  # output unit weights, (n_H+1) x n_O including the bias row

# XOR dataset: targets are +1 for the XOR-true patterns, -1 otherwise
X = ([-1, 1], [1, -1], [1, 1], [0, 0])
T = (1, 1, -1, -1)
N = length(X)
function sigmoid(net)
    # scaled tanh activation f(net) = a*tanh(b*net), as in Duda et al.
    a, b = 1.716, 2/3.0
    a * tanh(b * net)
end

function sigmoid_first_derivative(net)
    # f'(net) = a*b*sech(b*net)^2 = a*b*(1 - tanh(b*net)^2)
    a, b = 1.716, 2/3.0
    a * b * (1 - tanh(b * net)^2)
end
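# Sanity check (a sketch, not part of the original gist): compare the analytic
# derivative against a central finite difference at an arbitrary test point.
let net = 0.3, h = 1e-6
    numeric = (sigmoid(net + h) - sigmoid(net - h)) / (2h)
    println("f'(", net, ") analytic: ", sigmoid_first_derivative(net),
            "  numeric: ", numeric)
end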
function forward_propagation(x, w_O, w_H)
    net_H = transpose(w_H) * [x; 1]    # augment the input with a bias term
    y_H = sigmoid.(net_H)
    net_O = transpose(w_O) * [y_H; 1]  # augment the hidden output with a bias term
    (net_H, y_H), (net_O, sigmoid.(net_O))
end
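# Usage example (sketch): a single forward pass on the first XOR pattern with
# the untrained weights; z0 is a length-1 vector because n_O == 1.
(net_H0, y_H0), (net_O0, z0) = forward_propagation(X[1], w_O, w_H)
println("initial output for ", X[1], ": ", z0)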
function train(x, t, w_O, w_H)
    learning_rate = 0.4
    (net_H, y), (net_O, z) = forward_propagation(x, w_O, w_H)
    # output-layer sensitivity: delta_O = (t - z) * f'(net_O)
    delta_O = (t .- z) .* sigmoid_first_derivative.(net_O)
    dw_O = learning_rate * [y; 1] * transpose(delta_O)  # (n_H+1) x n_O
    # hidden-layer sensitivity, backpropagated through the output weights
    delta_H = sigmoid_first_derivative.(net_H) .* (w_O[1:n_H, :] * delta_O)
    dw_H = learning_rate * [x; 1] * transpose(delta_H)  # (d+1) x n_H
    dw_O, dw_H
end
function J(w_O, w_H, X, T)
    # mean squared error over the whole training set
    err = 0.0
    for m in 1:N
        x, t = X[m], T[m]
        (net_H, y), (net_O, z) = forward_propagation(x, w_O, w_H)
        err += sum((t .- z).^2)  # sum over all output units
    end
    err / (2N)
end
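# Gradient check (sketch, not in the original gist): train() returns
# learning_rate * (negative gradient) of the per-example error 0.5*(t - z)^2,
# so dividing its output by the hard-coded learning_rate 0.4 should match a
# central finite difference on a single weight, up to sign.
let h = 1e-5, lr = 0.4
    dw_O_, dw_H_ = train(X[1], T[1], w_O, w_H)
    function per_example_error(w)
        (_, (_, z)) = forward_propagation(X[1], w, w_H)
        0.5 * sum((T[1] .- z).^2)
    end
    w_plus = copy(w_O);  w_plus[1, 1] += h
    w_minus = copy(w_O); w_minus[1, 1] -= h
    numeric = (per_example_error(w_plus) - per_example_error(w_minus)) / (2h)
    println("analytic: ", dw_O_[1, 1] / lr, "  numeric (negated): ", -numeric)
end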
function batch_backprop(w_O, w_H)
    epochs = 100
    err = zeros(epochs)
    for r in 1:epochs
        dw_O = zeros(size(w_O))
        dw_H = zeros(size(w_H))
        for m in 1:N  # accumulate the weight updates over all N training examples
            x, t = X[m], T[m]
            dw_O_, dw_H_ = train(x, t, w_O, w_H)
            dw_O += dw_O_
            dw_H += dw_H_
        end
        # apply the averaged update once per epoch
        w_O += dw_O / N
        w_H += dw_H / N
        err[r] = J(w_O, w_H, X, T)
        println(transpose(w_O))
        println(err[r])
    end
    err, w_O, w_H
end
function stochastic_backprop(w_O, w_H, alfa=0.9)
    iterations = 1000
    err = zeros(iterations)
    dw_O = zeros(size(w_O))
    dw_H = zeros(size(w_H))
    for m in 1:iterations
        i = rand(1:N)  # choose a random training example
        x, t = X[i], T[i]
        dw_O_, dw_H_ = train(x, t, w_O, w_H)
        # momentum: exponential moving average of the per-example updates
        dw_O = dw_O_ * (1 - alfa) + alfa * dw_O
        dw_H = dw_H_ * (1 - alfa) + alfa * dw_H
        w_O += dw_O
        w_H += dw_H
        err[m] = J(w_O, w_H, X, T)
        println(transpose(w_O))
        println(err[m])
    end
    err, w_O, w_H
end
err, w_O, w_H = batch_backprop(w_O, w_H)
#err, w_O, w_H = stochastic_backprop(w_O, w_H, 0.5)
plot(x=1:length(err), y=err)
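# Optional (sketch): persist the learning curve to disk. SVG output works
# without extra rendering backends; the file name is arbitrary.
p = plot(x=1:length(err), y=err, Geom.line,
         Guide.xlabel("epoch"), Guide.ylabel("J"))
draw(SVG("xor_learning_curve.svg", 6inch, 4inch), p)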
# print the network output for each training pattern
for m in 1:N
    x, t = X[m], T[m]
    (net_H, y), (net_O, z) = forward_propagation(x, w_O, w_H)
    println(x, ": ", z)
end
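# Classification summary (sketch): with targets in {-1, +1} and a single
# output unit, sign(z[1]) serves as the predicted class label.
predicted_sign(x) = sign(forward_propagation(x, w_O, w_H)[2][2][1])
correct = count(m -> predicted_sign(X[m]) == sign(T[m]), 1:N)
println("correctly classified: ", correct, " / ", N)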
# inspect the final weights and an example bias-augmented input vector
w_O
w_H
[X[1]; 1]