# erichooi/LSTM_backpropagation.py

## LSTM_backpropagation.py
import numpy as np

def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of *x*, element-wise.

    The value is computed from ``exp(-|x|)``, which always lies in (0, 1],
    so large-magnitude negative inputs do not overflow ``np.exp`` (the direct
    formula emits an overflow RuntimeWarning there, even though it still
    rounds to the correct 0.0).

    Args:
        x: A real scalar or array-like.

    Returns:
        A NumPy scalar for scalar input, otherwise an array with the same
        shape as *x*.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))
    # For x >= 0 this is the textbook formula.  For x < 0 the algebraically
    # equal form exp(x) / (1 + exp(x)) is used, so exp never receives a
    # large positive argument.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d array to a scalar; n-d arrays are unchanged.
    return result[()]

# Worked example: two time steps of forward and backward propagation through
# a single LSTM cell with 2 inputs and 1 hidden unit, followed by one
# gradient-descent update.
# Gate suffixes: a = candidate activation (tanh), i = input gate,
# f = forget gate, o = output gate.

# Input weights: one 1x2 row per gate.
Wa = np.array([0.45, 0.25]).reshape(1, 2)
Wi = np.array([0.95, 0.8]).reshape(1, 2)
Wf = np.array([0.7, 0.45]).reshape(1, 2)
Wo = np.array([0.6, 0.4]).reshape(1, 2)

# Recurrent weights (applied to the previous output): 1x1 per gate.
Ua = np.array(0.15).reshape(1, 1)
Ui = np.array(0.8).reshape(1, 1)
Uf = np.array(0.1).reshape(1, 1)
Uo = np.array(0.25).reshape(1, 1)

# Biases: 1x1 per gate.
ba = np.array(0.2).reshape(1, 1)
bi = np.array(0.65).reshape(1, 1)
bf = np.array(0.15).reshape(1, 1)
bo = np.array(0.1).reshape(1, 1)

# stack all the weights and biases in gate order (a, i, f, o)
W = np.vstack((Wa, Wi, Wf, Wo))  # (4, 2)
U = np.vstack((Ua, Ui, Uf, Uo))  # (4, 1)
b = np.vstack((ba, bi, bf, bo))  # (4, 1)

# Cell state and output before time step 0 are zero.
state_minus_1 = np.array(0).reshape(1, 1)
output_minus_1 = np.array(0).reshape(1, 1)
# Inputs (2x1 columns) and targets (1x1) for time steps 0 and 1.
x0 = np.array([1, 2]).reshape(2, 1)
x1 = np.array([0.5, 3]).reshape(2, 1)
y0 = np.array(0.5).reshape(1, 1)
y1 = np.array(1.25).reshape(1, 1)

# forward prop 0
# W @ x and U @ h are genuine matrix products; the gate/state combinations
# below are element-wise products, so they use `*`.  For the 1x1 arrays in
# this example both forms give the same numbers, but only `*` stays correct
# if the hidden size grows.
a0 = np.tanh(np.matmul(Wa, x0) + np.matmul(Ua, output_minus_1) + ba)
i0 = sigmoid(np.matmul(Wi, x0) + np.matmul(Ui, output_minus_1) + bi)
f0 = sigmoid(np.matmul(Wf, x0) + np.matmul(Uf, output_minus_1) + bf)
o0 = sigmoid(np.matmul(Wo, x0) + np.matmul(Uo, output_minus_1) + bo)
state_0 = f0 * state_minus_1 + a0 * i0
output_0 = o0 * np.tanh(state_0)

# forward prop 1
a1 = np.tanh(np.matmul(Wa, x1) + np.matmul(Ua, output_0) + ba)
i1 = sigmoid(np.matmul(Wi, x1) + np.matmul(Ui, output_0) + bi)
f1 = sigmoid(np.matmul(Wf, x1) + np.matmul(Uf, output_0) + bf)
o1 = sigmoid(np.matmul(Wo, x1) + np.matmul(Uo, output_0) + bo)
state_1 = f1 * state_0 + a1 * i1
output_1 = o1 * np.tanh(state_1)

# backward prop 1
# future value: there is no time step 2, so nothing flows back from it
dstate_2 = np.array(0).reshape(1, 1)
f2 = np.array(0).reshape(1, 1)

# output - target is the gradient of the loss 0.5 * (output - y) ** 2
different_error_1 = output_1 - y1  # difference from the real answer
different_output_1 = 0  # gradient from the future output (none at the last step)

doutput_1 = different_error_1 + different_output_1
dstate_1 = doutput_1 * o1 * (1 - np.square(np.tanh(state_1))) + dstate_2 * f2
# Gradients w.r.t. each gate's pre-activation:
# tanh'(z) = 1 - tanh(z)**2 and sigmoid'(z) = s * (1 - s).
da1 = dstate_1 * i1 * (1 - np.square(a1))
di1 = dstate_1 * a1 * i1 * (1 - i1)
df1 = dstate_1 * state_0 * f1 * (1 - f1)
do1 = doutput_1 * np.tanh(state_1) * o1 * (1 - o1)

dgates1 = np.vstack((da1, di1, df1, do1))  # (4, 1), same gate order as W/U/b

dx1 = W.T.dot(dgates1)
# Gradient flowing back into output_0 through the recurrent weights.
different_output_0 = U.T.dot(dgates1)

# backward prop 0
different_error_0 = output_0 - y0

doutput_0 = different_error_0 + different_output_0
dstate_0 = doutput_0 * o0 * (1 - np.square(np.tanh(state_0))) + dstate_1 * f1
da0 = dstate_0 * i0 * (1 - np.square(a0))
di0 = dstate_0 * a0 * i0 * (1 - i0)
df0 = dstate_0 * state_minus_1 * f0 * (1 - f0)
do0 = doutput_0 * np.tanh(state_0) * o0 * (1 - o0)

dgates0 = np.vstack((da0, di0, df0, do0))

dx0 = W.T.dot(dgates0)
different_output_minus_1 = U.T.dot(dgates0)

# SGD update with learning rate 0.1
learning_rate = 0.1
# Each parameter gradient is summed over both time steps.  The step-0 term
# of dU is multiplied by output_minus_1 (zero here), so it does not change
# the result; it is written out to mirror dW and db.
dW = dgates0.dot(x0.T) + dgates1.dot(x1.T)
dU = dgates0.dot(output_minus_1.T) + dgates1.dot(output_0.T)
db = dgates0 + dgates1

# update all weights and biases
W_new = W - learning_rate * dW
U_new = U - learning_rate * dU
b_new = b - learning_rate * db
	import numpy as np

	def sigmoid(x):
	    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of *x*, element-wise.

	    The value is computed from ``exp(-|x|)``, which always lies in (0, 1],
	    so large-magnitude negative inputs do not overflow ``np.exp`` (the direct
	    formula emits an overflow RuntimeWarning there, even though it still
	    rounds to the correct 0.0).

	    Args:
	        x: A real scalar or array-like.

	    Returns:
	        A NumPy scalar for scalar input, otherwise an array with the same
	        shape as *x*.
	    """
	    x = np.asarray(x)
	    decay = np.exp(-np.abs(x))
	    # For x >= 0 this is the textbook formula.  For x < 0 the algebraically
	    # equal form exp(x) / (1 + exp(x)) is used, so exp never receives a
	    # large positive argument.
	    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
	    # Indexing with () unwraps a 0-d array to a scalar; n-d arrays are unchanged.
	    return result[()]

	# NOTE(review): this tab-indented section repeats the column-0 script found
	# earlier in the file; the leading tab on every line is kept as found.
	# Confirm whether the duplicate is intentional.
	#
	# Worked example: two time steps of forward and backward propagation through
	# a single LSTM cell with 2 inputs and 1 hidden unit, followed by one
	# gradient-descent update.
	# Gate suffixes: a = candidate activation (tanh), i = input gate,
	# f = forget gate, o = output gate.

	# Input weights: one 1x2 row per gate.
	Wa = np.array([0.45, 0.25]).reshape(1, 2)
	Wi = np.array([0.95, 0.8]).reshape(1, 2)
	Wf = np.array([0.7, 0.45]).reshape(1, 2)
	Wo = np.array([0.6, 0.4]).reshape(1, 2)

	# Recurrent weights (applied to the previous output): 1x1 per gate.
	Ua = np.array(0.15).reshape(1, 1)
	Ui = np.array(0.8).reshape(1, 1)
	Uf = np.array(0.1).reshape(1, 1)
	Uo = np.array(0.25).reshape(1, 1)

	# Biases: 1x1 per gate.
	ba = np.array(0.2).reshape(1, 1)
	bi = np.array(0.65).reshape(1, 1)
	bf = np.array(0.15).reshape(1, 1)
	bo = np.array(0.1).reshape(1, 1)

	# stack all the weights and biases in gate order (a, i, f, o)
	W = np.vstack((Wa, Wi, Wf, Wo))  # (4, 2)
	U = np.vstack((Ua, Ui, Uf, Uo))  # (4, 1)
	b = np.vstack((ba, bi, bf, bo))  # (4, 1)

	# Cell state and output before time step 0 are zero.
	state_minus_1 = np.array(0).reshape(1, 1)
	output_minus_1 = np.array(0).reshape(1, 1)
	# Inputs (2x1 columns) and targets (1x1) for time steps 0 and 1.
	x0 = np.array([1, 2]).reshape(2, 1)
	x1 = np.array([0.5, 3]).reshape(2, 1)
	y0 = np.array(0.5).reshape(1, 1)
	y1 = np.array(1.25).reshape(1, 1)

	# forward prop 0
	# W @ x and U @ h are genuine matrix products; the gate/state combinations
	# below are element-wise products, so they use `*`.  For the 1x1 arrays in
	# this example both forms give the same numbers, but only `*` stays correct
	# if the hidden size grows.
	a0 = np.tanh(np.matmul(Wa, x0) + np.matmul(Ua, output_minus_1) + ba)
	i0 = sigmoid(np.matmul(Wi, x0) + np.matmul(Ui, output_minus_1) + bi)
	f0 = sigmoid(np.matmul(Wf, x0) + np.matmul(Uf, output_minus_1) + bf)
	o0 = sigmoid(np.matmul(Wo, x0) + np.matmul(Uo, output_minus_1) + bo)
	state_0 = f0 * state_minus_1 + a0 * i0
	output_0 = o0 * np.tanh(state_0)

	# forward prop 1
	a1 = np.tanh(np.matmul(Wa, x1) + np.matmul(Ua, output_0) + ba)
	i1 = sigmoid(np.matmul(Wi, x1) + np.matmul(Ui, output_0) + bi)
	f1 = sigmoid(np.matmul(Wf, x1) + np.matmul(Uf, output_0) + bf)
	o1 = sigmoid(np.matmul(Wo, x1) + np.matmul(Uo, output_0) + bo)
	state_1 = f1 * state_0 + a1 * i1
	output_1 = o1 * np.tanh(state_1)

	# backward prop 1
	# future value: there is no time step 2, so nothing flows back from it
	dstate_2 = np.array(0).reshape(1, 1)
	f2 = np.array(0).reshape(1, 1)

	# output - target is the gradient of the loss 0.5 * (output - y) ** 2
	different_error_1 = output_1 - y1  # difference from the real answer
	different_output_1 = 0  # gradient from the future output (none at the last step)

	doutput_1 = different_error_1 + different_output_1
	dstate_1 = doutput_1 * o1 * (1 - np.square(np.tanh(state_1))) + dstate_2 * f2
	# Gradients w.r.t. each gate's pre-activation:
	# tanh'(z) = 1 - tanh(z)**2 and sigmoid'(z) = s * (1 - s).
	da1 = dstate_1 * i1 * (1 - np.square(a1))
	di1 = dstate_1 * a1 * i1 * (1 - i1)
	df1 = dstate_1 * state_0 * f1 * (1 - f1)
	do1 = doutput_1 * np.tanh(state_1) * o1 * (1 - o1)

	dgates1 = np.vstack((da1, di1, df1, do1))  # (4, 1), same gate order as W/U/b

	dx1 = W.T.dot(dgates1)
	# Gradient flowing back into output_0 through the recurrent weights.
	different_output_0 = U.T.dot(dgates1)

	# backward prop 0
	different_error_0 = output_0 - y0

	doutput_0 = different_error_0 + different_output_0
	dstate_0 = doutput_0 * o0 * (1 - np.square(np.tanh(state_0))) + dstate_1 * f1
	da0 = dstate_0 * i0 * (1 - np.square(a0))
	di0 = dstate_0 * a0 * i0 * (1 - i0)
	df0 = dstate_0 * state_minus_1 * f0 * (1 - f0)
	do0 = doutput_0 * np.tanh(state_0) * o0 * (1 - o0)

	dgates0 = np.vstack((da0, di0, df0, do0))

	dx0 = W.T.dot(dgates0)
	different_output_minus_1 = U.T.dot(dgates0)

	# SGD update with learning rate 0.1
	learning_rate = 0.1
	# Each parameter gradient is summed over both time steps.  The step-0 term
	# of dU is multiplied by output_minus_1 (zero here), so it does not change
	# the result; it is written out to mirror dW and db.
	dW = dgates0.dot(x0.T) + dgates1.dot(x1.T)
	dU = dgates0.dot(output_minus_1.T) + dgates1.dot(output_0.T)
	db = dgates0 + dgates1

	# update all weights and biases
	W_new = W - learning_rate * dW
	U_new = U - learning_rate * dU
	b_new = b - learning_rate * db