@pranv
Last active July 6, 2017 11:48
An Efficient, Batched, Stateful LSTM layer in Numpy
import numpy as np

from utils import orthogonal, tanh, sigmoid, dtanh, dsigmoid


class LSTM(object):
    """Long Short Term Memory Unit

    Parameters
    ----------
    d_input : int
        Length of the input per time step
    d_hidden : int
        Number of LSTM cells
    f_bias_init : float
        Forget gate bias initialization. In long-term memory tasks,
        a larger positive value can be crucial for learning
        long-range dependencies.
    name : str
        A label for debugging purposes
    """
    def __init__(self, d_input, d_hidden, f_bias_init=1.0, name=''):
        # use a single concatenated matrix for all gates and the input:
        # row blocks are [Z; I; F; O], columns are [input | hidden | bias]
        W = np.zeros((4 * d_hidden, d_input + d_hidden + 1))
        # orthogonal input -> hidden, identity hidden -> hidden,
        # all biases 0 except the forget gate
        for i in range(4):
            W[i * d_hidden:(i + 1) * d_hidden, :d_input] = orthogonal((d_hidden, d_input))
            W[i * d_hidden:(i + 1) * d_hidden, d_input:-1] = np.eye(d_hidden)
        W[2 * d_hidden:3 * d_hidden, -1] = f_bias_init
        self.W = W
        self.d_input, self.d_hidden, self.name = d_input, d_hidden, name
        self.forget()
    def __call__(self, X):
        # single-step forward pass; X has shape (1, d_input, B)
        X = X[0]
        B = X.shape[1]
        d_input, d_hidden = self.d_input, self.d_hidden

        if self.t == 0:
            # fresh sequence: zero initial cell state and hidden state
            self.c_acc[-1] = np.zeros((d_hidden, B))
            self.H_p = np.zeros((d_hidden, B))

        t = self.t
        # stack [input; previous hidden; 1] so one matmul computes all gates
        inp = np.zeros((d_input + d_hidden + 1, B))
        inp[:d_input] = X
        inp[d_input:-1] = self.H_p
        inp[-1] = 1.0  # bias row, so the bias column of W takes effect
        V = np.dot(self.W, inp)
        V[:d_hidden] = tanh(V[:d_hidden])      # candidate values Z
        V[d_hidden:] = sigmoid(V[d_hidden:])   # gates I, F, O
        Z, I, F, O = np.split(V, 4, axis=0)
        c = Z * I + F * self.c_acc[t - 1]
        C = tanh(c)
        H = O * C
        # accumulate for backprop
        self.c_acc[t] = c; self.C_acc[t] = C; self.Z_acc[t] = Z
        self.I_acc[t] = I; self.F_acc[t] = F; self.O_acc[t] = O
        self.inp_acc[t] = inp; self.H_p = H; self.t += 1
        return H[np.newaxis]
    def forward(self, X):
        # full forward pass; X has shape (T, d_input, B)
        T, n, B = X.shape
        H = np.empty((T, self.d_hidden, B))
        for t in range(T):
            H[t] = self.__call__(X[t:t + 1])
        return H
    def backward(self, dH):
        # dH has shape (T, d_hidden, B): gradients w.r.t. every hidden output
        T, _, B = dH.shape
        d_input, d_hidden = self.d_input, self.d_hidden
        dW = np.zeros_like(self.W)
        dX = np.zeros((T, d_input, B))
        dh_p = np.zeros((d_hidden, B))  # gradient arriving through the next hidden state
        dc_p = np.zeros((d_hidden, B))  # gradient arriving through the next cell state
        for t in reversed(range(T)):
            c = self.c_acc[t]; C = self.C_acc[t]; Z = self.Z_acc[t]
            I = self.I_acc[t]; F = self.F_acc[t]; O = self.O_acc[t]
            inp = self.inp_acc[t]

            dh = dH[t] + dh_p
            dO = C * dh
            dC = O * dh
            dc = (1.0 - C ** 2) * dC  # through C = tanh(c)
            dc = dc + dc_p            # plus the carry from step t + 1
            dF = self.c_acc[t - 1] * dc
            dc_p = F * dc
            dI = Z * dc
            dZ = I * dc

            # back through the gate nonlinearities
            dz = dtanh(dZ, Z)
            di = dsigmoid(dI, I)
            df = dsigmoid(dF, F)
            do = dsigmoid(dO, O)

            dV = np.concatenate([dz, di, df, do], axis=0)
            dW += np.dot(dV, inp.T)
            dinp = np.dot(self.W.T, dV)
            dX[t] += dinp[:d_input]
            dh_p = dinp[d_input:-1]

        self.dW = dW
        self.forget()
        return dX
    def set_parameters(self, P):
        self.W = np.reshape(P, self.W.shape)

    def get_parameters(self):
        return self.W.flatten()

    def get_gradients(self):
        return self.dW.flatten()
    def forget(self):
        # reset the step counter and drop all cached activations
        self.t = 0
        self.c_acc = {}; self.C_acc = {}; self.Z_acc = {}
        self.I_acc = {}; self.F_acc = {}; self.O_acc = {}
        self.inp_acc = {}
        self.H_p = None

    def __repr__(self):
        return 'Layer: ' + self.name + '\tNumber of Parameters: ' + str(self.W.size)
# utils.py
import numpy as np


def orthogonal(shape):
    """
    taken from: https://github.com/Lasagne/Lasagne/blob/master/lasagne/init.py#L327-L367
    """
    a = np.random.normal(0.0, 1.0, shape)
    u, _, v = np.linalg.svd(a, full_matrices=False)
    q = u if u.shape == shape else v  # pick the one with the correct shape
    return q


def tanh(X):
    return np.tanh(X)


def sigmoid(X):
    return 1.0 / (1.0 + np.exp(-X))


def dtanh(dY, Y):
    # tanh' expressed via the output: 1 - tanh(x)^2 = 1 - Y^2
    return (1.0 - Y ** 2) * dY


def dsigmoid(dY, Y):
    # sigmoid' expressed via the output: sigmoid(x)(1 - sigmoid(x)) = Y(1 - Y)
    return Y * (1.0 - Y) * dY
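
Since the flat parameter vector is exposed through get_parameters / set_parameters / get_gradients, the layer can be sanity-checked with finite differences. A minimal sketch, assuming both files above are importable; the sizes and the sum-loss here are arbitrary choices for illustration:

import numpy as np

np.random.seed(0)
layer = LSTM(d_input=3, d_hidden=5, f_bias_init=1.0)  # small sizes keep the check fast
X = np.random.randn(4, 3, 2)  # (T, d_input, B)

def loss(P):
    layer.set_parameters(P)
    layer.forget()  # reset the stateful layer between evaluations
    return layer.forward(X).sum()

# analytic gradient: dLoss/dH is all ones for a sum loss
P = layer.get_parameters().copy()
loss(P)                             # forward pass caches activations
layer.backward(np.ones((4, 5, 2)))  # fills layer.dW and resets state
dP = layer.get_gradients()

# numerical gradient on a few random entries via central differences
eps = 1e-5
for idx in np.random.randint(0, P.size, size=5):
    Pp, Pm = P.copy(), P.copy()
    Pp[idx] += eps
    Pm[idx] -= eps
    num = (loss(Pp) - loss(Pm)) / (2 * eps)
    print(idx, num, dP[idx])  # the two gradients should closely agree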
@pranv (Author) commented Jun 8, 2016

All tensors must be 3-dimensional, with the first dimension indexing time and the last indexing the batch.

  • forward: forward pass over T time steps
  • __call__ (i.e. L(X) for a layer L): single-step forward pass
  • backward: backward pass with respect to all the forward calls made so far (see the usage sketch below)
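
A minimal usage sketch following this shape convention; all sizes here are made up, and the LSTM class from the gist is assumed to be in scope:

import numpy as np

T, d_input, d_hidden, B = 20, 8, 16, 4

layer = LSTM(d_input, d_hidden, name='lstm1')
X = np.random.randn(T, d_input, B)  # (time, features, batch)

H = layer.forward(X)                # (T, d_hidden, B)
dH = np.random.randn(*H.shape)      # stand-in for a real loss gradient
dX = layer.backward(dH)             # (T, d_input, B); also populates layer.dW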

@arjunmenon commented

Hey, can you put up an example file showing how to use this for an NLP task?
