# Gist by @domarps, created May 1, 2018.
import numpy as np


def layernorm_forward(x, gamma, beta, ln_param):
    """
    Forward pass for layer normalization.

    During both training and test time, the incoming data is normalized per
    data point before being scaled and shifted by gamma and beta parameters,
    just as in batch normalization. In contrast to batch normalization, the
    behavior of layer normalization at train and test time is identical, so
    we do not need to keep track of running averages of any sort.

    Input:
    - x: Data of shape (N, D)
    - gamma: Scale parameter of shape (D,)
    - beta: Shift parameter of shape (D,)
    - ln_param: Dictionary with the following keys:
      - eps: Constant for numeric stability

    Returns a tuple of:
    - out: of shape (N, D)
    - cache: Values needed in the backward pass
    """
    out, cache = None, None
    eps = ln_param.get('eps', 1e-5)
    N, D = x.shape
    xT = x.T  # (D, N): transposing lets the batch-norm code be reused almost unchanged
    ###########################################################################
    # TODO: Implement the training-time forward pass for layer norm.          #
    # Normalize the incoming data, and scale and shift the normalized data    #
    # using gamma and beta.                                                   #
    # HINT: this can be done by slightly modifying your training-time         #
    # implementation of batch normalization, and inserting a line or two of   #
    # well-placed code. In particular, can you think of any matrix            #
    # transformations you could perform, that would enable you to copy over   #
    # the batch norm code and leave it almost unchanged?                      #
    ###########################################################################
    feature_mean = np.mean(xT, axis=0)            # per-sample mean, (N,)
    feature_var = np.var(xT, axis=0)              # per-sample variance, (N,)
    scaled_xT = xT - feature_mean                 # (D, N)
    normalize_xT = scaled_xT / np.sqrt(feature_var + eps)
    normalize_x = normalize_xT.T                  # (N, D)
    scaled_x = scaled_xT.T                        # (N, D)
    out = gamma * normalize_x + beta
    cache = {
        'scaled_x': scaled_x,                       # (N, D)
        'normalized_x': normalize_x,                # (N, D)
        'gamma': gamma,                             # (D,)
        'ivar': 1. / np.sqrt(feature_var + eps),    # (N,)
        'sqrtvar': np.sqrt(feature_var + eps),      # (N,)
    }
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################
    return out, cache
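

# A minimal sanity check (not part of the original gist, added for illustration):
# with gamma = 1 and beta = 0, every row of the output should have roughly zero
# mean and unit variance, since layer norm normalizes each sample over its D
# features. The helper name and the test shapes are arbitrary choices.
def _layernorm_forward_sanity_check():
    np.random.seed(0)
    N, D = 4, 5
    x = 10 * np.random.randn(N, D) + 3
    out, _ = layernorm_forward(x, np.ones(D), np.zeros(D), {'eps': 1e-5})
    print('per-row means:', out.mean(axis=1))   # expected ~0
    print('per-row stds: ', out.std(axis=1))    # expected ~1
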
def layernorm_backward(dout, cache):
    """
    Backward pass for layer normalization.

    For this implementation, you can heavily rely on the work you've done
    already for batch normalization.

    Inputs:
    - dout: Upstream derivatives, of shape (N, D)
    - cache: Variable of intermediates from layernorm_forward.

    Returns a tuple of:
    - dx: Gradient with respect to inputs x, of shape (N, D)
    - dgamma: Gradient with respect to scale parameter gamma, of shape (D,)
    - dbeta: Gradient with respect to shift parameter beta, of shape (D,)
    """
    dx, dgamma, dbeta = None, None, None
    ###########################################################################
    # TODO: Implement the backward pass for layer norm.                       #
    #                                                                         #
    # HINT: this can be done by slightly modifying your training-time         #
    # implementation of batch normalization. The hints to the forward pass    #
    # still apply!                                                            #
    ###########################################################################
    N, D = dout.shape
    normalized_x = cache.get('normalized_x')     # (N, D)
    gamma = cache.get('gamma')                   # (D,)
    ivar = cache.get('ivar')                     # (N,)

    # Gradients of the affine parameters: sum over the batch dimension.
    dbeta = np.sum(dout, axis=0)
    dgamma = np.sum(dout * normalized_x, axis=0)

    # Gradient w.r.t. the normalized activations, then reuse the compact
    # batch-norm formula on the transposed arrays: the feature axis D plays
    # the role that the batch axis N plays in batch normalization.
    dxhatT = (dout * gamma).T                    # (D, N)
    xhatT = normalized_x.T                       # (D, N)
    dxT = (1. / D) * ivar * (
        D * dxhatT
        - np.sum(dxhatT, axis=0)
        - xhatT * np.sum(dxhatT * xhatT, axis=0)
    )
    dx = dxT.T                                   # (N, D)
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################
    return dx, dgamma, dbeta
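

# A rough numerical gradient check (illustrative, not from the original gist):
# it compares dx from layernorm_backward against a centered finite-difference
# estimate of d(sum(out * dout))/dx. The helper name, step size h, and the
# test shapes are arbitrary; the max absolute difference should be ~1e-8.
def _layernorm_backward_gradcheck(h=1e-5):
    np.random.seed(1)
    N, D = 4, 5
    x = np.random.randn(N, D)
    gamma = np.random.randn(D)
    beta = np.random.randn(D)
    dout = np.random.randn(N, D)
    _, cache = layernorm_forward(x, gamma, beta, {'eps': 1e-5})
    dx, _, _ = layernorm_backward(dout, cache)
    # Centered finite differences on the scalar loss L = sum(out * dout).
    dx_num = np.zeros_like(x)
    for i in range(N):
        for j in range(D):
            xp, xm = x.copy(), x.copy()
            xp[i, j] += h
            xm[i, j] -= h
            lp = np.sum(layernorm_forward(xp, gamma, beta, {'eps': 1e-5})[0] * dout)
            lm = np.sum(layernorm_forward(xm, gamma, beta, {'eps': 1e-5})[0] * dout)
            dx_num[i, j] = (lp - lm) / (2 * h)
    print('max |dx - dx_num|:', np.max(np.abs(dx - dx_num)))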