# Gist by @domarps, created May 1, 2018.
import numpy as np


def layernorm_forward(x, gamma, beta, ln_param):
    """
    Forward pass for layer normalization.

    During both training and test time, the incoming data is normalized per
    data point before being scaled and shifted by gamma and beta parameters,
    just as in batch normalization. In contrast to batch normalization, the
    behavior of layer normalization at train and test time is identical, so
    we do not need to keep track of running averages of any sort.

    Input:
    - x: Data of shape (N, D)
    - gamma: Scale parameter of shape (D,)
    - beta: Shift parameter of shape (D,)
    - ln_param: Dictionary with the following keys:
      - eps: Constant for numeric stability

    Returns a tuple of:
    - out: of shape (N, D)
    - cache: Values needed in the backward pass
    """
    out, cache = None, None
    eps = ln_param.get('eps', 1e-5)
    N, D = x.shape
    xT = x.T  # (D, N): transposing lets the batch-norm code be reused almost unchanged
    ###########################################################################
    # TODO: Implement the training-time forward pass for layer norm.          #
    # Normalize the incoming data, and scale and shift the normalized data    #
    # using gamma and beta.                                                   #
    # HINT: this can be done by slightly modifying your training-time         #
    # implementation of batch normalization, and inserting a line or two of   #
    # well-placed code. In particular, can you think of any matrix            #
    # transformations you could perform, that would enable you to copy over   #
    # the batch norm code and leave it almost unchanged?                      #
    ###########################################################################
    feature_mean = np.mean(xT, axis=0)            # per-sample mean, (N,)
    feature_var = np.var(xT, axis=0)              # per-sample variance, (N,)
    scaled_xT = xT - feature_mean                 # (D, N)
    normalize_xT = scaled_xT / np.sqrt(feature_var + eps)
    normalize_x = normalize_xT.T                  # (N, D)
    scaled_x = scaled_xT.T                        # (N, D)
    out = gamma * normalize_x + beta
    cache = {
        'scaled_x': scaled_x,                       # (N, D)
        'normalized_x': normalize_x,                # (N, D)
        'gamma': gamma,                             # (D,)
        'ivar': 1. / np.sqrt(feature_var + eps),    # (N,)
        'sqrtvar': np.sqrt(feature_var + eps),      # (N,)
    }
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################
    return out, cache
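

# A minimal sanity check (not part of the original gist, added for illustration):
# with gamma = 1 and beta = 0, every row of the output should have roughly zero
# mean and unit variance, since layer norm normalizes each sample over its D
# features. The helper name and the test shapes are arbitrary choices.
def _layernorm_forward_sanity_check():
    np.random.seed(0)
    N, D = 4, 5
    x = 10 * np.random.randn(N, D) + 3
    out, _ = layernorm_forward(x, np.ones(D), np.zeros(D), {'eps': 1e-5})
    print('per-row means:', out.mean(axis=1))   # expected ~0
    print('per-row stds: ', out.std(axis=1))    # expected ~1
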
def layernorm_backward(dout, cache):
    """
    Backward pass for layer normalization.

    For this implementation, you can heavily rely on the work you've done
    already for batch normalization.

    Inputs:
    - dout: Upstream derivatives, of shape (N, D)
    - cache: Variable of intermediates from layernorm_forward.

    Returns a tuple of:
    - dx: Gradient with respect to inputs x, of shape (N, D)
    - dgamma: Gradient with respect to scale parameter gamma, of shape (D,)
    - dbeta: Gradient with respect to shift parameter beta, of shape (D,)
    """
    dx, dgamma, dbeta = None, None, None
    ###########################################################################
    # TODO: Implement the backward pass for layer norm.                       #
    #                                                                         #
    # HINT: this can be done by slightly modifying your training-time         #
    # implementation of batch normalization. The hints to the forward pass    #
    # still apply!                                                            #
    ###########################################################################
    N, D = dout.shape
    normalized_x = cache.get('normalized_x')     # (N, D)
    gamma = cache.get('gamma')                   # (D,)
    ivar = cache.get('ivar')                     # (N,)

    # Gradients of the affine parameters: sum over the batch dimension.
    dbeta = np.sum(dout, axis=0)
    dgamma = np.sum(dout * normalized_x, axis=0)

    # Gradient w.r.t. the normalized activations, then reuse the compact
    # batch-norm formula on the transposed arrays: the feature axis D plays
    # the role that the batch axis N plays in batch normalization.
    dxhatT = (dout * gamma).T                    # (D, N)
    xhatT = normalized_x.T                       # (D, N)
    dxT = (1. / D) * ivar * (
        D * dxhatT
        - np.sum(dxhatT, axis=0)
        - xhatT * np.sum(dxhatT * xhatT, axis=0)
    )
    dx = dxT.T                                   # (N, D)
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################
    return dx, dgamma, dbeta
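

# A rough numerical gradient check (illustrative, not from the original gist):
# it compares dx from layernorm_backward against a centered finite-difference
# estimate of d(sum(out * dout))/dx. The helper name, step size h, and the
# test shapes are arbitrary; the max absolute difference should be ~1e-8.
def _layernorm_backward_gradcheck(h=1e-5):
    np.random.seed(1)
    N, D = 4, 5
    x = np.random.randn(N, D)
    gamma = np.random.randn(D)
    beta = np.random.randn(D)
    dout = np.random.randn(N, D)
    _, cache = layernorm_forward(x, gamma, beta, {'eps': 1e-5})
    dx, _, _ = layernorm_backward(dout, cache)
    # Centered finite differences on the scalar loss L = sum(out * dout).
    dx_num = np.zeros_like(x)
    for i in range(N):
        for j in range(D):
            xp, xm = x.copy(), x.copy()
            xp[i, j] += h
            xm[i, j] -= h
            lp = np.sum(layernorm_forward(xp, gamma, beta, {'eps': 1e-5})[0] * dout)
            lm = np.sum(layernorm_forward(xm, gamma, beta, {'eps': 1e-5})[0] * dout)
            dx_num[i, j] = (lp - lm) / (2 * h)
    print('max |dx - dx_num|:', np.max(np.abs(dx - dx_num)))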