
@psycharo-zz
Last active January 29, 2020 02:04
import tensorflow as tf
from functools import reduce


def mmul(*tensors):
    # left-to-right matrix product of all arguments
    # (reduce, not tf.foldl: foldl unpacks a single tensor along axis 0,
    # it does not chain a Python tuple of differently-shaped matrices)
    return reduce(tf.matmul, tensors)


def msym(X):
    # symmetric part of X
    return (X + tf.matrix_transpose(X)) / 2


def mdiag(X):
    # keep only the main diagonal of X
    return tf.matrix_diag(tf.matrix_diag_part(X))


@tf.RegisterGradient('Svd')
def gradient_svd(op, dL_ds, dL_dU, dL_dV):
    s, U, V = op.outputs
    # NOTE: based on https://arxiv.org/pdf/1509.07838.pdf
    # this version works for square matrices only;
    # in practice it means that only the U_1 part of (17) is used
    assert U.shape == V.shape, 'square matrices only'
    assert U.shape[1] == U.shape[2], 'square matrices only'
    I = tf.eye(tf.shape(s)[1])
    S = tf.matrix_diag(s)
    dL_dS = tf.matrix_diag(dL_ds)
    V_T = tf.matrix_transpose(V)
    U_T = tf.matrix_transpose(U)
    s_2 = tf.square(s)
    # K[i, j] = 1 / (s_j^2 - s_i^2) off the diagonal, 0 on it
    K = 1.0 / (s_2[:, tf.newaxis, :] - s_2[:, :, tf.newaxis] + I) - I
    D = mmul(dL_dU, tf.matrix_diag(1.0 / s))
    D_T = tf.matrix_transpose(D)
    return (mmul(D, V_T) +
            mmul(U, mdiag(dL_dS - mmul(U_T, D)), V_T) +
            2 * mmul(U, S, msym(K * mmul(V_T, dL_dV - mmul(V, D_T, U, S))), V_T))
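One way to sanity-check the formula outside TensorFlow is a plain NumPy port of the same square-matrix expression, verified against a finite-difference gradient of the nuclear norm L(A) = sum of singular values. For that loss, dL/ds is all ones and dL/dU = dL/dV = 0, so the formula should reduce to the known analytic gradient U V^T. The function name `svd_grad_square` is my own; this is a sketch of the idea, not the gist's code.

```python
import numpy as np
from functools import reduce


def mmul(*mats):
    # left-to-right matrix product
    return reduce(np.matmul, mats)


def msym(X):
    return (X + X.T) / 2


def mdiag(X):
    return np.diag(np.diag(X))


def svd_grad_square(A, dL_ds, dL_dU, dL_dV):
    """NumPy port of the square-matrix SVD backward formula above."""
    U, s, Vt = np.linalg.svd(A)
    V = Vt.T
    n = s.size
    I = np.eye(n)
    S = np.diag(s)
    s2 = s ** 2
    # K[i, j] = 1 / (s_j^2 - s_i^2) off the diagonal, 0 on it
    K = 1.0 / (s2[None, :] - s2[:, None] + I) - I
    D = mmul(dL_dU, np.diag(1.0 / s))
    return (mmul(D, V.T)
            + mmul(U, mdiag(np.diag(dL_ds) - mmul(U.T, D)), V.T)
            + 2 * mmul(U, S, msym(K * mmul(V.T, dL_dV - mmul(V, D.T, U, S))), V.T))


# Nuclear-norm check: dL/ds = 1, dL/dU = dL/dV = 0.
rng = np.random.default_rng(0)
n = 4
A = rng.standard_normal((n, n))
analytic = svd_grad_square(A, np.ones(n), np.zeros((n, n)), np.zeros((n, n)))

# Central finite differences of L(A) = sum of singular values.
eps = 1e-6
numeric = np.zeros_like(A)
for i in range(n):
    for j in range(n):
        E = np.zeros_like(A)
        E[i, j] = eps
        numeric[i, j] = (np.linalg.svd(A + E, compute_uv=False).sum()
                         - np.linalg.svd(A - E, compute_uv=False).sum()) / (2 * eps)

print(np.max(np.abs(analytic - numeric)))  # max abs error, should be tiny
```

With zero dL_dU and dL_dV, the first and third terms of the formula vanish and the second collapses to U diag(dL_ds) V^T = U V^T, which is exactly what the finite differences recover (assuming distinct singular values, which a random matrix gives almost surely).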
@JaeDukSeo

Hello! I have one question: the terms dL_dU, dL_dV and dL_ds, where do these values come from?
Additionally, what was the reason for not following the https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf implementation of svd?
That paper gives the formula for the gradient with respect to the input A; isn't that enough?

Thank you for your implementation!

@yz-cnsdqz

Thanks for sharing the code! I have a similar version of your code, but it runs very slowly... I am not sure whether such a customized gradient runs on the GPU? THX!!

@JaeDukSeo

Also, one more question: in the paper, when dU is (m, n), did you assume that m and n are equal?
And in https://i.imgur.com/Bzyb1My.png I notice that the second term is not considered; I guess since m and n are equal we are only considering the first term?

@JaeDukSeo

Oh, never mind, you set n to be m, hence the second part of the block decomposition never gets considered. Gotcha.
