Shrink initialization; don't scale variables!
import tensorflow as tf

# Notice that x_var and y_var_scaled have the same initial value (1.0),
# but y_var itself lives at a totally different scale from x_var.
# Therefore y_var has much farther to move than x_var.
x_var = tf.Variable(1.0)
y_var = tf.Variable(100.0)
y_var_scaled = y_var / 100.0

# This is just a silly linear equation. It has a derivative of 2
# everywhere, in terms of both x_var and *y_var_scaled*.
#
# However, the derivative of z_y with respect to y_var is 0.02,
# because y_var is divided by 100.
z_x = 2 * x_var + 5
z_y = 2 * y_var_scaled + 5
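
# A quick sanity check: tf.gradients confirms the two derivative
# scales. (grad_x and grad_y are illustrative additions, not part of
# the original computation.)
grad_x = tf.gradients(z_x, [x_var])[0]  # constant 2.0
grad_y = tf.gradients(z_y, [y_var])[0]  # constant 0.02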

LEARNING_RATE = 0.10

train_x = tf.train.GradientDescentOptimizer(
    learning_rate=LEARNING_RATE
).minimize(z_x, var_list=[x_var])
train_y = tf.train.GradientDescentOptimizer(
    learning_rate=LEARNING_RATE
).minimize(z_y, var_list=[y_var])

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
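
    # Evaluating the gradient ops shows the 100x gap between the two
    # derivatives, matching the arithmetic below.
    print(session.run([grad_x, grad_y]))  # [2.0, 0.02]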

    # We're going to make 100 steps of optimization on both
    # variables. The derivative is 2.0 each time for x_var, so we'll
    # step down by 2.0 * 0.10 = 0.2 each time. Over 100 steps, that
    # decreases x_var by 20.0, so we end at x = -19.
    #
    # On the other hand, the derivative is 0.02 each time for y_var,
    # so we'll step down by 0.02 * 0.10 = 0.002 each time. Over 100
    # steps, that decreases y_var by 100 * 0.002 = 0.2, which
    # decreases y_var_scaled by only 0.2 / 100 = 0.002.
    #
    # The point is this: if we fix the *initialization* of variables,
    # then we don't shrink their *derivatives*. If we keep the same
    # bad initialization and divide *after*, then the derivative
    # becomes super small, even if we do succeed at shrinking the
    # tensor to a good scale for operations further down the line.
    for i in range(100):
        _, x_val = session.run([train_x, x_var])
        _, y_val = session.run([train_y, y_var_scaled])
        print(f"i: {i} | x: {x_val:0.4f} | y: {y_val:0.4f}")
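
The fix the title argues for is to shrink the initialization itself rather
than to divide the variable after the fact. Here is a minimal sketch of that
alternative under the same setup (y_fixed is an illustrative name, not from
the original code):

# Initialize at the right scale from the start; no post-hoc division.
y_fixed = tf.Variable(1.0)
z_fixed = 2 * y_fixed + 5  # derivative w.r.t. y_fixed is 2.0

train_fixed = tf.train.GradientDescentOptimizer(
    learning_rate=LEARNING_RATE
).minimize(z_fixed, var_list=[y_fixed])

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    for i in range(100):
        _, y_val = session.run([train_fixed, y_fixed])
    # Each step moves y_fixed down by 2.0 * 0.10 = 0.2, so it ends
    # near -19.0 (just like x_var) instead of barely moving at all.
    print(f"final y_fixed: {y_val:0.4f}")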