Shrink initialization; don't scale variables!
import tensorflow as tf

# Notice that x_var and y_var_scaled have the same initial value (1.0),
# but y_var itself lives at a totally different scale from x_var.
# Therefore y_var has much farther to move than x_var.
x_var = tf.Variable(1.0)
y_var = tf.Variable(100.0)
y_var_scaled = y_var / 100.0

# This is just a silly linear equation. It has a derivative of 2
# everywhere, in terms of both x_var and *y_var_scaled*.
#
# However, the derivative of z_y with respect to y_var is 0.02,
# because y_var is divided by 100.
z_x = 2 * x_var + 5
z_y = 2 * y_var_scaled + 5
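
# A quick sanity check: tf.gradients confirms the two derivative
# scales. (grad_x and grad_y are illustrative additions, not part of
# the original computation.)
grad_x = tf.gradients(z_x, [x_var])[0]  # constant 2.0
grad_y = tf.gradients(z_y, [y_var])[0]  # constant 0.02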

LEARNING_RATE = 0.10

train_x = tf.train.GradientDescentOptimizer(
    learning_rate=LEARNING_RATE
).minimize(z_x, var_list=[x_var])
train_y = tf.train.GradientDescentOptimizer(
    learning_rate=LEARNING_RATE
).minimize(z_y, var_list=[y_var])

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
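
    # Evaluating the gradient ops shows the 100x gap between the two
    # derivatives, matching the arithmetic below.
    print(session.run([grad_x, grad_y]))  # [2.0, 0.02]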

    # We're going to make 100 steps of optimization on both
    # variables. The derivative is 2.0 each time for x_var, so we'll
    # step down by 2.0 * 0.10 = 0.2 each time. Over 100 steps, that
    # decreases x_var by 20.0, so we end at x = -19.
    #
    # On the other hand, the derivative is 0.02 each time for y_var,
    # so we'll step down by 0.02 * 0.10 = 0.002 each time. Over 100
    # steps, that decreases y_var by 100 * 0.002 = 0.2, which
    # decreases y_var_scaled by only 0.2 / 100 = 0.002.
    #
    # The point is this: if we fix the *initialization* of variables,
    # then we don't shrink their *derivatives*. If we keep the same
    # bad initialization and divide *after*, then the derivative
    # becomes super small, even if we do succeed at shrinking the
    # tensor to a good scale for operations further down the line.
    for i in range(100):
        _, x_val = session.run([train_x, x_var])
        _, y_val = session.run([train_y, y_var_scaled])
        print(f"i: {i} | x: {x_val:0.4f} | y: {y_val:0.4f}")
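
The fix the title argues for is to shrink the initialization itself rather
than to divide the variable after the fact. Here is a minimal sketch of that
alternative under the same setup (y_fixed is an illustrative name, not from
the original code):

# Initialize at the right scale from the start; no post-hoc division.
y_fixed = tf.Variable(1.0)
z_fixed = 2 * y_fixed + 5  # derivative w.r.t. y_fixed is 2.0

train_fixed = tf.train.GradientDescentOptimizer(
    learning_rate=LEARNING_RATE
).minimize(z_fixed, var_list=[y_fixed])

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    for i in range(100):
        _, y_val = session.run([train_fixed, y_fixed])
    # Each step moves y_fixed down by 2.0 * 0.10 = 0.2, so it ends
    # near -19.0 (just like x_var) instead of barely moving at all.
    print(f"final y_fixed: {y_val:0.4f}")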