Implement the Adam optimizer of TensorFlow
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## AdamOptimizer\n",
    "- In this toy project, we implement `AdamOptimizer` step by step"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "tf.set_random_seed(42)\n",
    "np.random.seed(42)\n",
    "\n",
    "# for auto-reloading external modules\n",
    "# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. Random data for testing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "x = np.random.randn(20)\n",
    "y = 2 * x + 3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. tf.train.AdamOptimizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<tf.Variable 'w:0' shape=() dtype=float32_ref>\n",
      "<tf.Variable 'b:0' shape=() dtype=float32_ref>\n",
      "\n",
      "compute_gradients:\n",
      "<tf.Variable 'w:0' shape=() dtype=float32_ref>\n",
      "<tf.Variable 'b:0' shape=() dtype=float32_ref>\n",
      "\n",
      "apply_gradients:\n",
      "<tf.Variable 'w:0' shape=() dtype=float32_ref>\n",
      "<tf.Variable 'b:0' shape=() dtype=float32_ref>\n",
      "<tf.Variable 'beta1_power:0' shape=() dtype=float32_ref>\n",
      "<tf.Variable 'beta2_power:0' shape=() dtype=float32_ref>\n",
      "<tf.Variable 'w/Adam:0' shape=() dtype=float32_ref>\n",
      "<tf.Variable 'w/Adam_1:0' shape=() dtype=float32_ref>\n",
      "<tf.Variable 'b/Adam:0' shape=() dtype=float32_ref>\n",
      "<tf.Variable 'b/Adam_1:0' shape=() dtype=float32_ref>\n"
     ]
    }
   ],
   "source": [
    "tf.reset_default_graph()\n",
    "LR = 1\n",
    "BETA1 = 0.5\n",
    "BETA2 = 0.5\n",
    "w = tf.get_variable(name='w', initializer=0.1)\n",
    "b = tf.get_variable(name='b', initializer=0.0)\n",
    "y_pred = x * w + b\n",
    "loss = tf.reduce_sum(tf.pow(y_pred - y, 2))\n",
    "opt = tf.train.AdamOptimizer(learning_rate=LR, beta1=BETA1, beta2=BETA2)\n",
    "for var in tf.global_variables():\n",
    "    print(var)\n",
    "\n",
    "print('\\ncompute_gradients:')\n",
    "grads_and_vars = opt.compute_gradients(loss, var_list=[w, b])\n",
    "for var in tf.global_variables():\n",
    "    print(var)\n",
    "\n",
    "print('\\napply_gradients:')\n",
    "opt_op = opt.apply_gradients(grads_and_vars)\n",
    "\n",
    "for var in tf.global_variables():\n",
    "    print(var)\n",
    "\n",
    "# equivalently, filter by op.name:\n",
    "# wm = [var for var in tf.global_variables() if var.op.name == 'w/Adam'][0]\n",
    "wm = [var for var in tf.global_variables() if var.name == 'w/Adam:0'][0]\n",
    "wv = [var for var in tf.global_variables() if var.name == 'w/Adam_1:0'][0]\n",
    "bm = [var for var in tf.global_variables() if var.name == 'b/Adam:0'][0]\n",
    "bv = [var for var in tf.global_variables() if var.name == 'b/Adam_1:0'][0]\n",
    "\n",
    "beta1 = [var for var in tf.global_variables() if var.name == 'beta1_power:0'][0]\n",
    "beta2 = [var for var in tf.global_variables() if var.name == 'beta2_power:0'][0]"
   ]
  },
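  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Aside: rather than filtering `tf.global_variables()` by name, the slot variables can also be fetched through the optimizer's public API, `Optimizer.get_slot`; `'m'` and `'v'` are the slot names `AdamOptimizer` registers. A minimal sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the same variables as wm/wv/bm/bv above, via the slot API\n",
    "print(opt.get_slot_names())       # ['m', 'v']\n",
    "print(opt.get_slot(w, 'm').name)  # w/Adam:0\n",
    "print(opt.get_slot(w, 'v').name)  # w/Adam_1:0\n",
    "print(opt.get_slot(b, 'm').name)  # b/Adam:0\n",
    "print(opt.get_slot(b, 'v').name)  # b/Adam_1:0"
   ]
  },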
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```python\n",
    "with tf.Session() as sess:\n",
    "    sess.run(tf.global_variables_initializer())\n",
    "    for i in range(1000):\n",
    "        loss_, _ = sess.run([loss, opt_op])\n",
    "        if i % 100 == 0:\n",
    "            print('batch {}'.format(i))\n",
    "            print('loss {}'.format(loss_))\n",
    "            print(w.eval(), b.eval())\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "batch 0\n",
      "loss 206.27879333496094\n",
      "0.25 0.25\n",
      "batch 1\n",
      "loss 82.32612609863281\n",
      "0.125 0.125\n",
      "batch 2\n",
      "loss 20.670351028442383\n",
      "0.0625 0.0625\n",
      "batch 3\n",
      "loss 3.098949909210205\n",
      "0.03125 0.03125\n",
      "batch 4\n",
      "loss 7.379493713378906\n",
      "0.015625 0.015625\n"
     ]
    }
   ],
   "source": [
    "with tf.Session() as sess:\n",
    "    sess.run(tf.global_variables_initializer())\n",
    "    for i in range(5):\n",
    "        loss_, _ = sess.run([loss, opt_op])\n",
    "\n",
    "        print('batch {}'.format(i))\n",
    "        print('loss {}'.format(loss_))\n",
    "        print(beta1.eval(), beta2.eval())"
   ]
  },
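  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The pairs printed above are the running bias-correction powers $\\beta_1^t$ and $\\beta_2^t$: TensorFlow initializes `beta1_power` to $\\beta_1$ and multiplies it by $\\beta_1$ after each `apply_gradients` step, so with $\\beta_1 = \\beta_2 = 0.5$, batch $i$ prints $0.5^{i+2}$. (Our naive version below starts the powers at $1.0$ and multiplies before the step, so its printout lags by one factor, but the value actually used for bias correction at each step is identical.) A quick check of the sequence:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# expected: [0.25, 0.125, 0.0625, 0.03125, 0.015625], matching the run above\n",
    "print([0.5 ** (i + 2) for i in range(5)])"
   ]
  },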
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. Our naive AdamOptimizer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's implement `AdamOptimizer` following the TensorFlow API.\n",
    "<br>\n",
    "- Initialization:\n",
    "$$\n",
    "m_0 = 0\n",
    "$$\n",
    "$$\n",
    "v_0 = 0\n",
    "$$\n",
    "$$\n",
    "t = 0\n",
    "$$\n",
    "- Update rule:\n",
    "$$\n",
    "lr_t = lr * \\sqrt{1 - \\beta_2^t} / (1 - \\beta_1^t)\n",
    "$$\n",
    "$$\n",
    "m_t = \\beta_1 * m_{t-1} + (1 - \\beta_1) * g\n",
    "$$\n",
    "$$\n",
    "v_t = \\beta_2 * v_{t-1} + (1 - \\beta_2) * g * g\n",
    "$$\n",
    "$$\n",
    "variable = variable - lr_t * m_t / (\\sqrt{v_t} + \\epsilon)\n",
    "$$"
   ]
  },
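  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For reference, here is a minimal NumPy transcription of the update rule above (a sketch with illustrative names, not TensorFlow code), which we then re-express as graph ops below:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def adam_step_np(var, m, v, g, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):\n",
    "    \"\"\"One Adam step, mirroring the formulas above; t counts from 1.\"\"\"\n",
    "    lr_t = lr * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)\n",
    "    m = beta1 * m + (1 - beta1) * g\n",
    "    v = beta2 * v + (1 - beta2) * g * g\n",
    "    var = var - lr_t * m / (np.sqrt(v) + eps)\n",
    "    return var, m, v\n",
    "\n",
    "# first step from the initialization m_0 = v_0 = 0\n",
    "print(adam_step_np(var=0.1, m=0.0, v=0.0, g=2.0, t=1))"
   ]
  },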
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "class naive_adam:\n",
    "    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-08, name='Adam'):\n",
    "        self.lr = tf.convert_to_tensor(learning_rate, dtype=tf.float32)\n",
    "        self.beta1 = tf.convert_to_tensor(beta1, dtype=tf.float32)\n",
    "        self.beta2 = tf.convert_to_tensor(beta2, dtype=tf.float32)\n",
    "        self.epsilon = tf.convert_to_tensor(epsilon, dtype=tf.float32)\n",
    "        self.name = name\n",
    "\n",
    "    def compute_gradients(self, loss, var_list=None):\n",
    "        return list(zip(tf.gradients(loss, var_list), var_list))\n",
    "\n",
    "    def apply_gradients(self, grads_and_vars, global_step=None):\n",
    "        # running powers beta1^t, beta2^t for bias correction; they start at 1.0\n",
    "        # and are multiplied *before* each step (TF starts them at beta instead)\n",
    "        beta1 = tf.get_variable('beta1_power', initializer=1.0, trainable=False)\n",
    "        beta2 = tf.get_variable('beta2_power', initializer=1.0, trainable=False)\n",
    "        # one first-moment (m) and one second-moment (v) slot per variable,\n",
    "        # named like TF's 'var/Adam' and 'var/Adam_1' slots\n",
    "        varm = []\n",
    "        varv = []\n",
    "        for _, var in grads_and_vars:\n",
    "            varm.append(tf.get_variable(name=var.op.name + '/' + self.name,\n",
    "                                        initializer=0.0, trainable=False))\n",
    "            varv.append(tf.get_variable(name=var.op.name + '/' + self.name + '_1',\n",
    "                                        initializer=0.0, trainable=False))\n",
    "        update_beta1 = tf.assign(beta1, beta1 * self.beta1)\n",
    "        update_beta2 = tf.assign(beta2, beta2 * self.beta2)\n",
    "\n",
    "        # lr_t = lr * sqrt(1 - beta2^t) / (1 - beta1^t)\n",
    "        lr = self.lr * tf.sqrt(tf.constant(1.0) - update_beta2) / (tf.constant(1.0) - update_beta1)\n",
    "        var_step = []\n",
    "        for mt, vt, (grad, var) in zip(varm, varv, grads_and_vars):\n",
    "            update_mt = mt.assign(self.beta1 * mt + (tf.constant(1.0) - self.beta1) * grad)\n",
    "            update_vt = vt.assign(self.beta2 * vt + (tf.constant(1.0) - self.beta2) * grad * grad)\n",
    "            var_step.append(var.assign_sub(lr * update_mt / (tf.sqrt(update_vt) + self.epsilon)))\n",
    "        update_vars = tf.group(*var_step)\n",
    "\n",
    "        if global_step is not None:\n",
    "            update_globalstep = tf.assign_add(global_step, 1)\n",
    "            with tf.control_dependencies([update_vars, update_globalstep]):\n",
    "                apply_op = tf.no_op()\n",
    "        else:\n",
    "            with tf.control_dependencies([update_vars]):\n",
    "                apply_op = tf.no_op()\n",
    "        return apply_op"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<tf.Variable 'w:0' shape=() dtype=float32_ref>\n",
      "<tf.Variable 'b:0' shape=() dtype=float32_ref>\n",
      "\n",
      "compute_gradients:\n",
      "<tf.Variable 'w:0' shape=() dtype=float32_ref>\n",
      "<tf.Variable 'b:0' shape=() dtype=float32_ref>\n",
      "\n",
      "apply_gradients:\n",
      "<tf.Variable 'w:0' shape=() dtype=float32_ref>\n",
      "<tf.Variable 'b:0' shape=() dtype=float32_ref>\n",
      "<tf.Variable 'beta1_power:0' shape=() dtype=float32_ref>\n",
      "<tf.Variable 'beta2_power:0' shape=() dtype=float32_ref>\n",
      "<tf.Variable 'w/Adam:0' shape=() dtype=float32_ref>\n",
      "<tf.Variable 'w/Adam_1:0' shape=() dtype=float32_ref>\n",
      "<tf.Variable 'b/Adam:0' shape=() dtype=float32_ref>\n",
      "<tf.Variable 'b/Adam_1:0' shape=() dtype=float32_ref>\n"
     ]
    }
   ],
   "source": [
    "tf.reset_default_graph()\n",
    "w = tf.get_variable(name='w', initializer=0.1)\n",
    "b = tf.get_variable(name='b', initializer=0.0)\n",
    "y_pred = x * w + b\n",
    "loss = tf.reduce_sum(tf.pow(y_pred - y, 2))\n",
    "opt = naive_adam(learning_rate=LR, beta1=BETA1, beta2=BETA2)\n",
    "for var in tf.global_variables():\n",
    "    print(var)\n",
    "\n",
    "print('\\ncompute_gradients:')\n",
    "grads_and_vars = opt.compute_gradients(loss, var_list=[w, b])\n",
    "for var in tf.global_variables():\n",
    "    print(var)\n",
    "\n",
    "print('\\napply_gradients:')\n",
    "opt_op = opt.apply_gradients(grads_and_vars)\n",
    "\n",
    "for var in tf.global_variables():\n",
    "    print(var)\n",
    "\n",
    "# equivalently, filter by op.name:\n",
    "# wm = [var for var in tf.global_variables() if var.op.name == 'w/Adam'][0]\n",
    "wm = [var for var in tf.global_variables() if var.name == 'w/Adam:0'][0]\n",
    "wv = [var for var in tf.global_variables() if var.name == 'w/Adam_1:0'][0]\n",
    "bm = [var for var in tf.global_variables() if var.name == 'b/Adam:0'][0]\n",
    "bv = [var for var in tf.global_variables() if var.name == 'b/Adam_1:0'][0]\n",
    "\n",
    "beta1 = [var for var in tf.global_variables() if var.name == 'beta1_power:0'][0]\n",
    "beta2 = [var for var in tf.global_variables() if var.name == 'beta2_power:0'][0]"
   ]
  },
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"batch 0\n", | |
"loss 206.27879333496094\n", | |
"0.5 0.5\n", | |
"batch 1\n", | |
"loss 82.32612609863281\n", | |
"0.25 0.25\n", | |
"batch 2\n", | |
"loss 20.67035675048828\n", | |
"0.125 0.125\n", | |
"batch 3\n", | |
"loss 3.098945379257202\n", | |
"0.0625 0.0625\n", | |
"batch 4\n", | |
"loss 7.379493713378906\n", | |
"0.03125 0.03125\n" | |
] | |
} | |
], | |
"source": [ | |
"with tf.Session() as sess:\n", | |
" sess.run(tf.global_variables_initializer())\n", | |
" for i in range(5):\n", | |
" loss_, _ = sess.run([loss, opt_op])\n", | |
" \n", | |
" print('batch {}'.format(i))\n", | |
" print('loss {}'.format(loss_))\n", | |
" print(beta1.eval(), beta2.eval())" | |
] | |
}, | |
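  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The five losses match the `tf.train.AdamOptimizer` run up to float32 rounding (the beta-power printouts differ by exactly one factor of $\\beta$, as explained above). Copying the printed losses into arrays makes the agreement explicit:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# losses printed by the two runs above\n",
    "tf_losses = [206.27879333496094, 82.32612609863281, 20.670351028442383,\n",
    "             3.098949909210205, 7.379493713378906]\n",
    "naive_losses = [206.27879333496094, 82.32612609863281, 20.67035675048828,\n",
    "                3.098945379257202, 7.379493713378906]\n",
    "print(np.allclose(tf_losses, naive_losses, rtol=1e-5))  # True"
   ]
  }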
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}