Skip to content

Instantly share code, notes, and snippets.

@jskDr
Last active October 26, 2019 16:20
Show Gist options
  • Save jskDr/10115fe206e47af8f41c2bce55abb358 to your computer and use it in GitHub Desktop.
Save jskDr/10115fe206e47af8f41c2bce55abb358 to your computer and use it in GitHub Desktop.
Comparison of policy gradient codes implemented in TF 2.0 and PyTorch, based on https://medium.com/@hamza.emra/reinforcement-learning-with-tensorflow-2-0-cca33fead626
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#Framework = 'PyTorch' \n",
"Framework = 'TF2'\n",
"if Framework == 'TF2':\n",
" import tensorflow as tf \n",
"else: \n",
" import torch"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import gym\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: \"sequential\"\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"dense (Dense) (None, 32) 160 \n",
"_________________________________________________________________\n",
"dense_1 (Dense) (None, 2) 66 \n",
"=================================================================\n",
"Total params: 226\n",
"Trainable params: 226\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n",
"None\n"
]
}
],
"source": [
"if Framework == 'TF2':\n",
" model = tf.keras.Sequential()\n",
" model.add(tf.keras.layers.Dense(32, input_dim = 4, activation='relu'))\n",
" model.add(tf.keras.layers.Dense(2, activation = \"softmax\"))\n",
" model.build()\n",
" optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01)\n",
" compute_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
" print(model.summary())\n",
"else:\n",
" class Model(torch.nn.Module):\n",
" def __init__(self):\n",
" super(Model, self).__init__()\n",
" self.l1 = torch.nn.Linear(4, 32)\n",
" self.l2 = torch.nn.Linear(32, 2)\n",
" \n",
" def forward(self, x):\n",
" net = torch.nn.Sequential(\n",
" self.l1, \n",
" torch.nn.ReLU(),\n",
" self.l2,\n",
" torch.nn.Softmax(dim=-1)\n",
" )\n",
" return net(x)\n",
" model = Model() \n",
" optimizer = torch.optim.Adam(model.parameters(), lr=0.01)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def discount_rewards(r, gamma = 0.8):\n",
" discounted_r = np.zeros_like(r)\n",
" running_add = 0\n",
" for t in reversed(range(0, r.size)):\n",
" running_add = running_add * gamma + r[t]\n",
" discounted_r[t] = running_add\n",
" return discounted_r\n",
"\n",
"class GradUpdate:\n",
" def __init__(self, model):\n",
" self.Buffer = model.trainable_variables\n",
" self.zero()\n",
" \n",
" def zero(self):\n",
" for ix, grad in enumerate(self.Buffer):\n",
" self.Buffer[ix] = grad * 0\n",
" \n",
" def update(self, ep_memory):\n",
" for grads, r in ep_memory:\n",
" for ix, grad in enumerate(grads):\n",
" self.Buffer[ix] += grad * r\n",
" \n",
"def get_action(model, s):\n",
" s = s.reshape([1,4])\n",
" logits = model(s)\n",
" a_dist = logits.numpy()\n",
" # Choose random action with p = action dist\n",
" a = np.random.choice(a_dist[0],p=a_dist[0])\n",
" a = np.argmax(a_dist == a) \n",
" return logits, a"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Simple test"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:Layer dense is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2. The layer has dtype float32 because it's dtype defaults to floatx.\n",
"\n",
"If you intended to run this layer in float32, you can safely ignore this warning. If in doubt, this warning is likely only an issue if you are porting a TensorFlow 1.X model to TensorFlow 2.\n",
"\n",
"To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.\n",
"\n",
"Episode 0 Score 18.5\n",
"Episode 100 Score 33.3\n",
"Episode 200 Score 48.85\n",
"Episode 300 Score 17.34\n",
"Episode 400 Score 30.9\n",
"Episode 500 Score 44.21\n",
"Episode 600 Score 49.96\n",
"Episode 700 Score 34.9\n",
"Episode 800 Score 90.5\n",
"Episode 900 Score 76.5\n",
"Episode 1000 Score 74.5\n",
"Episode 1100 Score 82.5\n",
"Episode 1200 Score 52.32\n",
"Episode 1300 Score 90.5\n",
"Episode 1400 Score 145.5\n",
"Episode 1500 Score 104.5\n",
"Episode 1600 Score 150.5\n",
"Episode 1700 Score 150.5\n",
"Episode 1800 Score 150.5\n",
"Episode 1900 Score 150.5\n"
]
}
],
"source": [
"env = gym.make('CartPole-v0')\n",
"episodes = 2000\n",
"scores = []\n",
"update_every = 5\n",
"\n",
"if Framework == 'TF2':\n",
" gradBuffer = GradUpdate(model) \n",
"else: \n",
" optimizer.zero_grad()\n",
"\n",
"for e in range(episodes):\n",
" s = env.reset() \n",
" ep_memory = []\n",
" ep_score = 0\n",
" done = False \n",
" \n",
" if Framework == 'TF2':\n",
" while not done: \n",
" with tf.GradientTape() as tape:\n",
" #forward pass\n",
" s = s.reshape([1,4])\n",
" logits = model(s)\n",
" a_dist = logits.numpy()\n",
" # Choose random action with p = action dist\n",
" a = np.random.choice(a_dist[0],p=a_dist[0])\n",
" a = np.argmax(a_dist == a) \n",
" loss = compute_loss([a], logits)\n",
" # make the choosen action \n",
" s, r, done, _ = env.step(a)\n",
" ep_score +=r\n",
" if done: r-=10\n",
" grads = tape.gradient(loss, model.trainable_variables)\n",
" ep_memory.append([grads,r])\n",
" scores.append(ep_score)\n",
" else : # PyTorch\n",
" while not done:\n",
" s = torch.from_numpy(s).type(torch.FloatTensor)\n",
" logits = model(s)\n",
" c = torch.distributions.Categorical(logits)\n",
" a = c.sample()\n",
" lprob = c.log_prob(a)\n",
" # make the choosen action \n",
" s, r, done, _ = env.step(int(a))\n",
" ep_score +=r\n",
" if done: r-=10\n",
" ep_memory.append([lprob,r])\n",
" scores.append(ep_score)\n",
" \n",
" # Discound the rewards \n",
" ep_memory = np.array(ep_memory)\n",
" ep_memory[:,1] = discount_rewards(ep_memory[:,1]) \n",
" \n",
" if Framework == 'TF2':\n",
" gradBuffer.update(ep_memory) \n",
" if e % update_every == 0:\n",
" optimizer.apply_gradients(zip(gradBuffer.Buffer, model.trainable_variables))\n",
" gradBuffer.zero()\n",
" else:\n",
" for lprob, r in ep_memory:\n",
" r = torch.FloatTensor([r])\n",
" loss = -lprob * torch.autograd.Variable(r)\n",
" loss.backward()\n",
"\n",
" if e % update_every == 0:\n",
" optimizer.step()\n",
" optimizer.zero_grad()\n",
"\n",
" if e % 100 == 0:\n",
" print(\"Episode {} Score {}\".format(e, np.mean(scores[-100:]))) "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "tf2",
"language": "python",
"name": "tf2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment