@tano297
Last active December 17, 2018 16:45
#!/usr/bin/env python3
import numpy as np
# define the grid size
size_h = 4
size_w = 4
# define the actions
actions = ["up", "down", "left", "right"]
# define the reward for each action (-1 everywhere for all actions,
# except for the terminal states)
reward = np.full((size_h, size_w, len(actions)), -1.0)
reward[0, 0] = np.zeros((4), dtype=np.float32)
reward[-1, -1] = np.zeros((4), dtype=np.float32)
# define the policy pi (uniform random over the four actions)
pi = [0.25, 0.25, 0.25, 0.25]
# s'|s,a is deterministic in this problem, so I can define it as a
# (size_h, size_w, actions, 2) lookup table of (y, x) successor coordinates
transfer = np.zeros((size_h, size_w, len(actions), 2), dtype=np.int32)
for y in range(size_h):
    for x in range(size_w):
        for a in range(len(actions)):
            if actions[a] == "up":
                if y > 0:
                    transfer[y, x, a, 0] = y - 1
                else:
                    transfer[y, x, a, 0] = y
                transfer[y, x, a, 1] = x
            elif actions[a] == "down":
                if y < size_h - 1:
                    transfer[y, x, a, 0] = y + 1
                else:
                    transfer[y, x, a, 0] = y
                transfer[y, x, a, 1] = x
            elif actions[a] == "left":
                if x > 0:
                    transfer[y, x, a, 1] = x - 1
                else:
                    transfer[y, x, a, 1] = x
                transfer[y, x, a, 0] = y
            elif actions[a] == "right":
                if x < size_w - 1:
                    transfer[y, x, a, 1] = x + 1
                else:
                    transfer[y, x, a, 1] = x
                transfer[y, x, a, 0] = y
# make the terminal states absorbing: every action maps them back to themselves
transfer[0, 0] = np.zeros((len(actions), 2))
transfer[-1, -1] = np.full((len(actions), 2), -1)  # -1 indexes the last row/column
# print transfer matrix
print("*" * 80)
print("s'|s,a : ")
for a in range(len(actions)):
    print("action: ", actions[a])
    print("y: ", transfer[:, :, a, 0])
    print("x: ", transfer[:, :, a, 1])
print("*" * 80)
# initial value function
value_0 = np.zeros((size_h, size_w), dtype=np.float32)
print("initial value function")
print(value_0)
# iterate
iterations = 10000
epsilon = 0.0001
for it in range(iterations):
    value_t = np.zeros_like(value_0)
    # do one bellman step in each state
    for y in range(value_0.shape[0]):
        for x in range(value_0.shape[1]):
            for a, action in enumerate(actions):
                # get the coordinates where I go with this action
                newy, newx = transfer[y, x, a]
                # make one lookahead step for this action
                value_t[y, x] += pi[a] * (reward[y, x, a] + value_0[newy, newx])
    if it < 3 or it == 9:
        print("-" * 40)
        print("iterations: ", it + 1)
        print(value_t)
    # if value converged, exit
    norm = 0.0
    for y in range(value_t.shape[0]):
        for x in range(value_t.shape[1]):
            norm += np.abs(value_0[y, x] - value_t[y, x])
    norm /= np.array(value_t.shape, dtype=np.float32).sum()
    # print(norm)
    if norm < epsilon:
        print("!" * 80)
        print("Exiting loop because I converged the value")
        print("!" * 80)
        break
    else:
        # if not converged, save current as old to iterate
        value_0 = np.copy(value_t)
print("-" * 40)
print("iterations: ", it + 1)
print("value:")
print(value_t)

tano297 commented Dec 17, 2018

RL Course David Silver, Lecture 3, minute 19:20

https://youtu.be/Nd1-UUMVfz4
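
For reference, here is a sketch of the backup the script performs on every sweep, written to match the code above (uniform random policy pi(a|s) = 0.25, deterministic transitions from the transfer table, step reward -1, and an implicit discount gamma = 1):

    v_{k+1}(s) = \sum_{a} \pi(a \mid s) \left( R(s, a) + \gamma \, v_k(s') \right), \quad s' = \mathrm{transfer}(s, a)

Since the transitions are deterministic, the expectation over successor states collapses to the single cell looked up in transfer, which is why the inner loop just accumulates pi[a] * (reward[y, x, a] + value_0[newy, newx]).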
