@tano297
Last active December 17, 2018 16:45
#!/usr/bin/env python3
import numpy as np
# define the grid size
size_h = 4
size_w = 4
# define the actions
actions = ["up", "down", "left", "right"]
# define the reward for each action (-1 everywhere for all actions,
# except for the terminal states)
reward = np.full((size_h, size_w, len(actions)), -1.0)
reward[0, 0] = np.zeros((4), dtype=np.float32)
reward[-1, -1] = np.zeros((4), dtype=np.float32)
# define the policy pi (uniform random over the four actions)
pi = [0.25, 0.25, 0.25, 0.25]
# s'|s,a is deterministic in this problem, so I can define it as a
# (size_h, size_w, actions, 2) lookup table of (y, x) successor coordinates
transfer = np.zeros((size_h, size_w, len(actions), 2), dtype=np.int32)
for y in range(size_h):
    for x in range(size_w):
        for a in range(len(actions)):
            if actions[a] == "up":
                if y > 0:
                    transfer[y, x, a, 0] = y - 1
                else:
                    transfer[y, x, a, 0] = y
                transfer[y, x, a, 1] = x
            elif actions[a] == "down":
                if y < size_h - 1:
                    transfer[y, x, a, 0] = y + 1
                else:
                    transfer[y, x, a, 0] = y
                transfer[y, x, a, 1] = x
            elif actions[a] == "left":
                if x > 0:
                    transfer[y, x, a, 1] = x - 1
                else:
                    transfer[y, x, a, 1] = x
                transfer[y, x, a, 0] = y
            elif actions[a] == "right":
                if x < size_w - 1:
                    transfer[y, x, a, 1] = x + 1
                else:
                    transfer[y, x, a, 1] = x
                transfer[y, x, a, 0] = y
# make the terminal states absorbing: every action maps them back to themselves
transfer[0, 0] = np.zeros((len(actions), 2))
transfer[-1, -1] = np.full((len(actions), 2), -1)  # -1 indexes the last row/column
# print transfer matrix
print("*" * 80)
print("s'|s,a : ")
for a in range(len(actions)):
    print("action: ", actions[a])
    print("y: ", transfer[:, :, a, 0])
    print("x: ", transfer[:, :, a, 1])
print("*" * 80)
# initial value function
value_0 = np.zeros((size_h, size_w), dtype=np.float32)
print("initial value function")
print(value_0)
# iterate
iterations = 10000
epsilon = 0.0001
for it in range(iterations):
    value_t = np.zeros_like(value_0)
    # do one bellman step in each state
    for y in range(value_0.shape[0]):
        for x in range(value_0.shape[1]):
            for a, action in enumerate(actions):
                # get the coordinates where I go with this action
                newy, newx = transfer[y, x, a]
                # make one lookahead step for this action
                value_t[y, x] += pi[a] * (reward[y, x, a] + value_0[newy, newx])
    if it < 3 or it == 9:
        print("-" * 40)
        print("iterations: ", it + 1)
        print(value_t)
    # if value converged, exit
    norm = 0.0
    for y in range(value_t.shape[0]):
        for x in range(value_t.shape[1]):
            norm += np.abs(value_0[y, x] - value_t[y, x])
    norm /= np.array(value_t.shape, dtype=np.float32).sum()
    # print(norm)
    if norm < epsilon:
        print("!" * 80)
        print("Exiting loop because I converged the value")
        print("!" * 80)
        break
    else:
        # if not converged, save current as old to iterate
        value_0 = np.copy(value_t)
print("-" * 40)
print("iterations: ", it + 1)
print("value:")
print(value_t)

tano297 commented Dec 17, 2018

RL Course David Silver, Lecture 3, minute 19:20

https://youtu.be/Nd1-UUMVfz4
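
For reference, here is a sketch of the backup the script performs on every sweep, written to match the code above (uniform random policy pi(a|s) = 0.25, deterministic transitions from the transfer table, step reward -1, and an implicit discount gamma = 1):

    v_{k+1}(s) = \sum_{a} \pi(a \mid s) \left( R(s, a) + \gamma \, v_k(s') \right), \quad s' = \mathrm{transfer}(s, a)

Since the transitions are deterministic, the expectation over successor states collapses to the single cell looked up in transfer, which is why the inner loop just accumulates pi[a] * (reward[y, x, a] + value_0[newy, newx]).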
