pantelis/optimal_policy.py

## optimal_policy.py
# Starter code for small 4 cell grid-world
import numpy as np

# Transition model P(s'|s,a), stored as nested dictionaries:
#   P[state][action] -> list of (probability, next_state) pairs.
# The outer key is the current state, the inner key is the action taken.
# NOTE(review): P[2][3] (west from state 2) is identical to P[2][1] (east),
# whereas P[0][3] (west from state 0) mostly stays in place -- possibly a
# copy-paste slip; confirm against the intended grid layout before relying on it.
P = {
    0: {
        0: [(0.9, 0), (0.1, 1)],
        1: [(0.8, 1), (0.1, 2), (0.1, 0)],
        2: [(0.8, 2), (0.1, 1), (0.1, 0)],
        3: [(0.9, 0), (0.1, 2)],
    },
    1: {
        0: [(0.9, 1), (0.1, 0)],
        1: [(0.9, 1), (0.1, 3)],
        2: [(0.8, 3), (0.1, 0), (0.1, 1)],
        3: [(0.8, 0), (0.1, 1), (0.1, 3)],
    },
    2: {
        0: [(0.8, 0), (0.1, 2), (0.1, 3)],
        1: [(0.8, 3), (0.1, 0), (0.1, 2)],
        2: [(0.9, 2), (0.1, 3)],
        3: [(0.8, 3), (0.1, 0), (0.1, 2)],
    },
    3: {
        0: [(0.8, 1), (0.1, 2), (0.1, 3)],
        1: [(0.9, 3), (0.1, 1)],
        2: [(0.9, 3), (0.1, 2)],
        3: [(0.8, 2), (0.1, 1), (0.1, 3)],
    },
}
print(P)

# One entry per state; presumably the reward associated with each state.
R = [-10, -10, -10, 10]
# Presumably the discount factor used by the value/policy computations.
gamma = 0.9

# State identifiers and how many there are.
States = list(range(4))
n_states = len(States)

# Action identifiers, in order: north (up), east (right), south (down), west (left).
Actions = list(range(4))
n_actions = len(Actions)
	# Starter code for small 4 cell grid-world
	import numpy as np

	# Transition model P(s'|s,a) represented as a dictionary of dictionaries:
	# the outer dict key is the state and the inner dict key is the action.
	# Each inner value is a list of (probability, next_state) pairs.
	# NOTE(review): the entry for state 2, action 3 (west) is identical to
	# state 2, action 1 (east) -- possibly a copy-paste slip; confirm.

	P = {
	0: {0: [(0.9,0),(0.1,1)], 1: [(0.8,1),(0.1,2),(0.1,0)], 2: [(0.8,2),(0.1,1),(0.1,0)], 3: [(0.9,0),(0.1,2)]},
	1: {0: [(0.9,1),(0.1,0)], 1: [(0.9,1),(0.1,3)], 2: [(0.8,3),(0.1,0),(0.1,1)], 3: [(0.8,0),(0.1,1),(0.1,3)]},
	2: {0: [(0.8,0),(0.1,2),(0.1,3)], 1: [(0.8,3),(0.1,0),(0.1,2)], 2: [(0.9,2),(0.1,3)], 3: [(0.8,3),(0.1,0),(0.1,2)]},
	3: {0: [(0.8,1),(0.1,2),(0.1,3)], 1: [(0.9,3),(0.1,1)], 2: [(0.9,3),(0.1,2)], 3: [(0.8,2),(0.1,1),(0.1,3)]}
	}
	# Dump the whole transition table when the script runs.
	print(P)
	# One entry per state; presumably the reward associated with each state.
	R = [-10, -10, -10, 10]
	# Presumably the discount factor -- confirm against the code that consumes it.
	gamma = 0.9
	States = [0, 1, 2, 3] # states
	n_states = len(States)

	Actions = [0, 1, 2, 3] # actions [north (up), east (right), south (down), west (left)]
	n_actions = len(Actions)