Skip to content

Instantly share code, notes, and snippets.

@austinschwartz
Created October 19, 2016 18:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save austinschwartz/36f35b52494b1c6d83efbcb3a9e9e5c5 to your computer and use it in GitHub Desktop.
Save austinschwartz/36f35b52494b1c6d83efbcb3a9e9e5c5 to your computer and use it in GitHub Desktop.
MDP Temporal Difference w/ series of samples
#!/bin/python
# Value table indexed as v_pi[state][action]. q_learn() reads and overwrites
# these entries in place; every entry starts at 0.0.
v_pi = {
    'G': dict.fromkeys(('right', 'up'), 0.0),
    'W': dict.fromkeys(('exit',), 0.0),
    'B': dict.fromkeys(('up', 'left'), 0.0),
}

# Weight given to each new sample when q_learn() blends it with the old value.
alpha = 0.5
def q_learn(s1, a, s2, r, gamma=1.0):
    """Apply one update to v_pi[s1][a] from the observed sample (s1, a, s2, r).

    The sample value is ``r + gamma * max(v_pi[s2].values())`` and the stored
    entry is moved towards it by the module-level weight ``alpha``:
    ``new = (1 - alpha) * old + alpha * sample``. The updated entry is printed.

    Args:
        s1: Key of the state the action was taken in (a key of ``v_pi``).
        a: Action name taken in ``s1`` (a key of ``v_pi[s1]``).
        s2: Key of the state that was reached (a key of ``v_pi``).
        r: Numeric reward observed for the transition.
        gamma: Multiplier applied to the successor state's best value. The
            default of 1.0 reproduces the original undiscounted update.

    Raises:
        KeyError: If ``s1``, ``a`` or ``s2`` is not present in ``v_pi``.
    """
    successor_values = v_pi[s2].values()
    # A state with no recorded actions contributes nothing to the sample;
    # this avoids leaking an arbitrary "minus infinity" sentinel into it.
    qmax = max(successor_values) if successor_values else 0.0
    sample = r + gamma * qmax
    old = v_pi[s1][a]
    v_pi[s1][a] = (1.0 - alpha) * old + alpha * sample
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Q(" + s1 + ", " + c(a) + ", " + s2 + ", " + str(r) + ") = " + str(v_pi[s1][a]))
def vps():
    """Return the whole v_pi table rendered as text, one line per entry.

    Every (state, action) entry produces a line made of two backslashes,
    then ``Q(<state>, <symbol>) = <value>`` and a trailing newline, where
    ``<symbol>`` is the markup returned by ``c(action)``. Entries appear in
    the iteration order of ``v_pi`` and of each inner dictionary.

    Returns:
        The concatenated lines as one string; empty if ``v_pi`` is empty.
    """
    lines = []
    for state in v_pi:
        actions = v_pi[state]
        for action in actions:
            lines.append(
                "\\\\Q(" + state + ", " + c(action) + ") = "
                + str(actions[action]) + "\n"
            )
    # Join once at the end instead of growing a string inside the loops.
    return "".join(lines)
def c(a):
    """Return the LaTeX markup used to display the action name ``a``.

    Args:
        a: Action name; 'right', 'left', 'up' and 'exit' are recognised.

    Returns:
        The markup string for a recognised action, or None for any other
        value (the same result the original if-chain gave by falling
        through without a return).
    """
    symbols = {
        'right': "\\rightarrow",
        'left': "\\leftarrow",
        'up': "\\uparrow",
        'exit': "\\text{EXIT}",
    }
    return symbols.get(a)
# Observed samples as (state, action, next state, reward), replayed through
# q_learn() in exactly this order; each call prints the entry it updated.
_SAMPLES = (
    ('G', 'right', 'B', -2),
    ('B', 'up', 'B', -2),
    ('B', 'left', 'B', -2),
    ('B', 'left', 'G', -1),
    ('G', 'right', 'B', -2),
    ('B', 'up', 'B', -2),
    ('G', 'right', 'G', -1),
    ('G', 'up', 'G', -1),
    ('G', 'right', 'B', -2),
    ('G', 'right', 'B', -2),
    ('B', 'left', 'G', -1),
    ('B', 'up', 'W', 0),
    ('B', 'left', 'G', -1),
    ('G', 'up', 'G', -1),
    ('G', 'up', 'W', 0),
)
for _sample in _SAMPLES:
    q_learn(*_sample)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment