Skip to content

Instantly share code, notes, and snippets.

@thunderInfy
Created October 9, 2019 19:08
Show Gist options
  • Save thunderInfy/d976ebb4c9c8fe65698284ce4722fdca to your computer and use it in GitHub Desktop.
Save thunderInfy/d976ebb4c9c8fe65698284ce4722fdca to your computer and use it in GitHub Desktop.
def policy_improvement(max_move=5):
    """Greedily improve the module-level ``policy`` in place.

    For every state ``(i, j)`` indexed by the first two dimensions of the
    module-level ``value`` array, each feasible action is scored with
    ``expected_reward([i, j], action)`` and the best-scoring one is stored in
    ``policy[i][j]``.

    An action is a signed number of cars to move: positive moves cars from
    the first location to the second, negative moves them from the second
    to the first.  The number moved in either direction is capped by the
    cars available at the source location (``i`` or ``j``) and by
    ``max_move``.

    Ties are resolved in favour of the smallest action, because actions are
    scanned in ascending order and only a strictly better score replaces the
    current best.

    NOTE(review): ``expected_reward`` is defined elsewhere in the file; it is
    presumably the expected return of taking ``action`` in state ``[i, j]``
    under the current ``value`` estimate -- confirm against its definition.

    Args:
        max_move: Largest number of cars that may be moved in one step.
            Defaults to 5, the limit that was previously hard-coded.

    Returns:
        True if no entry of ``policy`` changed during this sweep, else False.
    """
    global policy
    policy_stable = True
    for i in range(value.shape[0]):
        for j in range(value.shape[1]):
            old_action = policy[i][j]
            # With i cars at the first location, at most i can go from 1 to 2.
            most_1_to_2 = min(i, max_move)
            # With j cars at the second location, at most j can go from 2 to 1.
            most_2_to_1 = min(j, max_move)
            # max() returns the first maximal item, matching the original
            # strict "<" comparison; the range always contains 0, so it is
            # never empty.
            policy[i][j] = max(
                range(-most_2_to_1, most_1_to_2 + 1),
                key=lambda action: expected_reward([i, j], action),
            )
            if old_action != policy[i][j]:
                policy_stable = False
    return policy_stable
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment