@thunderInfy
Created October 9, 2019 19:01
import sys

def expected_reward(state, action):
    """
    state  : a pair of integers, the number of cars at A and at B
    action : the number of cars moved from A to B, -5 <= action <= 5
    """
    global value

    ψ = 0  # expected reward
    # apply the action overnight, clamping each location to [0, max_cars]
    new_state = [max(min(state[0] - action, jcp.max_cars()), 0),
                 max(min(state[1] + action, jcp.max_cars()), 0)]

    # add the reward for moving cars between locations (which is negative)
    ψ = ψ + jcp.moving_reward() * abs(action)

    # four discrete random variables determine the probability distribution
    # of the reward and of the next state:
    #   Aα : cars requested at location A
    #   Bα : cars requested at location B
    #   Aβ : cars returned at location A
    #   Bβ : cars returned at location B
    for Aα in range(A.poissonα.α, A.poissonα.β):
        for Bα in range(B.poissonα.α, B.poissonα.β):
            for Aβ in range(A.poissonβ.α, A.poissonβ.β):
                for Bβ in range(B.poissonβ.α, B.poissonβ.β):
                    # the four variables are mutually independent, so the
                    # probability ζ of this joint event is the product
                    ζ = (A.poissonα.vals[Aα] * B.poissonα.vals[Bα] *
                         A.poissonβ.vals[Aβ] * B.poissonβ.vals[Bβ])

                    # requests beyond the cars actually on the lot are lost
                    valid_requests_A = min(new_state[0], Aα)
                    valid_requests_B = min(new_state[1], Bα)

                    rew = (valid_requests_A + valid_requests_B) * jcp.credit_reward()

                    # next state after satisfying requests and taking returns,
                    # again clamped to [0, max_cars]
                    new_s = [0, 0]
                    new_s[0] = max(min(new_state[0] - valid_requests_A + Aβ, jcp.max_cars()), 0)
                    new_s[1] = max(min(new_state[1] - valid_requests_B + Bβ, jcp.max_cars()), 0)

                    # Bellman expectation backup: ψ += ζ · (r + γ · V(s'))
                    ψ += ζ * (rew + jcp.γ() * value[new_s[0]][new_s[1]])
    return ψ
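
# --------------------------------------------------------------------------
# Not part of the original gist: the loops above assume each location (A, B)
# carries truncated Poisson tables, where .α/.β bound the support (matching
# range(α, β)) and .vals maps a count to its probability. A minimal sketch of
# how such a table might be built; the class name, λ, and the cutoff
# threshold are assumptions for illustration, not the author's code.
import math

class TruncatedPoisson:
    def __init__(self, λ, threshold=1e-5):
        self.vals = {}
        n = 0
        while True:
            p = (λ ** n) * math.exp(-λ) / math.factorial(n)
            if p > threshold:
                self.vals[n] = p          # keep terms with non-negligible mass
            elif self.vals:
                break                     # past the mode and below threshold
            n += 1
        self.α = min(self.vals)           # inclusive lower cutoff
        self.β = max(self.vals) + 1       # exclusive upper cutoff
        total = sum(self.vals.values())   # renormalize the truncated pmf
        for k in self.vals:
            self.vals[k] /= total
# --------------------------------------------------------------------------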
def policy_evaluation():
    global value

    # policy_evaluation carries a "static" attribute ε that shrinks on every
    # call, so later evaluation sweeps use a tighter convergence threshold
    ε = policy_evaluation.ε
    policy_evaluation.ε /= 10

    while True:
        δ = 0
        for i in range(value.shape[0]):
            for j in range(value.shape[1]):
                # value[i][j] denotes the value of the state [i, j]
                old_val = value[i][j]
                value[i][j] = expected_reward([i, j], policy[i][j])
                δ = max(δ, abs(value[i][j] - old_val))
                print('.', end='')
                sys.stdout.flush()
        print(δ)
        sys.stdout.flush()
        if δ < ε:
            break

# initial value of ε
policy_evaluation.ε = 50
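
# --------------------------------------------------------------------------
# Not part of the original gist: policy_evaluation is one half of policy
# iteration. A hedged sketch of the matching improvement step, assuming the
# same globals (value, policy) and helper (expected_reward); the action
# bounds follow the -5 <= action <= 5 constraint documented above.
def policy_improvement():
    global policy
    policy_stable = True
    for i in range(value.shape[0]):
        for j in range(value.shape[1]):
            old_action = policy[i][j]
            best_action, best_val = old_action, -float('inf')
            # greedy one-step lookahead over the actions feasible in [i, j]:
            # at most i cars can leave A and at most j cars can leave B
            for a in range(-5, 6):
                if -j <= a <= i:
                    val = expected_reward([i, j], a)
                    if val > best_val:
                        best_action, best_val = a, val
            policy[i][j] = best_action
            if old_action != best_action:
                policy_stable = False
    return policy_stable

# A typical driver would then alternate the two steps until the policy
# stops changing:
#     while True:
#         policy_evaluation()
#         if policy_improvement():
#             break
# --------------------------------------------------------------------------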