Created
October 9, 2019 19:01
-
-
Save thunderInfy/c2de6c2fc709275fdb1a768a67a06ef3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def expected_reward(state, action):
    """Return the expected one-step reward plus discounted next-state value.

    state  : pair of ints — number of cars at location A and at location B.
    action : int, number of cars moved from A to B, -5 <= action <= 5
             (negative means B -> A).

    Reads the module-level value table `value` and the problem objects
    `jcp`, `A`, `B` (not visible in this file chunk — assumed to provide
    max_cars(), moving_reward(), credit_reward(), γ() and truncated
    Poisson tables `poissonα`/`poissonβ` with bounds [α, β) and
    probabilities `vals`; TODO confirm against their definitions).
    """
    global value

    # Hoist quantities that are constant for the whole call out of the
    # four nested loops below — they were previously re-evaluated in the
    # innermost loop body.
    max_cars = jcp.max_cars()
    credit = jcp.credit_reward()
    γ = jcp.γ()

    ψ = 0  # accumulated expected reward
    # State after the overnight move, clamped to [0, max_cars].
    new_state = [max(min(state[0] - action, max_cars), 0),
                 max(min(state[1] + action, max_cars), 0)]
    # Cost of moving cars (moving_reward() is negative).
    ψ = ψ + jcp.moving_reward() * abs(action)

    # Four independent discrete random variables drive the transition:
    #   Aα / Bα : cars requested at A / B
    #   Aβ / Bβ : cars returned  at A / B
    # Partial products and partial state updates are hoisted to the
    # shallowest loop level that has all their inputs; multiplication
    # order (Aα · Bα · Aβ · Bβ) is preserved so the float results are
    # bit-identical to the original nested expression.
    for Aα in range(A.poissonα.α, A.poissonα.β):
        p_a = A.poissonα.vals[Aα]
        valid_requests_A = min(new_state[0], Aα)  # can't rent more than we have
        for Bα in range(B.poissonα.α, B.poissonα.β):
            p_ab = p_a * B.poissonα.vals[Bα]
            valid_requests_B = min(new_state[1], Bα)
            # Rental income for this (Aα, Bα) pair.
            rew = (valid_requests_A + valid_requests_B) * credit
            for Aβ in range(A.poissonβ.α, A.poissonβ.β):
                p_abr = p_ab * A.poissonβ.vals[Aβ]
                # Next-day count at A, clamped to [0, max_cars].
                s0 = max(min(new_state[0] - valid_requests_A + Aβ, max_cars), 0)
                value_row = value[s0]
                for Bβ in range(B.poissonβ.α, B.poissonβ.β):
                    # ζ : joint probability of this (Aα, Bα, Aβ, Bβ) event.
                    ζ = p_abr * B.poissonβ.vals[Bβ]
                    s1 = max(min(new_state[1] - valid_requests_B + Bβ, max_cars), 0)
                    # Bellman backup: reward + discounted value of next state.
                    ψ += ζ * (rew + γ * value_row[s1])
    return ψ
def policy_evaluation():
    """Evaluate the current policy by iterative Bellman backups on `value`.

    Sweeps every state [i, j], replaces value[i][j] with the expected
    reward of the action prescribed by `policy`, and repeats until the
    largest single-state change δ drops below the threshold ε.

    Convergence threshold ε is kept as a "static" function attribute
    (policy_evaluation.ε, initialised at module level); every call
    consumes the current value and tightens it by a factor of 10 for
    the next call, so successive evaluations become more precise.

    Side effects: mutates the global `value` table in place and prints
    progress dots / the per-sweep δ to stdout.
    """
    global value
    ε = policy_evaluation.ε
    policy_evaluation.ε /= 10  # tighten the threshold for the next call
    while True:
        δ = 0  # largest absolute value change seen this sweep
        for i in range(value.shape[0]):
            for j in range(value.shape[1]):
                # value[i][j] is the value of state [i, j]; back it up
                # under the action the current policy prescribes.
                old_val = value[i][j]
                value[i][j] = expected_reward([i, j], policy[i][j])
                δ = max(δ, abs(value[i][j] - old_val))
            # One dot per completed row, flushed so progress is visible.
            print('.', end='')
            sys.stdout.flush()
        print(δ)
        sys.stdout.flush()
        if δ < ε:
            break
# Initial value of ε: the convergence threshold consumed by the first call
# to policy_evaluation(); each call divides the stored value by 10, so
# later evaluation sweeps converge to progressively tighter tolerances.
policy_evaluation.ε = 50
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment