@thunderInfy
Created October 9, 2019 19:01
import sys

def expected_reward(state, action):
    """
    state  : a pair of integers, the number of cars at A and at B
    action : the number of cars moved from A to B, -5 <= action <= 5
    """
    global value

    ψ = 0  # expected reward
    # apply the action overnight, clamping each location to [0, max_cars]
    new_state = [max(min(state[0] - action, jcp.max_cars()), 0),
                 max(min(state[1] + action, jcp.max_cars()), 0)]

    # add the reward for moving cars between locations (which is negative)
    ψ = ψ + jcp.moving_reward() * abs(action)

    # four discrete random variables determine the probability distribution
    # of the reward and of the next state:
    #   Aα : cars requested at location A
    #   Bα : cars requested at location B
    #   Aβ : cars returned at location A
    #   Bβ : cars returned at location B
    for Aα in range(A.poissonα.α, A.poissonα.β):
        for Bα in range(B.poissonα.α, B.poissonα.β):
            for Aβ in range(A.poissonβ.α, A.poissonβ.β):
                for Bβ in range(B.poissonβ.α, B.poissonβ.β):
                    # the four variables are mutually independent, so the
                    # probability ζ of this joint event is the product
                    ζ = (A.poissonα.vals[Aα] * B.poissonα.vals[Bα] *
                         A.poissonβ.vals[Aβ] * B.poissonβ.vals[Bβ])

                    # requests beyond the cars actually on the lot are lost
                    valid_requests_A = min(new_state[0], Aα)
                    valid_requests_B = min(new_state[1], Bα)

                    rew = (valid_requests_A + valid_requests_B) * jcp.credit_reward()

                    # next state after satisfying requests and taking returns,
                    # again clamped to [0, max_cars]
                    new_s = [0, 0]
                    new_s[0] = max(min(new_state[0] - valid_requests_A + Aβ, jcp.max_cars()), 0)
                    new_s[1] = max(min(new_state[1] - valid_requests_B + Bβ, jcp.max_cars()), 0)

                    # Bellman expectation backup: ψ += ζ · (r + γ · V(s'))
                    ψ += ζ * (rew + jcp.γ() * value[new_s[0]][new_s[1]])
    return ψ
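
# --------------------------------------------------------------------------
# Not part of the original gist: the loops above assume each location (A, B)
# carries truncated Poisson tables, where .α/.β bound the support (matching
# range(α, β)) and .vals maps a count to its probability. A minimal sketch of
# how such a table might be built; the class name, λ, and the cutoff
# threshold are assumptions for illustration, not the author's code.
import math

class TruncatedPoisson:
    def __init__(self, λ, threshold=1e-5):
        self.vals = {}
        n = 0
        while True:
            p = (λ ** n) * math.exp(-λ) / math.factorial(n)
            if p > threshold:
                self.vals[n] = p          # keep terms with non-negligible mass
            elif self.vals:
                break                     # past the mode and below threshold
            n += 1
        self.α = min(self.vals)           # inclusive lower cutoff
        self.β = max(self.vals) + 1       # exclusive upper cutoff
        total = sum(self.vals.values())   # renormalize the truncated pmf
        for k in self.vals:
            self.vals[k] /= total
# --------------------------------------------------------------------------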
def policy_evaluation():
    global value

    # policy_evaluation carries a "static" attribute ε that shrinks on every
    # call, so later evaluation sweeps use a tighter convergence threshold
    ε = policy_evaluation.ε
    policy_evaluation.ε /= 10

    while True:
        δ = 0
        for i in range(value.shape[0]):
            for j in range(value.shape[1]):
                # value[i][j] denotes the value of the state [i, j]
                old_val = value[i][j]
                value[i][j] = expected_reward([i, j], policy[i][j])
                δ = max(δ, abs(value[i][j] - old_val))
                print('.', end='')
                sys.stdout.flush()
        print(δ)
        sys.stdout.flush()
        if δ < ε:
            break

# initial value of ε
policy_evaluation.ε = 50
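
# --------------------------------------------------------------------------
# Not part of the original gist: policy_evaluation is one half of policy
# iteration. A hedged sketch of the matching improvement step, assuming the
# same globals (value, policy) and helper (expected_reward); the action
# bounds follow the -5 <= action <= 5 constraint documented above.
def policy_improvement():
    global policy
    policy_stable = True
    for i in range(value.shape[0]):
        for j in range(value.shape[1]):
            old_action = policy[i][j]
            best_action, best_val = old_action, -float('inf')
            # greedy one-step lookahead over the actions feasible in [i, j]:
            # at most i cars can leave A and at most j cars can leave B
            for a in range(-5, 6):
                if -j <= a <= i:
                    val = expected_reward([i, j], a)
                    if val > best_val:
                        best_action, best_val = a, val
            policy[i][j] = best_action
            if old_action != best_action:
                policy_stable = False
    return policy_stable

# A typical driver would then alternate the two steps until the policy
# stops changing:
#     while True:
#         policy_evaluation()
#         if policy_improvement():
#             break
# --------------------------------------------------------------------------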