# thundergolfer/tute_8_value_iteration.py

## tute_8_value_iteration.py
## USAGE
##
## Step 1: Clone aimacode python repo from Github
## Step 2: Place this script in the root directory of that repo
## Step 3: Run `python tute_8_value_iteration.py`

import types

def value_iteration_stepper(mdp, epsilon=0.001):
    """Yield the utility estimates after each sweep of value iteration.

    Applies the Bellman update of AIMA Figure 17.4, but as an endless
    generator: every ``next()`` performs one more synchronous sweep over
    ``mdp.states`` and yields the resulting utilities.

    Args:
        mdp: Object exposing ``states``, ``gamma``, ``R(s)``, ``T(s, a)``
            (an iterable of ``(probability, next_state)`` pairs) and
            ``actions(s)``.
        epsilon: Accepted so existing calls keep working, but not used;
            the generator never stops by itself, so the caller decides
            how many sweeps to run.

    Yields:
        A ``{state: utility}`` dict per sweep. Each yielded dict is an
        independent snapshot, so results from earlier sweeps are not
        overwritten by later ones.
    """
    R, T, gamma = mdp.R, mdp.T, mdp.gamma
    utilities = {s: 0 for s in mdp.states}
    while True:
        previous = utilities
        utilities = {}
        for s in mdp.states:
            # Best expected utility over the available actions, computed
            # from the previous sweep only. A state with no actions falls
            # back to 0 and therefore keeps just its own reward.
            best = max(
                (sum(p * previous[s1] for (p, s1) in T(s, a))
                 for a in mdp.actions(s)),
                default=0,
            )
            utilities[s] = R(s) + gamma * best
        # Hand out a copy so the caller never aliases the working table.
        yield dict(utilities)

# Action names handed to the MDP constructor below.
# NOTE(review): every state in `transitions` also has a 'STAY' entry that is
# not listed here -- confirm whether 'STAY' is excluded on purpose.
act_list = ['N', 'S', 'E', 'W']
# States handed to the MDP constructor as terminals.
terminals = [(1,1), (1,3), (3,1)]
# transitions[state][action] is a list of (probability, next_state) pairs.
# For every state/action pair the listed probabilities sum to 1.
transitions = {
    (1, 1): {
        # 'N' and 'W' both leave the agent in (1, 1) whatever the outcome.
        'N': [(0.7, (1, 1)), (0.3, (1, 1))], 'S': [(0.8, (2,1)), (0.2, (1,2))],
        'E': [(0.6, (1, 2)), (0.4, (2,1))], 'W': [(0.3, (1, 1)), (0.7, (1,1))],
        'STAY': [(1.0, (1,1))]
    },
    (1, 2): {
        'N': [(0.7, (1,2)), (0.3, (1,1))],
        'S': [(0.8, (2,2)), (0.2, (1,3))],
        'E': [(0.6, (1,3)), (0.4, (2,2))],
        'W': [(0.7, (1,1)), (0.3, (1,2))],
        'STAY': [(1.0, (1,2))]
    },
    (1, 3): {
        'N': [(0.7, (1,3)), (0.3, (1,2))],
        'S': [(0.8, (2,3)), (0.2, (1,3))],
        'E': [(0.6, (1,3)), (0.4, (2,3))],
        'W': [(0.7, (1,2)), (0.3, (1,3))],
        'STAY': [(1.0, (1,3))]
    },
    (2, 1): {
        'N': [(0.7, (1,1)), (0.3, (2,1))],
        'S': [(0.8, (3,1)), (0.2, (2,2))],
        'E': [(0.6, (2,2)), (0.4, (3,1))],
        'W': [(0.7, (2,1)), (0.3, (1,1))],
        'STAY': [(1.0, (2,1))]
    },
    (2, 2): {
        'N': [(0.7, (1,2)), (0.3, (2,1))],
        'S': [(0.8, (3,2)), (0.2, (2,3))],
        'E': [(0.6, (2,3)), (0.4, (3,2))],
        'W': [(0.7, (2,1)), (0.3, (1,2))],
        'STAY': [(1.0, (2,2))]
    },
    (2, 3): {
        'N': [(0.7, (1,3)), (0.3, (2,2))],
        'S': [(0.8, (3,3)), (0.2, (2,3))],
        'E': [(0.6, (2,3)), (0.4, (3,3))],
        'W': [(0.7, (2,2)), (0.3, (1,3))],
        'STAY': [(1.0, (2,3))]
    },
    (3, 1): {
        'N': [(0.7, (2,1)), (0.3, (3,1))],
        'S': [(0.8, (3,1)), (0.2, (3,2))],
        'E': [(0.6, (3,2)), (0.4, (3,1))],
        'W': [(0.7, (3,1)), (0.3, (2,1))],
        'STAY': [(1.0, (3,1))]
    },
    (3, 2): {
        'N': [(0.7, (2,2)), (0.3, (3,1))],
        'S': [(0.8, (3,2)), (0.2, (3,3))],
        'E': [(0.6, (3,3)), (0.4, (3,2))],
        'W': [(0.7, (3,1)), (0.3, (2,2))],
        'STAY': [(1.0, (3,2))]
    },
    (3, 3): {
        'N': [(0.7, (2,3)), (0.3, (3,2))],
        'S': [(0.8, (3,3)), (0.2, (3,3))],
        'E': [(0.6, (3,3)), (0.4, (3,3))],
        'W': [(0.7, (3,2)), (0.3, (2,3))],
        'STAY': [(1.0, (3,3))]
    }
}

# Reward attached to each state, keyed by the state tuple (one row of the
# 3x3 layout per line).
rewards = {
    (1, 1): 20, (1, 2): -1, (1, 3): 5,
    (2, 1): -1, (2, 2): -1, (2, 3): -1,
    (3, 1): -20, (3, 2): -1, (3, 3): -1,
}
# Every state that has a reward, in the table's insertion order.
states = [*rewards]
# Multiplier applied to future utility in the Bellman update; 1 means no
# discounting.
gamma = 1
# State handed to the MDP constructor as its first argument.
init = (1, 1)

from mdp import MDP

# Build the MDP. Arguments are passed positionally, in the order: initial
# state, action list, terminals, transition table, rewards, states, gamma.
problem = MDP(init, act_list, terminals, transitions, rewards, states, gamma)

# Workaround: aima-code/mdp.py currently has a bug in its actions() method.
def fix_actions(self, state):
    """Return the actions available in ``state``.

    A state listed in ``self.terminals`` gets an empty list; any other
    state gets ``self.actlist`` itself (not a copy).
    """
    return [] if state in self.terminals else self.actlist

# Bind fix_actions to this one instance and install it as its `actions`
# attribute; the bound method also stays reachable as `bound_actions`.
bound_actions = problem.actions = types.MethodType(fix_actions, problem)

from mdp import value_iteration

iteration = value_iteration_stepper(problem)

# Print the utilities after sweeps 1, 2 and 3. A '\n\n' spacer is printed
# before the second and third results, and nothing after the last one.
print(next(iteration))  # iteration 1
for _ in range(2):  # iterations 2 and 3
    print('\n\n')
    print(next(iteration))
	## USAGE
	##
	## Step 1: Clone aimacode python repo from Github
	## Step 2: Place this script in the root directory of that repo
	## Step 3: Run `python tute_8_value_iteration.py`

	import types

	def value_iteration_stepper(mdp, epsilon=0.001):
	    """Yield the utility estimates after each sweep of value iteration.

	    Applies the Bellman update of AIMA Figure 17.4, but as an endless
	    generator: every ``next()`` performs one more synchronous sweep over
	    ``mdp.states`` and yields the resulting utilities.

	    Args:
	        mdp: Object exposing ``states``, ``gamma``, ``R(s)``, ``T(s, a)``
	            (an iterable of ``(probability, next_state)`` pairs) and
	            ``actions(s)``.
	        epsilon: Accepted so existing calls keep working, but not used;
	            the generator never stops by itself, so the caller decides
	            how many sweeps to run.

	    Yields:
	        A ``{state: utility}`` dict per sweep. Each yielded dict is an
	        independent snapshot, so results from earlier sweeps are not
	        overwritten by later ones.
	    """
	    R, T, gamma = mdp.R, mdp.T, mdp.gamma
	    utilities = {s: 0 for s in mdp.states}
	    while True:
	        previous = utilities
	        utilities = {}
	        for s in mdp.states:
	            # Best expected utility over the available actions, computed
	            # from the previous sweep only. A state with no actions falls
	            # back to 0 and therefore keeps just its own reward.
	            best = max(
	                (sum(p * previous[s1] for (p, s1) in T(s, a))
	                 for a in mdp.actions(s)),
	                default=0,
	            )
	            utilities[s] = R(s) + gamma * best
	        # Hand out a copy so the caller never aliases the working table.
	        yield dict(utilities)

	# Action names handed to the MDP constructor below.
	# NOTE(review): every state in `transitions` also has a 'STAY' entry that
	# is not listed here -- confirm whether 'STAY' is excluded on purpose.
	act_list = ['N', 'S', 'E', 'W']
	# States handed to the MDP constructor as terminals.
	terminals = [(1,1), (1,3), (3,1)]
	# transitions[state][action] is a list of (probability, next_state) pairs.
	# For every state/action pair the listed probabilities sum to 1.
	transitions = {
	(1, 1): {
	'N': [(0.7, (1, 1)), (0.3, (1, 1))], 'S': [(0.8, (2,1)), (0.2, (1,2))],
	'E': [(0.6, (1, 2)), (0.4, (2,1))], 'W': [(0.3, (1, 1)), (0.7, (1,1))],
	'STAY': [(1.0, (1,1))]
	},
	(1, 2): {
	'N': [(0.7, (1,2)), (0.3, (1,1))],
	'S': [(0.8, (2,2)), (0.2, (1,3))],
	'E': [(0.6, (1,3)), (0.4, (2,2))],
	'W': [(0.7, (1,1)), (0.3, (1,2))],
	'STAY': [(1.0, (1,2))]
	},
	(1, 3): {
	'N': [(0.7, (1,3)), (0.3, (1,2))],
	'S': [(0.8, (2,3)), (0.2, (1,3))],
	'E': [(0.6, (1,3)), (0.4, (2,3))],
	'W': [(0.7, (1,2)), (0.3, (1,3))],
	'STAY': [(1.0, (1,3))]
	},
	(2, 1): {
	'N': [(0.7, (1,1)), (0.3, (2,1))],
	'S': [(0.8, (3,1)), (0.2, (2,2))],
	'E': [(0.6, (2,2)), (0.4, (3,1))],
	'W': [(0.7, (2,1)), (0.3, (1,1))],
	'STAY': [(1.0, (2,1))]
	},
	(2, 2): {
	'N': [(0.7, (1,2)), (0.3, (2,1))],
	'S': [(0.8, (3,2)), (0.2, (2,3))],
	'E': [(0.6, (2,3)), (0.4, (3,2))],
	'W': [(0.7, (2,1)), (0.3, (1,2))],
	'STAY': [(1.0, (2,2))]
	},
	(2, 3): {
	'N': [(0.7, (1,3)), (0.3, (2,2))],
	'S': [(0.8, (3,3)), (0.2, (2,3))],
	'E': [(0.6, (2,3)), (0.4, (3,3))],
	'W': [(0.7, (2,2)), (0.3, (1,3))],
	'STAY': [(1.0, (2,3))]
	},
	(3, 1): {
	'N': [(0.7, (2,1)), (0.3, (3,1))],
	'S': [(0.8, (3,1)), (0.2, (3,2))],
	'E': [(0.6, (3,2)), (0.4, (3,1))],
	'W': [(0.7, (3,1)), (0.3, (2,1))],
	'STAY': [(1.0, (3,1))]
	},
	(3, 2): {
	'N': [(0.7, (2,2)), (0.3, (3,1))],
	'S': [(0.8, (3,2)), (0.2, (3,3))],
	'E': [(0.6, (3,3)), (0.4, (3,2))],
	'W': [(0.7, (3,1)), (0.3, (2,2))],
	'STAY': [(1.0, (3,2))]
	},
	(3, 3): {
	'N': [(0.7, (2,3)), (0.3, (3,2))],
	'S': [(0.8, (3,3)), (0.2, (3,3))],
	'E': [(0.6, (3,3)), (0.4, (3,3))],
	'W': [(0.7, (3,2)), (0.3, (2,3))],
	'STAY': [(1.0, (3,3))]
	}
	}

	# Reward attached to each state, keyed by the state tuple (one row of the
	# 3x3 layout per line).
	rewards = {
	    (1, 1): 20, (1, 2): -1, (1, 3): 5,
	    (2, 1): -1, (2, 2): -1, (2, 3): -1,
	    (3, 1): -20, (3, 2): -1, (3, 3): -1,
	}
	# Every state that has a reward, in the table's insertion order.
	states = [*rewards]
	# Multiplier applied to future utility in the Bellman update; 1 means no
	# discounting.
	gamma = 1
	# State handed to the MDP constructor as its first argument.
	init = (1, 1)

	from mdp import MDP

	# Build the MDP. Arguments are passed positionally, in the order: initial
	# state, action list, terminals, transition table, rewards, states, gamma.
	problem = MDP(init, act_list, terminals, transitions, rewards, states, gamma)

	# Workaround: aima-code/mdp.py currently has a bug in its actions() method.
	def fix_actions(self, state):
	    """Return the actions available in ``state``.

	    A state listed in ``self.terminals`` gets an empty list; any other
	    state gets ``self.actlist`` itself (not a copy).
	    """
	    if state in self.terminals:
	        return []
	    return self.actlist

	# Bind fix_actions to this one instance and install it as its `actions`
	# attribute; the bound method also stays reachable as `bound_actions`.
	bound_actions = problem.actions = types.MethodType(fix_actions, problem)

	from mdp import value_iteration

	iteration = value_iteration_stepper(problem)

	# Print the utilities after sweeps 1, 2 and 3. A '\n\n' spacer is printed
	# before the second and third results, and nothing after the last one.
	print(next(iteration))  # iteration 1
	for _ in range(2):  # iterations 2 and 3
	    print('\n\n')
	    print(next(iteration))