@bshlgrs
Created August 28, 2014 02:01
daniel.py
import numpy
import numpy.random as nprand
import random
# Numbers epsilon \in [0,1] and gamma \in [0,1) entered in by hand at start
# epsilon = raw_input("Enter epsilon \\in [0,1]")
# gamma = raw_input("Enter gamma \\in [0,1)")
# num_trials = raw_input("how many trials do you want?")
class MDP(object):
    def __init__(self, gamma):
        self.left_transitions = self.make_table()
        self.right_transitions = self.make_table()
        self.tables = lambda: [self.left_transitions, self.right_transitions]
        self.gamma = gamma

    def make_table(self):
        # 3x3 table of (probability, reward) tuples: the probabilities in each
        # row are normalised to sum to 1, rewards are drawn uniformly from [0,1].
        table = []
        for start_state in range(3):
            probabilities = [random.random() for x in range(3)]
            row = []
            for end_state in range(3):
                row.append((probabilities[end_state] / sum(probabilities),
                            random.random()))
            table.append(row)
        return table

    def solve_q_values(self):
        return solve_v_values(self.tables(), self.gamma)
# Table is an array with dimension num_actions * num_states * num_states;
# each entry is a tuple of (probability, reward).
simple_problem = [[[(0.5, 1),       # from 0 to 0
                    (0.5, 0)],      # from 0 to 1
                   [(0.7, 0.5),     # from 1 to 0
                    (0.3, 3)]]]     # from 1 to 1

other_simple_problem = [[[(1, 1),   # from 0 to 0
                          (0, 1)],  # from 0 to 1
                         [(1, 0),   # from 1 to 0
                          (0, 0)]]]

trivial_problem = [[[(1, 1)]]]
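# For example, with the indexing convention above (table[action][start_state][end_state]),
# the simple_problem entry for taking action 0 in state 1 and landing in state 0 is:
prob_example, reward_example = simple_problem[0][1][0]  # (0.7, 0.5)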
def solve_v_values(table, gamma, optimal=True):
    number_of_states = len(table[0])
    number_of_actions = len(table)
    values = [0] * number_of_states
    policy = [0] * number_of_states
    delta = 1
    while delta > 0.00001:
        delta = 0
        # policy evaluation: one sweep under the current policy
        for state in range(number_of_states):
            old_value = values[state]
            new_value = 0
            action = policy[state]
            for new_state in range(number_of_states):
                prob, reward = table[action][state][new_state]
                new_value += prob * (reward + gamma * values[new_state])
            values[state] = new_value
            delta = max(delta, abs(new_value - old_value))
        if optimal:
            # policy improvement: greedy one-step lookahead on the current values
            for state in range(number_of_states):
                best_val = 0
                best_action = 0
                for action in range(number_of_actions):
                    val = 0
                    for new_state in range(number_of_states):
                        prob, reward = table[action][state][new_state]
                        val += prob * (reward + gamma * values[new_state])
                    if val > best_val:
                        best_action = action
                        best_val = val
                policy[state] = best_action
    return (values, policy)
def solve_q_values(table, gamma):
    number_of_states = len(table[0])
    number_of_actions = len(table)
    values, policy = solve_v_values(table, gamma)
    # Q(s, a) = sum over s' of p(s' | s, a) * (r(s, a, s') + gamma * V*(s'))
    return [[sum(p * (r + gamma * values[s2]) for s2, (p, r) in enumerate(table[a][s]))
             for a in range(number_of_actions)] for s in range(number_of_states)]

print(solve_v_values(other_simple_problem, 0.1))
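# Example usage, as a quick sanity check (gamma = 0.9 and the name example_mdp
# are arbitrary choices): build a random two-action, three-state MDP, then print
# its optimal values and policy and the Q-values derived from them.
example_mdp = MDP(0.9)
print(solve_v_values(example_mdp.tables(), example_mdp.gamma))
print(solve_q_values(example_mdp.tables(), example_mdp.gamma))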
# Want to generate a random MDP with three states {a,b,c}, two actions {x,y}, and rewards in [0,1].
# Need two random 3x3 matrices with entries T^i_{jk} specifying the probability of moving from state j to state k given action i.
# Also need two random 3x3 matrices with entries R^i_{jk} specifying the reward gained when transitioning from state j to state k given action i. (Rewards are deterministic here; randomly generated rewards that are a stochastic function of start state, end state, and action would be better.)
# Require that |Q*(a,i) - Q*(b,i)| < epsilon for all actions i.
# Want a probability distribution on {a,b}, with weights B(a) and B(b).
# Reduced MDP has two states {ab,c}, two actions {x,y}, and rewards in [0,1]. Transition probabilities are (lhs is the reduced MDP, rhs is the old MDP):
# p(ab,r|c,i) = p(a,r|c,i) + p(b,r|c,i)
# p(c,r|c,i) = p(c,r|c,i)
# p(c,r|ab,i) = B(a)p(c,r|a,i) + B(b)p(c,r|b,i)
# p(ab,r|ab,i) = B(a)(p(a,r|a,i) + p(b,r|a,i)) + B(b)(p(a,r|b,i) + p(b,r|b,i))
# Solve for optimal value V* of each state in the original MDP
# Solve for optimal policy in the reduced MDP
# Solve for the value V^P of each state in the original MDP where, in states a and b, you execute the reduced MDP's optimal action for state ab, and in state c you execute its optimal action for state c.
# Return the max over states of the difference V* - V^P.
# Repeat a bajillion times. Return the maximum difference V* - V^P across trials, together with the parameters T^i_{jk}, R^i_{jk}, and B(a). (A rough sketch of this loop follows below.)
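# A rough sketch of the experiment described above, not a definitive implementation.
# Assumptions the notes leave open: states are indexed a=0, b=1, c=2; B(a) is drawn
# uniformly from [0,1]; and, because rewards here are a deterministic function of
# (action, start state, end state), the reduced MDP's rewards are taken as
# probability-weighted averages of the rewards being merged. The helper names
# (evaluate_policy, build_reduced_mdp, run_experiment) are illustrative.

def evaluate_policy(table, gamma, policy):
    # Iterative evaluation of a fixed policy, using the same sweep as solve_v_values.
    number_of_states = len(table[0])
    values = [0] * number_of_states
    delta = 1
    while delta > 0.00001:
        delta = 0
        for state in range(number_of_states):
            action = policy[state]
            new_value = sum(p * (r + gamma * values[s2])
                            for s2, (p, r) in enumerate(table[action][state]))
            delta = max(delta, abs(new_value - values[state]))
            values[state] = new_value
    return values

def build_reduced_mdp(table, b_a):
    # Collapse states a=0 and b=1 into reduced state ab=0; c=2 becomes reduced state 1.
    b = [b_a, 1 - b_a]
    reduced = []
    for t in table:  # one 3x3 table of (prob, reward) per action
        # From ab: weight the rows for a and b by B.
        p_ab_ab = sum(b[s] * (t[s][0][0] + t[s][1][0]) for s in (0, 1))
        r_ab_ab = sum(b[s] * (t[s][0][0] * t[s][0][1] + t[s][1][0] * t[s][1][1]) for s in (0, 1))
        p_c_ab = sum(b[s] * t[s][2][0] for s in (0, 1))
        r_c_ab = sum(b[s] * t[s][2][0] * t[s][2][1] for s in (0, 1))
        # From c: merge the transitions into a and b.
        p_ab_c = t[2][0][0] + t[2][1][0]
        r_ab_c = t[2][0][0] * t[2][0][1] + t[2][1][0] * t[2][1][1]
        p_c_c, r_c_c = t[2][2]
        reduced.append([[(p_ab_ab, r_ab_ab / p_ab_ab if p_ab_ab else 0),
                         (p_c_ab, r_c_ab / p_c_ab if p_c_ab else 0)],
                        [(p_ab_c, r_ab_c / p_ab_c if p_ab_c else 0),
                         (p_c_c, r_c_c)]])
    return reduced

def run_experiment(epsilon, gamma, num_trials):
    # Rejection-sample random MDPs whose states a and b have epsilon-close Q-values,
    # and track the worst gap between V* and V^P over all accepted trials.
    worst = None
    for trial in range(num_trials):
        mdp = MDP(gamma)
        table = mdp.tables()
        q = solve_q_values(table, gamma)
        if any(abs(q[0][i] - q[1][i]) >= epsilon for i in range(len(table))):
            continue
        b_a = random.random()
        reduced = build_reduced_mdp(table, b_a)
        _, reduced_policy = solve_v_values(reduced, gamma)
        # Lift the reduced policy back to the original states: a and b copy the
        # action chosen for ab, c copies the action chosen for c.
        lifted_policy = [reduced_policy[0], reduced_policy[0], reduced_policy[1]]
        v_star, _ = solve_v_values(table, gamma)
        v_p = evaluate_policy(table, gamma, lifted_policy)
        diff = max(v_star[s] - v_p[s] for s in range(3))
        if worst is None or diff > worst[0]:
            worst = (diff, table, b_a)
    return worst

# Example invocation (epsilon, gamma, and the trial count are arbitrary choices):
# print(run_experiment(0.5, 0.9, 1000))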