daniel.py
import numpy
import numpy.random as nprand
import random

# Numbers epsilon \in [0,1] and gamma \in [0,1) entered in by hand at start
# epsilon = raw_input("Enter epsilon \\in [0,1]")
# gamma = raw_input("Enter gamma \\in [0,1)")
# num_trials = raw_input("how many trials do you want?")

class MDP(object):
    def __init__(self, gamma):
        self.left_transitions = self.make_table()
        self.right_transitions = self.make_table()
        self.tables = lambda: [self.left_transitions, self.right_transitions]
        self.gamma = gamma
    def make_table(self):
        # Random 3x3 transition matrix; each row is normalised to sum to 1.
        table = []
        for start_state in range(3):
            probabilities = [random.random() for x in range(3)]
            row = []
            for end_state in range(3):
                row.append(probabilities[end_state] / sum(probabilities))
            table.append(row)
        return table
    def solve_q_values(self):
        # NOTE: make_table currently stores bare probabilities, while
        # solve_v_values expects (probability, reward) tuples.
        return solve_v_values(self.tables(), self.gamma)

# Table is an array with dimension num_actions * num_states * num_states;
# each entry is a tuple of (probability, reward)
simple_problem = [[[(0.5, 1),    # from 0 to 0
                    (0.5, 0)],   # from 0 to 1
                   [(0.7, 0.5),  # from 1 to 0
                    (0.3, 3)]]]  # from 1 to 1

other_simple_problem = [[[(1, 1),  # from 0 to 0
                          (0, 1)], # from 0 to 1
                         [(1, 0),  # from 1 to 0
                          (0, 0)]]]

trivial_problem = [[[(1, 1)]]]

def solve_v_values(table, gamma, optimal=True):
    number_of_states = len(table[0])
    number_of_actions = len(table)
    values = [0] * number_of_states
    policy = [0] * number_of_states
    delta = 1
    while delta > 0.00001:
        delta = 0
        # policy evaluation
        for state in range(number_of_states):
            old_value = values[state]
            new_value = 0
            action = policy[state]
            for new_state in range(number_of_states):
                prob, reward = table[action][state][new_state]
                new_value += prob * (reward + gamma * values[new_state])
            values[state] = new_value
            delta = max(delta, abs(new_value - old_value))
        if optimal:
            # policy improvement: greedy one-step lookahead on the current
            # values, using the same Bellman backup as the evaluation step
            for state in range(number_of_states):
                best_val = 0
                best_action = 0
                for action in range(number_of_actions):
                    val = 0
                    for new_state in range(number_of_states):
                        prob, reward = table[action][state][new_state]
                        val += prob * (reward + gamma * values[new_state])
                    if val > best_val:
                        best_action = action
                        best_val = val
                policy[state] = best_action
    return (values, policy)

def solve_q_values(table, gamma):
    number_of_states = len(table[0])
    number_of_actions = len(table)
    values, policy = solve_v_values(table, gamma)
    # Q(s, a): one-step lookahead on the optimal state values.
    return [[sum(p * (r + gamma * values[s2]) for s2, (p, r) in enumerate(table[a][s]))
             for a in range(number_of_actions)]
            for s in range(number_of_states)]

print(solve_v_values(other_simple_problem, 0.1))
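
# Example use of solve_q_values (assuming the completion above): Q[state][action]
# is the expected discounted return of taking that action and acting optimally after.
print(solve_q_values(other_simple_problem, 0.1))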

# Want to generate a random MDP with three states {a,b,c}, two actions {x,y}, and rewards in [0,1].
# Need two random 3x3 matrices with entries T^i_{jk} specifying the probability of moving from state j to state k given action i.
# Also need two random 3x3 matrices with entries R^i_{jk} specifying the reward gained when transitioning from state j to state k given action i. (Rewards are deterministic here; randomly generated rewards that are a stochastic function of start state, end state, and action would be better.)
# Require that |Q*(a,i) - Q*(b,i)| < epsilon for all actions i.
# Want a probability distribution on {a,b}, written B(a) and B(b).
# Reduced MDP has two states {ab,c}, two actions {x,y}, and rewards in [0,1]. Transition probabilities are (lhs is the reduced MDP, rhs is the old MDP):
# p(ab,r|c,i) = p(a,r|c,i) + p(b,r|c,i)
# p(c,r|c,i)  = p(c,r|c,i)
# p(c,r|ab,i) = B(a)p(c,r|a,i) + B(b)p(c,r|b,i)
# p(ab,r|ab,i) = B(a)(p(a,r|a,i) + p(b,r|a,i)) + B(b)(p(a,r|b,i) + p(b,r|b,i))
# Solve for the optimal value V* of each state in the original MDP.
# Solve for the optimal policy in the reduced MDP.
# Solve for the value V^P of each state in the original MDP where, in states a and b, you execute the optimal policy for state ab of the reduced MDP, and in state c you execute the optimal policy for state c of the reduced MDP.
# Return the max difference V* - V^P over all states.
# Repeat a bajillion times. Return the maximum difference V* - V^P, and the corresponding parameters T^i_{jk}, R^i_{jk}, and B(a).
# (A rough sketch of this procedure follows below.)
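
# The block below is a rough sketch of the reduction and the value-loss check
# described in the comments above. It is an assumption about how to realise the
# plan with the (prob, reward) table format used by solve_v_values: states are
# indexed a=0, b=1, c=2; B is a pair (B(a), B(b)); and because each transition
# carries a single deterministic reward, the reward on a merged transition is
# taken to be the probability-weighted average of the underlying rewards.
# None of these names (reduce_mdp, merge, value_loss) appear in the original.

def reduce_mdp(table, B):
    def merge(entries):
        # entries: list of (weight, (prob, reward)); returns one combined
        # (prob, reward) pair, with the reward averaged by probability mass.
        prob = sum(w * p for w, (p, r) in entries)
        reward = sum(w * p * r for w, (p, r) in entries) / prob if prob > 0 else 0
        return (prob, reward)

    reduced = []
    for action_table in table:
        a_row, b_row, c_row = action_table
        # Row for the merged state ab: B-weighted mixture of rows a and b.
        ab_to_ab = merge([(B[0], a_row[0]), (B[0], a_row[1]),
                          (B[1], b_row[0]), (B[1], b_row[1])])
        ab_to_c = merge([(B[0], a_row[2]), (B[1], b_row[2])])
        # Row for state c: lump transitions into a or b together.
        c_to_ab = merge([(1.0, c_row[0]), (1.0, c_row[1])])
        c_to_c = c_row[2]
        reduced.append([[ab_to_ab, ab_to_c], [c_to_ab, c_to_c]])
    return reduced


def value_loss(table, gamma, B):
    # V* of every state in the original MDP.
    v_star, _ = solve_v_values(table, gamma)
    # Optimal policy of the reduced MDP (reduced state 0 is ab, state 1 is c).
    _, reduced_policy = solve_v_values(reduce_mdp(table, B), gamma)
    # Lift it back to the original states: a and b both use the action chosen
    # for ab, while c uses the action chosen for c.
    lifted = [reduced_policy[0], reduced_policy[0], reduced_policy[1]]
    # Evaluate the lifted policy by restricting each state to its chosen action
    # and asking solve_v_values for plain policy evaluation.
    restricted = [[table[lifted[s]][s] for s in range(3)]]
    v_policy, _ = solve_v_values(restricted, gamma, optimal=False)
    return max(v_star[s] - v_policy[s] for s in range(3))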