@bshlgrs
Created August 28, 2014 02:01
daniel.py
import numpy
import numpy.random as nprand
import random
# Numbers epsilon \in [0,1] and gamma \in [0,1) entered in by hand at start
# epsilon = raw_input("Enter epsilon \\in [0,1]")
# gamma = raw_input("Enter gamma \\in [0,1)")
# num_trials = raw_input("how many trials do you want?")
class MDP(object):
    def __init__(self, gamma):
        self.left_transitions = self.make_table()
        self.right_transitions = self.make_table()
        self.tables = lambda: [self.left_transitions, self.right_transitions]
        self.gamma = gamma

    def make_table(self):
        # 3x3 table of (probability, reward) tuples: the probabilities in each
        # row are normalised to sum to 1, rewards are drawn uniformly from [0,1].
        table = []
        for start_state in range(3):
            probabilities = [random.random() for x in range(3)]
            row = []
            for end_state in range(3):
                row.append((probabilities[end_state] / sum(probabilities),
                            random.random()))
            table.append(row)
        return table

    def solve_q_values(self):
        return solve_v_values(self.tables(), self.gamma)
# Table is an array with dimension num_actions * num_states * num_states;
# each entry is a tuple of (probability, reward).
simple_problem = [[[(0.5, 1),       # from 0 to 0
                    (0.5, 0)],      # from 0 to 1
                   [(0.7, 0.5),     # from 1 to 0
                    (0.3, 3)]]]     # from 1 to 1

other_simple_problem = [[[(1, 1),   # from 0 to 0
                          (0, 1)],  # from 0 to 1
                         [(1, 0),   # from 1 to 0
                          (0, 0)]]]

trivial_problem = [[[(1, 1)]]]
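# For example, with the indexing convention above (table[action][start_state][end_state]),
# the simple_problem entry for taking action 0 in state 1 and landing in state 0 is:
prob_example, reward_example = simple_problem[0][1][0]  # (0.7, 0.5)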
def solve_v_values(table, gamma, optimal=True):
    number_of_states = len(table[0])
    number_of_actions = len(table)
    values = [0] * number_of_states
    policy = [0] * number_of_states
    delta = 1
    while delta > 0.00001:
        delta = 0
        # policy evaluation: one sweep under the current policy
        for state in range(number_of_states):
            old_value = values[state]
            new_value = 0
            action = policy[state]
            for new_state in range(number_of_states):
                prob, reward = table[action][state][new_state]
                new_value += prob * (reward + gamma * values[new_state])
            values[state] = new_value
            delta = max(delta, abs(new_value - old_value))
        if optimal:
            # policy improvement: greedy one-step lookahead on the current values
            for state in range(number_of_states):
                best_val = 0
                best_action = 0
                for action in range(number_of_actions):
                    val = 0
                    for new_state in range(number_of_states):
                        prob, reward = table[action][state][new_state]
                        val += prob * (reward + gamma * values[new_state])
                    if val > best_val:
                        best_action = action
                        best_val = val
                policy[state] = best_action
    return (values, policy)
def solve_q_values(table, gamma):
    number_of_states = len(table[0])
    number_of_actions = len(table)
    values, policy = solve_v_values(table, gamma)
    # Q(s, a) = sum over s' of p(s' | s, a) * (r(s, a, s') + gamma * V*(s'))
    return [[sum(p * (r + gamma * values[s2]) for s2, (p, r) in enumerate(table[a][s]))
             for a in range(number_of_actions)] for s in range(number_of_states)]

print(solve_v_values(other_simple_problem, 0.1))
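# Example usage, as a quick sanity check (gamma = 0.9 and the name example_mdp
# are arbitrary choices): build a random two-action, three-state MDP, then print
# its optimal values and policy and the Q-values derived from them.
example_mdp = MDP(0.9)
print(solve_v_values(example_mdp.tables(), example_mdp.gamma))
print(solve_q_values(example_mdp.tables(), example_mdp.gamma))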
# Want to generate a random MDP with three states {a,b,c}, two actions {x,y}, and rewards in [0,1].
# Need two random 3x3 matrices with entries T^i_{jk} specifying the probability of moving from state j to state k given action i.
# Also need two random 3x3 matrices with entries R^i_{jk} specifying the reward gained when transitioning from state j to state k given action i. (Rewards are deterministic here; randomly generated rewards that are a stochastic function of start state, end state, and action would be better.)
# Require that |Q*(a,i) - Q*(b,i)| < epsilon for all actions i.
# Want a probability distribution on {a,b}, with weights B(a) and B(b).
# Reduced MDP has two states {ab,c}, two actions {x,y}, and rewards in [0,1]. Transition probabilities are (lhs is the reduced MDP, rhs is the old MDP):
# p(ab,r|c,i) = p(a,r|c,i) + p(b,r|c,i)
# p(c,r|c,i) = p(c,r|c,i)
# p(c,r|ab,i) = B(a)p(c,r|a,i) + B(b)p(c,r|b,i)
# p(ab,r|ab,i) = B(a)(p(a,r|a,i) + p(b,r|a,i)) + B(b)(p(a,r|b,i) + p(b,r|b,i))
# Solve for optimal value V* of each state in the original MDP
# Solve for optimal policy in the reduced MDP
# Solve for the value V^P of each state in the original MDP where, in states a and b, you execute the reduced MDP's optimal action for state ab, and in state c you execute its optimal action for state c.
# Return the max over states of the difference V* - V^P.
# Repeat a bajillion times. Return the maximum difference V* - V^P across trials, together with the parameters T^i_{jk}, R^i_{jk}, and B(a). (A rough sketch of this loop follows below.)
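# A rough sketch of the experiment described above, not a definitive implementation.
# Assumptions the notes leave open: states are indexed a=0, b=1, c=2; B(a) is drawn
# uniformly from [0,1]; and, because rewards here are a deterministic function of
# (action, start state, end state), the reduced MDP's rewards are taken as
# probability-weighted averages of the rewards being merged. The helper names
# (evaluate_policy, build_reduced_mdp, run_experiment) are illustrative.

def evaluate_policy(table, gamma, policy):
    # Iterative evaluation of a fixed policy, using the same sweep as solve_v_values.
    number_of_states = len(table[0])
    values = [0] * number_of_states
    delta = 1
    while delta > 0.00001:
        delta = 0
        for state in range(number_of_states):
            action = policy[state]
            new_value = sum(p * (r + gamma * values[s2])
                            for s2, (p, r) in enumerate(table[action][state]))
            delta = max(delta, abs(new_value - values[state]))
            values[state] = new_value
    return values

def build_reduced_mdp(table, b_a):
    # Collapse states a=0 and b=1 into reduced state ab=0; c=2 becomes reduced state 1.
    b = [b_a, 1 - b_a]
    reduced = []
    for t in table:  # one 3x3 table of (prob, reward) per action
        # From ab: weight the rows for a and b by B.
        p_ab_ab = sum(b[s] * (t[s][0][0] + t[s][1][0]) for s in (0, 1))
        r_ab_ab = sum(b[s] * (t[s][0][0] * t[s][0][1] + t[s][1][0] * t[s][1][1]) for s in (0, 1))
        p_c_ab = sum(b[s] * t[s][2][0] for s in (0, 1))
        r_c_ab = sum(b[s] * t[s][2][0] * t[s][2][1] for s in (0, 1))
        # From c: merge the transitions into a and b.
        p_ab_c = t[2][0][0] + t[2][1][0]
        r_ab_c = t[2][0][0] * t[2][0][1] + t[2][1][0] * t[2][1][1]
        p_c_c, r_c_c = t[2][2]
        reduced.append([[(p_ab_ab, r_ab_ab / p_ab_ab if p_ab_ab else 0),
                         (p_c_ab, r_c_ab / p_c_ab if p_c_ab else 0)],
                        [(p_ab_c, r_ab_c / p_ab_c if p_ab_c else 0),
                         (p_c_c, r_c_c)]])
    return reduced

def run_experiment(epsilon, gamma, num_trials):
    # Rejection-sample random MDPs whose states a and b have epsilon-close Q-values,
    # and track the worst gap between V* and V^P over all accepted trials.
    worst = None
    for trial in range(num_trials):
        mdp = MDP(gamma)
        table = mdp.tables()
        q = solve_q_values(table, gamma)
        if any(abs(q[0][i] - q[1][i]) >= epsilon for i in range(len(table))):
            continue
        b_a = random.random()
        reduced = build_reduced_mdp(table, b_a)
        _, reduced_policy = solve_v_values(reduced, gamma)
        # Lift the reduced policy back to the original states: a and b copy the
        # action chosen for ab, c copies the action chosen for c.
        lifted_policy = [reduced_policy[0], reduced_policy[0], reduced_policy[1]]
        v_star, _ = solve_v_values(table, gamma)
        v_p = evaluate_policy(table, gamma, lifted_policy)
        diff = max(v_star[s] - v_p[s] for s in range(3))
        if worst is None or diff > worst[0]:
            worst = (diff, table, b_a)
    return worst

# Example invocation (epsilon, gamma, and the trial count are arbitrary choices):
# print(run_experiment(0.5, 0.9, 1000))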