import gym
import pandas as pd
import numpy as np
import random
# https://gym.openai.com/envs/CartPole-v0
# Carlos Aguayo - carlos.aguayo@gmail.com

class QLearner(object):
    def __init__(self,
                 num_states=100,
                 num_actions=4,
                 alpha=0.2,
                 gamma=0.9,
                 epsilon=0.5,
                 epsilon_decay=0.99):
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.state = 0
        self.action = 0
        self.qtable = np.random.uniform(low=-1, high=1, size=(num_states, num_actions))

    def set_initial_state(self, state):
        """
        @summary: Sets the initial state and returns an action
        @param state: The initial state
        @returns: The selected action
        """
        self.state = state
        # argsort()[-1] is the index of the largest Q-value, i.e. the greedy action.
        self.action = self.qtable[state].argsort()[-1]
        return self.action

    def move(self, state_prime, reward):
        """
        @summary: Moves to the given state with the given reward and returns an action
        @param state_prime: The new state
        @param reward: The reward
        @returns: The selected action
        """
        alpha = self.alpha
        gamma = self.gamma
        state = self.state
        action = self.action
        qtable = self.qtable

        # Epsilon-greedy selection: explore with probability epsilon,
        # otherwise exploit the current Q-table.
        choose_random_action = (1 - self.epsilon) <= np.random.uniform(0, 1)
        if choose_random_action:
            action_prime = random.randint(0, self.num_actions - 1)
        else:
            action_prime = self.qtable[state_prime].argsort()[-1]

        # Decay the exploration rate after every step.
        self.epsilon *= self.epsilon_decay

        # Q-learning update:
        # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * Q(s', a'))
        qtable[state, action] = (1 - alpha) * qtable[state, action] + alpha * (reward + gamma * qtable[state_prime, action_prime])

        self.state = state_prime
        self.action = action_prime
        return self.action
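
# Minimal usage sketch on a hypothetical two-state chain: set_initial_state() is
# called once per episode, then move() is called after every environment step with
# the new (discretized) state and the observed reward. The chain's dynamics and
# rewards below are made up purely for illustration; this helper is never called
# by the script.
def _qlearner_usage_sketch():
    learner = QLearner(num_states=2, num_actions=2, alpha=0.5, gamma=0.9)
    state = 0
    action = learner.set_initial_state(state)
    for _ in range(10):
        # Hypothetical dynamics: action 1 moves to state 1 and pays +1,
        # anything else stays in state 0 with no reward.
        state = 1 if action == 1 else 0
        reward = 1.0 if state == 1 else 0.0
        action = learner.move(state, reward)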

def cart_pole_with_qlearning():
    env = gym.make('CartPole-v0')
    goal_average_steps = 195
    max_number_of_steps = 200
    number_of_iterations_to_average = 100
    number_of_features = env.observation_space.shape[0]
    last_time_steps = np.ndarray(0)

    # Discretize each continuous observation dimension into 10 bins.
    # pd.cut returns the bin edges; dropping the outermost edges lets
    # np.digitize map out-of-range values to bins 0 and 9.
    cart_position_bins = pd.cut([-2.4, 2.4], bins=10, retbins=True)[1][1:-1]
    pole_angle_bins = pd.cut([-0.5, 0.5], bins=10, retbins=True)[1][1:-1]
    cart_velocity_bins = pd.cut([-2, 2], bins=10, retbins=True)[1][1:-1]
    angle_rate_bins = pd.cut([-4, 4], bins=10, retbins=True)[1][1:-1]

    list_reward = []
    list_i = []

    def build_state(features):
        return int("".join(map(lambda feature: str(int(feature)), features)))

    def to_bin(value, bins):
        return np.digitize(x=[value], bins=bins)[0]
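
    # Illustration: build_state() concatenates the per-feature bin indices, so
    # four features binned to 0-9 give a state index in [0, 9999], matching
    # num_states=10 ** number_of_features below. For example,
    # build_state([1, 2, 3, 4]) == int("1234") == 1234.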

    learner = QLearner(num_states=10 ** number_of_features,
                       num_actions=env.action_space.n,
                       alpha=0.2,
                       gamma=0.1,
                       epsilon=0.5,
                       epsilon_decay=0.99)
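    # For CartPole-v0, number_of_features is 4 and env.action_space.n is 2,
    # so the Q-table has 10 ** 4 = 10000 rows and 2 columns.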

    for episode in xrange(50000):
        observation = env.reset()
        cart_position, cart_velocity, pole_angle, angle_rate_of_change = observation

        state = build_state([to_bin(cart_position, cart_position_bins),
                             to_bin(pole_angle, pole_angle_bins),
                             to_bin(cart_velocity, cart_velocity_bins),
                             to_bin(angle_rate_of_change, angle_rate_bins)])

        action = learner.set_initial_state(state)

        for step in xrange(max_number_of_steps - 1):
            env.render()

            observation, reward, done, info = env.step(action)

            cart_position, cart_velocity, pole_angle, angle_rate_of_change = observation
            state_prime = build_state([to_bin(cart_position, cart_position_bins),
                                       to_bin(pole_angle, pole_angle_bins),
                                       to_bin(cart_velocity, cart_velocity_bins),
                                       to_bin(angle_rate_of_change, angle_rate_bins)])

            # Penalize the terminal state so that dropping the pole is strongly discouraged.
            if done:
                reward = -100

            action = learner.move(state_prime, reward)

            if done:
                last_time_steps = np.append(last_time_steps, [int(step + 1)])
                if len(last_time_steps) > number_of_iterations_to_average:
                    last_time_steps = np.delete(last_time_steps, 0)
                break

        # Stop once the average over the last 100 recorded episodes beats the goal.
        if last_time_steps.mean() > goal_average_steps:
            print "Goal reached!"
            print "Episodes before solve: ", episode + 1
            print u"Best 100-episode performance {} {} {}".format(last_time_steps.max(),
                                                                  unichr(177),  # plus-minus sign
                                                                  last_time_steps.std())
            break
if __name__ == "__main__":
    cart_pole_with_qlearning()