OpenAI Gym CartPole-v0
Cart-pole balancing solved using the Q-learning algorithm.
https://gym.openai.com/envs/CartPole-v0
https://gym.openai.com/evaluations/eval_kWknKOkPQ7izrixdhriurA
To run:
python CartPole-v0.py
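
The script discretizes the four-dimensional CartPole observation into 10 bins per
feature and learns a table of Q-values over the resulting states. The update applied
after every step (see QLearner.move below), with alpha the learning rate and gamma
the discount factor, is

    Q[s, a] = (1 - alpha) * Q[s, a] + alpha * (reward + gamma * Q[s_prime, a_prime])

where a_prime is the action chosen for the new state s_prime, either greedily from
the table or at random with probability random_action_rate.
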
import gym
import pandas as pd
import numpy as np
import random
# https://gym.openai.com/envs/CartPole-v0
# Carlos Aguayo - carlos.aguayo@gmail.com
class QLearner(object):
    def __init__(self,
                 num_states=100,
                 num_actions=4,
                 alpha=0.2,
                 gamma=0.9,
                 random_action_rate=0.5,
                 random_action_decay_rate=0.99):
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha
        self.gamma = gamma
        self.random_action_rate = random_action_rate
        self.random_action_decay_rate = random_action_decay_rate
        self.state = 0
        self.action = 0
        # Q-table with one row per discrete state, initialised with small random values.
        self.qtable = np.random.uniform(low=-1, high=1, size=(num_states, num_actions))

    def set_initial_state(self, state):
        """
        @summary: Sets the initial state and returns an action
        @param state: The initial state
        @returns: The selected action
        """
        self.state = state
        # Greedy action: index of the largest Q-value for this state.
        self.action = self.qtable[state].argsort()[-1]
        return self.action

    def move(self, state_prime, reward):
        """
        @summary: Moves to the given state with given reward and returns action
        @param state_prime: The new state
        @param reward: The reward
        @returns: The selected action
        """
        alpha = self.alpha
        gamma = self.gamma
        state = self.state
        action = self.action
        qtable = self.qtable

        # Epsilon-greedy selection: explore with probability random_action_rate.
        choose_random_action = (1 - self.random_action_rate) <= np.random.uniform(0, 1)

        if choose_random_action:
            action_prime = random.randint(0, self.num_actions - 1)
        else:
            action_prime = self.qtable[state_prime].argsort()[-1]

        # Decay the exploration rate after every step.
        self.random_action_rate *= self.random_action_decay_rate

        # Blend the old estimate with the reward plus the discounted value of (s', a').
        qtable[state, action] = (1 - alpha) * qtable[state, action] + alpha * (reward + gamma * qtable[state_prime, action_prime])

        self.state = state_prime
        self.action = action_prime

        return self.action

def cart_pole_with_qlearning():
    env = gym.make('CartPole-v0')
    experiment_filename = './cartpole-experiment-1'
    env.monitor.start(experiment_filename, force=True)

    goal_average_steps = 195
    max_number_of_steps = 200
    number_of_iterations_to_average = 100
    number_of_features = env.observation_space.shape[0]
    last_time_steps = np.ndarray(0)

    # Discretise each observation dimension into 10 bins; keep only the interior bin edges.
    cart_position_bins = pd.cut([-2.4, 2.4], bins=10, retbins=True)[1][1:-1]
    pole_angle_bins = pd.cut([-2, 2], bins=10, retbins=True)[1][1:-1]
    cart_velocity_bins = pd.cut([-1, 1], bins=10, retbins=True)[1][1:-1]
    angle_rate_bins = pd.cut([-3.5, 3.5], bins=10, retbins=True)[1][1:-1]

    def build_state(features):
        # Concatenate the four bin indices into a single integer state id (up to 10**4 states).
        return int("".join(map(lambda feature: str(int(feature)), features)))

    def to_bin(value, bins):
        return np.digitize(x=[value], bins=bins)[0]

    learner = QLearner(num_states=10 ** number_of_features,
                       num_actions=env.action_space.n,
                       alpha=0.2,
                       gamma=1,
                       random_action_rate=0.5,
                       random_action_decay_rate=0.99)

    for episode in xrange(50000):
        observation = env.reset()
        # NOTE: gym's CartPole observation is ordered (cart position, cart velocity,
        # pole angle, pole angular velocity); the variable names below follow the
        # original gist, which swaps the middle two.
        cart_position, pole_angle, cart_velocity, angle_rate_of_change = observation
        state = build_state([to_bin(cart_position, cart_position_bins),
                             to_bin(pole_angle, pole_angle_bins),
                             to_bin(cart_velocity, cart_velocity_bins),
                             to_bin(angle_rate_of_change, angle_rate_bins)])
        action = learner.set_initial_state(state)

        for step in xrange(max_number_of_steps - 1):
            observation, reward, done, info = env.step(action)

            cart_position, pole_angle, cart_velocity, angle_rate_of_change = observation
            state_prime = build_state([to_bin(cart_position, cart_position_bins),
                                       to_bin(pole_angle, pole_angle_bins),
                                       to_bin(cart_velocity, cart_velocity_bins),
                                       to_bin(angle_rate_of_change, angle_rate_bins)])

            if done:
                # Large penalty when the episode terminates before the step limit.
                reward = -200

            action = learner.move(state_prime, reward)

            if done:
                # Keep a sliding window of the last 100 episode lengths.
                last_time_steps = np.append(last_time_steps, [int(step + 1)])
                if len(last_time_steps) > number_of_iterations_to_average:
                    last_time_steps = np.delete(last_time_steps, 0)
                break

        # CartPole-v0 counts as solved once the 100-episode average exceeds 195 steps.
        if last_time_steps.mean() > goal_average_steps:
            print "Goal reached!"
            print "Episodes before solve: ", episode + 1
            print u"Best 100-episode performance {} {} {}".format(last_time_steps.max(),
                                                                  unichr(177),  # plus-minus sign
                                                                  last_time_steps.std())
            break

    env.monitor.close()

if __name__ == "__main__":
    random.seed(0)
    cart_pole_with_qlearning()
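
Note: the script targets Python 2 and the gym API from early 2017 (env.monitor,
xrange, print statements, unichr). A minimal sketch of the equivalent monitoring
setup under Python 3 and a somewhat newer gym release, assuming a version that
still ships gym.wrappers.Monitor (later releases removed it, so check your
installed version):

    import gym

    # Monitoring moved from env.monitor to a wrapper in later gym releases.
    env = gym.wrappers.Monitor(gym.make('CartPole-v0'), './cartpole-experiment-1', force=True)
    observation = env.reset()
    observation, reward, done, info = env.step(env.action_space.sample())
    env.close()

Elsewhere, xrange(...) becomes range(...), the print statements become print(...)
calls, and unichr(177) becomes chr(177).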