Create a gist now

Instantly share code, notes, and snippets.

Embed
from collections import defaultdict
import math
import numpy as np
import gym
# Author Andrea Sessa
# This is a modified version of https://gym.openai.com/evaluations/eval_g0U5ZP6YQyyA6mV5fDuKg
# It uses Q-Learning in place of Temporal Difference; parameters have been tuned to achieve
# the maximum score in about 1000 fewer episodes.

# Angular-velocity threshold (50 degrees expressed in radians) used by
# QLearner.encode_state() to band theta_dot into three discrete bins.
FIFTY_DEGREES_IN_RADIANS = 0.87266
class QLearner:
    """Tabular Q-learning agent for CartPole with a hand-crafted discretiser.

    Adapted from https://gym.openai.com/evaluations/eval_g0U5ZP6YQyyA6mV5fDuKg,
    with the TD update replaced by Q-learning and the hyper-parameters re-tuned.
    """

    def __init__(self, env):
        self.env = env
        self.epsilon = 0.75       # exploration probability, decayed over time
        self.eps_decay = 0.99     # multiplicative epsilon decay factor
        self.gamma = 0.95         # discount factor
        self.alpha = 0.5          # learning rate
        self.max_episodes = 4000
        self.max_step = 3000
        self.Q = []               # action-value table, allocated in learn()

    # Discretisation scheme borrowed from
    # https://gym.openai.com/evaluations/eval_g0U5ZP6YQyyA6mV5fDuKg
    def encode_state(self, state):
        """Map a continuous observation onto one of 163 discrete boxes.

        Box 0 is the absorbing out-of-bounds state; every other index is the
        sum of band offsets for x, x_dot, theta and theta_dot (scheme from
        https://webdocs.cs.ualberta.ca/~sutton/book/code/pole.c).

        Args:
            state (list): A raw observation [x, x_dot, theta, theta_dot].

        Returns:
            int: A discrete state index in [0, 162].
        """
        x, x_dot, theta, theta_dot = state
        theta_limit = self.env.theta_threshold_radians
        x_limit = self.env.x_threshold

        # Leaving either limit collapses everything into the failure box 0.
        if not (-x_limit < x < x_limit) or not (-theta_limit < theta < theta_limit):
            return 0

        # Cart position: three bands -> base index 1..3.
        if x < -0.8:
            box = 1
        elif x < 0.8:
            box = 2
        else:
            box = 3

        # Cart velocity: offsets 0 / 3 / 6.
        if not x_dot < -0.5:
            box += 3 if x_dot < 0.5 else 6

        # Pole angle: six bands, offsets 0, 9, 18, 27, 36, 45.
        angle_bands = (
            (-theta_limit / 2, 0),
            (-theta_limit / 12, 9),
            (0, 18),
            (theta_limit / 12, 27),
            (theta_limit / 2, 36),
        )
        for bound, offset in angle_bands:
            if theta < bound:
                box += offset
                break
        else:
            box += 45

        # Pole angular velocity: offsets 0 / 54 / 108.
        if not theta_dot < -FIFTY_DEGREES_IN_RADIANS:
            box += 54 if theta_dot < FIFTY_DEGREES_IN_RADIANS else 108

        return box

    # Epsilon-greedy policy: with probability `eps` pick a uniformly random
    # action, otherwise the greedy (argmax-Q) one. The greedy action is
    # computed first so the RNG call sequence is rand(), then randint() only
    # when exploring.
    def epsilon_greedy(self, state, q_values, eps):
        greedy = np.argmax(q_values[state, :])
        explore = np.random.rand() < eps
        return np.random.randint(q_values.shape[1]) if explore else greedy

    def learn(self, render=True):
        """Run tabular Q-learning over the discretised environment.

        Args:
            render (bool): When True, draw the environment once per episode.
        """
        ave_cumu_r = None      # exponential moving average of episode returns
        n_states = 163         # encode_state() yields indices 0..162
        n_actions = self.env.action_space.n
        # Initialization of the action-value function.
        self.Q = np.zeros(shape=(n_states, n_actions))

        for episode in range(self.max_episodes):
            episode_return = 0
            # Shall I show the pole?
            if render:
                self.env.render()
            state = self.encode_state(self.env.reset())

            for step in range(self.max_step):
                action = self.epsilon_greedy(state, self.Q, self.epsilon)
                raw_next, reward, done, _ = self.env.step(action)
                next_state = self.encode_state(raw_next)
                # Q-learning update: move Q(s,a) toward r + gamma * max_a' Q(s',a').
                target = reward + (self.gamma * max(self.Q[next_state, :]))
                self.Q[state, action] += self.alpha * (target - self.Q[state, action])
                state = next_state
                episode_return = reward + self.gamma * episode_return

                # Terminal state!
                if done:
                    # Maintain a slow EMA of returns; decay epsilon only when
                    # the latest episode beat the running average.
                    k = 0.01
                    if ave_cumu_r is None:
                        ave_cumu_r = episode_return
                    else:
                        ave_cumu_r = k * episode_return + (1 - k) * ave_cumu_r
                    if episode_return > ave_cumu_r:
                        self.epsilon *= self.eps_decay
                    print("Episode {} ended on step {} with reward: {}".format(episode, step, ave_cumu_r))
                    break
def main():
    """Train the Q-learner on CartPole-v0 while recording the run."""
    # NOTE(review): env.monitor is the pre-wrappers gym recording API —
    # confirm the installed gym version still exposes it.
    environment = gym.make('CartPole-v0')
    environment.monitor.start('/tmp/cartpole-experiment-1', force=True)
    QLearner(environment).learn()
    environment.monitor.close()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment