
# Sessa93/QCartPole.py

Last active Aug 25, 2016

```python
import numpy as np
import gym

# Author: Andrea Sessa
# This is a modified version of https://gym.openai.com/evaluations/eval_g0U5ZP6YQyyA6mV5fDuKg
# It uses Q-Learning in place of Temporal Difference; parameters have been tuned to achieve
# the maximum score in about 1000 fewer episodes.

FIFTY_DEGREES_IN_RADIANS = 0.87266

class QLearner:
    def __init__(self, env):
        self.env = env
        self.epsilon = 0.75
        self.Q = []
        self.gamma = 0.95
        self.max_episodes = 4000
        self.max_step = 3000
        self.alpha = 0.5
        self.eps_decay = 0.99

    # Stolen from https://gym.openai.com/evaluations/eval_g0U5ZP6YQyyA6mV5fDuKg
    def encode_state(self, state):
        """
        Converts a raw continuous state into one of the discrete states
        (see https://webdocs.cs.ualberta.ca/~sutton/book/code/pole.c)

        Args:
            state (list): A raw state, i.e. a list of x, x_dot, theta and theta_dot.
        Returns:
            box (int): A discrete state.
        """
        x, x_dot, theta, theta_dot = state
        env = self.env
        x_limit, theta_limit = env.x_threshold, env.theta_threshold_radians
        half_theta_limit = theta_limit / 2
        one_twelveth_theta_limit = theta_limit / 12

        cart_in_limits = -x_limit < x < x_limit
        pole_in_limits = -theta_limit < theta < theta_limit
        # Box 0 is reserved for out-of-bounds (failure) states
        if not cart_in_limits or not pole_in_limits:
            return 0

        box = (1 if x < -0.8 else
               2 if x < 0.8 else
               3)

        if x_dot < -0.5:
            pass
        elif x_dot < 0.5:
            box += 3
        else:
            box += 6

        if theta < -half_theta_limit:
            pass
        elif theta < -one_twelveth_theta_limit:
            box += 9
        elif theta < 0:
            box += 18
        elif theta < one_twelveth_theta_limit:
            box += 27
        elif theta < half_theta_limit:
            box += 36
        else:
            box += 45

        if theta_dot < -FIFTY_DEGREES_IN_RADIANS:
            pass
        elif theta_dot < FIFTY_DEGREES_IN_RADIANS:
            box += 54
        else:
            box += 108

        return box

    # Epsilon-greedy policy: with probability epsilon perform a random action,
    # otherwise pick the greedy action
    def epsilon_greedy(self, state, q_values, eps):
        a = np.argmax(q_values[state, :])
        if np.random.rand() < eps:
            a = np.random.randint(q_values.shape[1])
        return a

    def learn(self, render=True):
        ave_cumu_r = None
        n_s = 163
        n_a = self.env.action_space.n

        # Initialization of the action-value function
        self.Q = np.zeros(shape=(n_s, n_a))

        for e in range(self.max_episodes):
            cum_rw = 0

            # Shall I show the pole?
            if render:
                self.env.render()

            s = self.encode_state(self.env.reset())
            for step in range(self.max_step):
                a = self.epsilon_greedy(s, self.Q, self.epsilon)
                sp, rw, done, _ = self.env.step(a)
                sp = self.encode_state(sp)

                # Q-Learning update rule
                self.Q[s, a] = self.Q[s, a] + self.alpha * (rw + self.gamma * max(self.Q[sp, :]) - self.Q[s, a])

                s = sp
                # Running discounted reward, used below to adapt epsilon
                cum_rw = rw + self.gamma * cum_rw

                # Terminal state!
                if done:
                    k = 0.01
                    if ave_cumu_r is None:
                        ave_cumu_r = cum_rw
                    else:
                        ave_cumu_r = k * cum_rw + (1 - k) * ave_cumu_r

                    # Decay epsilon only when the episode beats the running average
                    if cum_rw > ave_cumu_r:
                        self.epsilon *= self.eps_decay

                    print("Episode {} ended on step {} with average reward: {}".format(e, step, ave_cumu_r))
                    break

def main():
    env = gym.make('CartPole-v0')
    env.monitor.start('/tmp/cartpole-experiment-1', force=True)
    learner = QLearner(env)
    learner.learn()
    env.monitor.close()

if __name__ == "__main__":
    main()
```
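
For reference (a note of mine, not part of the original gist), the line tagged "Q-Learning update rule" above implements the standard one-step Q-Learning update, with learning rate `alpha = 0.5` and discount `gamma = 0.95` as set in `__init__`:

```latex
Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right]
```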
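
As a quick sanity check (mine, not from the gist): `encode_state` maps the 4-D state into one of 163 boxes, with index 0 reserved for out-of-bounds states and 1-162 for in-bounds ones, which matches `n_s = 163` in `learn`. For example:

```python
import gym

env = gym.make('CartPole-v0')
learner = QLearner(env)

# Cart centered and pole upright, everything at rest:
# x -> box 2, x_dot -> +3, theta -> +27, theta_dot -> +54
print(learner.encode_state([0.0, 0.0, 0.0, 0.0]))  # prints 86
```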
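
One caveat: `env.monitor.start` / `env.monitor.close` match the gym API current when this gist was written (mid-2016). On later gym releases where `env.monitor` was removed in favor of `gym.wrappers.Monitor` (but `env.step` still returns a 4-tuple), a minimal sketch of an equivalent `main` might look like this; the output directory is just the same example path as the original:

```python
# Sketch only: assumes a later gym release with gym.wrappers.Monitor
# and the 4-tuple env.step return used by QLearner.learn above.
import gym
from gym import wrappers

def main():
    env = gym.make('CartPole-v0')
    # force=True overwrites any previous recording in the directory
    env = wrappers.Monitor(env, '/tmp/cartpole-experiment-1', force=True)
    learner = QLearner(env)
    learner.learn(render=False)  # the Monitor records; skip manual rendering
    env.close()
```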