Acrobot-v0 solved with SARSA-lambda using Gaussian radial basis functions
# SARSA-lambda with Gaussian radial basis functions for action-value approximation
# Implemented for the OpenAI gym acrobot environment
# Written by Evan Gravelle
# 7/31/2016
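# Method summary: the observation is rescaled to [0, 1] in each dimension and mapped to
# Gaussian RBF activations phi_k(s) = exp(-||s - c_k||^2 / (2 * rbf_sigma^2)), with the
# centers c_k on a uniform grid. Each action value is linear in these features,
# Q(s, a) = dot(theta[:, a], phi(s)), and theta is learned with SARSA(lambda) using
# replacing eligibility traces and an epsilon-greedy policy.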
import gym
import numpy as np
import matplotlib.pyplot as plt
# Initializations
env = gym.make('Acrobot-v0')
env.monitor.start('./tmp/acrobot-1', force=True)
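# Note: env.monitor and Acrobot-v0 are the 2016-era gym API; later gym releases moved to
# gym.wrappers.Monitor and Acrobot-v1.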
num_actions = env.action_space.n
dim = env.observation_space.high.size
# Parameters
discrt = 4
num_rbf = discrt * np.ones(dim).astype(int)
width = 1. / (num_rbf - 1.)
rbf_sigma = width[0] / 2.
epsilon = 0.1
epsilon_final = 0.1
Lambda = 0.5
alpha = 0.012
gamma = 0.99
num_episodes = 1000
num_timesteps = 200
xbar = np.zeros((2, dim))
xbar[0, :] = env.observation_space.low
xbar[1, :] = env.observation_space.high
num_ind = np.prod(num_rbf)
activations = np.zeros(num_ind)
new_activations = np.zeros(num_ind)
theta = np.zeros((num_ind, num_actions))
rbf_den = 2 * rbf_sigma ** 2
epsilon_coefficient = (epsilon_final / epsilon) ** (1. / num_episodes)  # per-episode factor so epsilon anneals geometrically from epsilon to epsilon_final
ep_length = np.zeros(num_episodes)
np.set_printoptions(precision=2)
# Construct ndarray of rbf centers
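# Index i written in base discrt (zero-padded to dim digits) gives the grid coordinates of
# center i, so the centers form a uniform discrt**dim lattice over the normalized state space.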
c = np.zeros((num_ind, dim))
for i in range(num_ind):
    if i == 0:
        pad_num = dim
    else:
        pad_num = dim - int(np.log(i) / np.log(discrt)) - 1
    ind = np.base_repr(i, base=discrt, padding=pad_num)
    ind = np.asarray([float(j) for j in list(ind)])
    c[i, :] = width * ind
# Returns the state scaled between 0 and 1
def normalize_state(_s):
    _y = np.zeros(len(_s))
    for _i in range(len(_s)):
        _y[_i] = (_s[_i] - xbar[0, _i]) / (xbar[1, _i] - xbar[0, _i])
    return _y
# Returns an ndarray of radial basis function activations
def phi(_state):
    _phi = np.zeros(num_ind)
    for _k in range(num_ind):
        _phi[_k] = np.exp(-np.linalg.norm(_state - c[_k, :]) ** 2 / rbf_den)
    return _phi
# Returns an action following an epsilon-greedy policy
def epsilon_greedy(_epsilon, _vals):
    _rand = np.random.random()
    if _rand < 1. - _epsilon:
        _action = _vals.argmax()
    else:
        _action = env.action_space.sample()
    return int(_action)
# Returns the value of each action at some state
def action_values(_activations, _theta):
    _val = np.dot(_theta.T, _activations)
    return _val
# Returns the value of an action at some state
def action_value(_activations, _action, _theta):
    _val = np.dot(_theta[:, _action], _activations)
    return _val
# SARSA loop
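# Each step: compute the TD error delta = r + gamma * Q(s', a') - Q(s, a) (the bootstrap term
# is dropped on termination), set the replacing trace e[:, a] = phi(s) for the chosen action,
# take a gradient step theta += alpha * delta * e, then decay all traces by gamma * lambda.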
for ep in range(num_episodes):
    e = np.zeros((num_ind, num_actions))
    state = normalize_state(env.reset())
    activations = phi(state)
    # print "activations = ", np.reshape(activations.ravel(order='F'), (num_rows, num_cols))
    vals = action_values(activations, theta)
    action = epsilon_greedy(epsilon, vals)
    # Each episode
    for t in range(num_timesteps):
        env.render()
        new_state, reward, done, info = env.step(action)
        new_state = normalize_state(new_state)
        new_activations = phi(new_state)
        new_vals = action_values(new_activations, theta)
        new_action = epsilon_greedy(epsilon, new_vals)
        Q = action_value(activations, action, theta)
        Q_new = action_value(new_activations, new_action, theta)
        if done:
            target = reward - Q
        else:
            target = reward + gamma * Q_new - Q
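        # Replacing trace: overwrite the chosen action's trace with the current activations
        # rather than accumulating them; traces for the other actions just keep decaying.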
        # e[:, action] += activations  # accumulating traces
        e[:, action] = activations  # replacing traces
        for k in range(num_ind):
            for a in range(num_actions):
                theta[k, a] += alpha * target * e[k, a]
        e *= gamma * Lambda
        # Debug printing, disabled (t % 1 is never nonzero); num_rows and num_cols are not defined in this script.
        if t % 1 != 0:
            print "t = ", t
            print "new_state = ", new_state
            print "new_activations = ", np.reshape(new_activations.ravel(order='F'), (num_rows, num_cols))
            print "new_vals", new_vals
            print "Q = ", Q
            print "Q_new = ", Q_new
            print "action = ", action
            print "target = ", target
            print "e =", e
            print "theta = \n", np.reshape(theta.ravel(order='F'), (num_actions, num_rows, num_cols))
            print "---------------------------------------------------------------------------"
        state = new_state.copy()
        activations = new_activations.copy()
        action = new_action
        if done:
            break
    ep_length[ep] = t
    epsilon *= epsilon_coefficient
plt.close('all')
plt.figure(1)
plt.plot(ep_length)
plt.title('Episode Length')
plt.ylabel('Completion Time')
plt.xlabel('Episode')
plt.show()
env.monitor.close()
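# Optional sanity check (a minimal sketch, not part of the training loop above): roll out the
# learned weights greedily for one episode, with no exploration and no updates.
eval_state = normalize_state(env.reset())
for _ in range(num_timesteps):
    eval_vals = action_values(phi(eval_state), theta)
    eval_obs, eval_reward, eval_done, _ = env.step(int(eval_vals.argmax()))
    eval_state = normalize_state(eval_obs)
    if eval_done:
        break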