CartPole-v0 with SARSA
import gym
import gym.spaces
import autograd.numpy as np
from autograd import grad

# approx and policy are defined in the helper file below; the gist does not show
# its file name, so "sarsa_helpers" is only an assumed placeholder here.
from sarsa_helpers import approx, policy

gym.logger.set_level(40)  # Only show gym errors
print(gym.__version__)

dapprox = grad(approx)  # Gradient of the linear approximator w.r.t. the weights

discount = 1.0        # Discount rate
epsilon = 0.2         # Exploration rate
alpha = 0.1           # Step size for gradient descent
w = np.zeros((4, 2))  # Initialize weights
num_episodes = 1000   # Number of games for the agent to play
max_steps = 200       # Be fair and don't harvest points above the solution limit

env = gym.make('CartPole-v0')
episode_rewards = []

for ep in range(num_episodes):
    state = env.reset()
    rewards = []
    for _ in range(max_steps):
        # Take an action based on the epsilon-greedy policy
        action = policy(env, w, state, epsilon)
        q_hat = approx(w, state, action)
        q_hat_grad = dapprox(w, state, action)
        next_state, reward, done, _ = env.step(action)
        rewards.append(reward)
        # Render into buffer.
        env.render()
        if done:
            w += alpha * (reward - q_hat) * q_hat_grad
            break
        else:
            # Update weights to maximize the discounted reward
            next_action = policy(env, w, next_state, epsilon)
            q_hat_next = approx(w, next_state, next_action)
            w += alpha * (reward - discount * q_hat_next) * q_hat_grad
            state = next_state
    # Exploration / exploitation trade-off:
    # as we learn more about the game, become more certain in making decisions
    if ep == 100:
        epsilon /= 2
    episode_rewards.append(np.sum(rewards))
    mean_reward = np.mean(episode_rewards[max(ep - 100, 0):ep + 1])
    # Report on progress - did we solve the task already?
    if mean_reward >= 195.0 and ep >= 100:
        print("Episodes before solve {}".format(ep - 100 + 1))
        break
    if ((ep % 100) == 0) and ep > 0:
        print("Episode {}/{} finished. Mean reward over last 100 episodes: {:.2f}"
              .format(ep, num_episodes, mean_reward))
env.close()
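A short sketch for inspecting the learning curve once the loop above finishes, assuming matplotlib is installed:

import matplotlib.pyplot as plt

plt.plot(episode_rewards)       # total reward collected in each episode
plt.xlabel("Episode")
plt.ylabel("Total reward")
plt.title("CartPole-v0 with linear SARSA")
plt.show()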
import autograd.numpy as np
from autograd import grad, elementwise_grad
import random

# Linear approximation function to expected returns
def approx(weights, observation, action):
    return np.dot(observation, weights)[action]

# Random or learned policy, selected by epsilon
def policy(env, weights, observation, epsilon):
    actions = [0, 1]
    if np.random.rand() < epsilon:
        return random.choice(actions)
    qs = []
    for action in actions:
        qs.append(approx(weights, observation, action))
    return np.argmax(qs)
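A minimal sanity check of these helpers (a sketch, assuming the shapes used in the training script: a 4-dimensional observation and two discrete actions; env is unused by policy, so None can be passed):

w = np.zeros((4, 2))                     # same weight shape as in the training loop
obs = np.array([0.0, 0.1, -0.05, 0.2])   # illustrative CartPole observation
print(approx(w, obs, 0))                 # estimated return for action 0 (here 0.0)
print(policy(None, w, obs, 1.0))         # epsilon = 1.0: always a random action
print(policy(None, w, obs, 0.0))         # epsilon = 0.0: greedy w.r.t. the Q estimates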
Yes, I actually compared with @ceteke's code on the leaderboard: https://github.com/ceteke/RL/blob/master/Approximation/Linear%20Sarsa.ipynb
Your code is pretty much the same as his; both use the same update:
w += alpha*(reward - discount*q_hat_next)*q_hat_grad
But I cannot figure out the idea behind it.
Can @martinholub comment on this? I'm running into the same issue and don't understand why the update rule is w += alpha*(reward - discount*q_hat_next)*q_hat_grad rather than w += alpha*(reward + discount*q_hat_next - q_hat)*q_hat_grad.
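For context, the second form matches the one-step semi-gradient SARSA update from Sutton & Barto; a minimal sketch using the gist's variable names:

td_target = reward + discount * q_hat_next   # bootstrapped TD target
td_error = td_target - q_hat                 # temporal-difference error
w += alpha * td_error * q_hat_grad           # semi-gradient weight update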
Sorry for the delayed reply.
Could you compare with some of these: https://github.com/openai/gym/wiki/Leaderboard#cartpole-v0?