import gym
import gym.spaces
gym.logger.set_level(40)
gym.__version__

dapprox = grad(approx)

discount = 1.0        # Discount rate
epsilon = 0.2         # Exploration rate
alpha = 0.1           # Step size for gradient descent
w = np.zeros((4, 2))  # Initialize weights
num_episodes = 1000   # Number of games for the agent to play
max_steps = 200       # Be fair and don't harvest points above the solution limit

env = gym.make('CartPole-v0')
episode_rewards = []

for ep in range(num_episodes):
    state = env.reset()
    rewards = []
    for _ in range(max_steps):
        # Take smart action based on defined policy
        action = policy(env, w, state, epsilon)
        q_hat = approx(w, state, action)
        q_hat_grad = dapprox(w, state, action)
        next_state, reward, done, _ = env.step(action)
        rewards.append(reward)
        # Render into buffer.
        env.render()
        if done:
            w += alpha*(reward - q_hat)*q_hat_grad
            break
        else:
            # Update weights toward the discounted reward
            next_action = policy(env, w, next_state, epsilon)
            q_hat_next = approx(w, next_state, next_action)
            w += alpha*(reward - discount*q_hat_next)*q_hat_grad
            state = next_state
    # Exploration / Exploitation tradeoff:
    # as we learn more about the game, become more certain when making decisions
    if ep == 100:
        epsilon /= 2
    episode_rewards.append(np.sum(rewards))
    mean_reward = np.mean(episode_rewards[max(ep-100, 0):ep+1])
    # Report on progress - did we solve the task already?
    if mean_reward >= 195.0 and ep >= 100:
        print("Episodes before solve {}".format(ep-100+1))
        break
    if ((ep % 100) == 0) and ep > 0:
        print("Episode {}/{} finished. Mean reward over last 100 episodes: {:.2f}"
              .format(ep, num_episodes, mean_reward))
env.close()
import autograd.numpy as np
from autograd import grad, elementwise_grad
import random

# Linear approximation function to expected returns
def approx(weights, observation, action):
    return np.dot(observation, weights)[action]

# Random or learned policy, selected by epsilon
def policy(env, weights, observation, epsilon):
    actions = [0, 1]
    if np.random.rand() < epsilon:
        return random.choice(actions)
    qs = []
    for action in actions:
        qs.append(approx(weights, observation, action))
    return np.argmax(qs)
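As a quick sanity check of these helpers (a sketch, not part of the original gist: it assumes the definitions above are in scope, the observation values are arbitrary, and since policy never reads its env argument, None suffices):

w = np.zeros((4, 2))                     # 4 CartPole state features, 2 actions
obs = np.array([0.0, 0.1, -0.02, 0.0])   # made-up observation, just for illustration
print(approx(w, obs, 0))                 # estimated return of action 0: 0.0 for zero weights
print(policy(None, w, obs, 0.0))         # epsilon=0 gives the greedy action; argmax of equal q-values is 0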
I have the same question. But I tested it: if it is changed to
w += alpha*(reward + discount*q_hat_next - q_hat)*q_hat_grad
then it never converges. Anybody have ideas?
Sorry for the delayed reply.
Could you compare with some of these: https://github.com/openai/gym/wiki/Leaderboard#cartpole-v0?
Yes, I actually compared with @ceteke's code on the leaderboard: https://github.com/ceteke/RL/blob/master/Approximation/Linear%20Sarsa.ipynb
Your code is pretty much the same as his; both use the same update:
w += alpha*(reward - discount*q_hat_next)*q_hat_grad
But I cannot figure out the idea behind it.
Can @martinholub comment on this? I'm coming across the same issue and don't understand why the update rule is w += alpha*(reward - discount*q_hat_next)*q_hat_grad
rather than w += alpha*(reward + discount*q_hat_next - q_hat)*q_hat_grad.
Line 35:
w += alpha*(reward - discount*q_hat_next)*q_hat_grad
Is this correct?
Personally, I think it should be w += alpha*(reward + discount*q_hat_next - q_hat)*q_hat_grad.
Feel free to correct me if I'm wrong.
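For reference, the textbook semi-gradient SARSA(0) update that the commenters are quoting (Sutton and Barto, Reinforcement Learning: An Introduction, 2nd ed., Sec. 10.1) would look like this in the gist's variable names. This is a sketch of the suggestion discussed above, not the gist author's code:

# TD error: bootstrapped target (reward + discount * q(s', a')) minus the current estimate q(s, a)
td_error = reward + discount*q_hat_next - q_hat
# Semi-gradient step on the linear approximator's weights
w += alpha * td_error * q_hat_grad
# At a terminal transition q(s', a') is taken as 0, so the target reduces to the reward alone,
# which matches the done-branch update already in the gist: w += alpha*(reward - q_hat)*q_hat_grad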