# Following simple Q-learning tutorial
# Follows along with the sample from this page:
# https://www.oreilly.com/learning/introduction-to-reinforcement-learning-and-openai-gym
# with a few modifications to look at what's going on under the hood.
import gym
import numpy as np
import matplotlib.pyplot as plt
# Tabular Q-learning on the Gym Taxi environment. Runs 1000 episodes, prints
# the episode's total reward every 50th episode, then plots the total reward
# of every episode.
env = gym.make("Taxi-v2")

# One row per discrete state, one column per action; every estimate starts at 0.
Q = np.zeros([env.observation_space.n, env.action_space.n])

# Learning rate: scales how far each update moves Q[state, action] toward the
# new target value.
alpha = 0.618  # Don't understand where this value came from yet

history = []  # total reward (G) of each episode, in order

for episode in range(1, 1001):
    done = False
    G = 0  # reward accumulated during this episode
    # NOTE(review): this expects env.reset() to return the state directly and
    # env.step() to return a 4-tuple -- confirm against the installed gym.
    state = env.reset()
    while not done:
        # Always take the action with the highest current estimate
        # (no random exploration step).
        action = np.argmax(Q[state])
        state2, reward, done, info = env.step(action)
        # Move the estimate toward reward + best estimate of the next state;
        # no discount factor is applied to the next-state value.
        Q[state, action] += alpha * (reward + np.max(Q[state2]) - Q[state, action])
        G += reward
        state = state2
    if episode % 50 == 0:
        print('Episode {} Total Reward: {}'.format(episode, G))
    # NOTE(review): the original paste lost its indentation; recording G once
    # per episode is assumed here -- confirm it was not meant to sit inside
    # the `if` above.
    history.append(G)
    # print(np.count_nonzero(Q))

plt.plot(history)
plt.show()
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
# You can’t perform that action at this time.