# Roger-random/taxi_q.py

## taxi_q.py
# Following along with the sample from this page:
# https://www.oreilly.com/learning/introduction-to-reinforcement-learning-and-openai-gym
# with a few modifications to look at what's going on under the hood.

import gym
import numpy as np
import matplotlib.pyplot as plt

# Tabular Q-learning on the Gym "Taxi-v2" environment: train for a fixed
# number of episodes, then plot the total reward collected in each episode.

NUM_EPISODES = 1000  # training episodes, numbered 1..NUM_EPISODES
REPORT_EVERY = 50  # print a progress line every this many episodes

env = gym.make("Taxi-v2")

# Q-table: one row per discrete state, one column per discrete action,
# every entry starting at zero.
Q = np.zeros([env.observation_space.n, env.action_space.n])

# Learning rate (step size of each Q update). Original author's note: the
# source of this particular value is not understood yet.
alpha = 0.618

history = []  # total reward of each finished episode, in episode order

for episode in range(1, NUM_EPISODES + 1):
  done = False
  G = 0  # reward accumulated so far in this episode
  state = env.reset()
  while not done:
    # Always act greedily with respect to the current Q estimates.
    action = np.argmax(Q[state])
    state2, reward, done, info = env.step(action)
    # Move Q[state, action] a fraction alpha of the way toward the one-step
    # target: the immediate reward plus the best value currently estimated
    # for the next state (no discount factor is applied).
    Q[state, action] += alpha * (reward + np.max(Q[state2]) - Q[state, action])
    G += reward
    state = state2
  if episode % REPORT_EVERY == 0:
    print('Episode {} Total Reward: {}'.format(episode, G))
  history.append(G)
  # Debug aid: uncomment to watch how many Q entries are non-zero so far.
  # print(np.count_nonzero(Q))

plt.plot(history)
plt.show()
	# Following along with the sample from this page:
	# https://www.oreilly.com/learning/introduction-to-reinforcement-learning-and-openai-gym
	# with a few modifications to look at what's going on under the hood.

	import gym
	import numpy as np
	import matplotlib.pyplot as plt

	# Tabular Q-learning on the Gym "Taxi-v2" environment: train for a fixed
	# number of episodes, then plot the total reward collected in each episode.

	NUM_EPISODES = 1000  # training episodes, numbered 1..NUM_EPISODES
	REPORT_EVERY = 50  # print a progress line every this many episodes

	env = gym.make("Taxi-v2")

	# Q-table: one row per discrete state, one column per discrete action,
	# every entry starting at zero.
	Q = np.zeros([env.observation_space.n, env.action_space.n])

	# Learning rate (step size of each Q update). Original author's note: the
	# source of this particular value is not understood yet.
	alpha = 0.618

	history = []  # total reward of each finished episode, in episode order

	for episode in range(1, NUM_EPISODES + 1):
	  done = False
	  G = 0  # reward accumulated so far in this episode
	  state = env.reset()
	  while not done:
	    # Always act greedily with respect to the current Q estimates.
	    action = np.argmax(Q[state])
	    state2, reward, done, info = env.step(action)
	    # Move Q[state, action] a fraction alpha of the way toward the one-step
	    # target: the immediate reward plus the best value currently estimated
	    # for the next state (no discount factor is applied).
	    Q[state, action] += alpha * (reward + np.max(Q[state2]) - Q[state, action])
	    G += reward
	    state = state2
	  if episode % REPORT_EVERY == 0:
	    print('Episode {} Total Reward: {}'.format(episode, G))
	  history.append(G)
	  # Debug aid: uncomment to watch how many Q entries are non-zero so far.
	  # print(np.count_nonzero(Q))

	plt.plot(history)
	plt.show()