
@kfaRabi
Created August 18, 2017 10:31
FrozenLake-v0 solution using Q-Learning without any discount.
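The script learns a tabular Q-function with an exploration rate that decays as 1 / (10 + episode) and a per-pair learning rate of 1 / N(s, a); the terminal reward is reshaped to +1 when the episode ends with a positive reward and -1 otherwise, while the discount factor is kept at 1.0.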
import matplotlib
import numpy as np
import sys
import os
from collections import defaultdict
import gym
cwd = os.getcwd()
gymPath = cwd[:cwd.find("gym") + len("gym")]
# print gymPath
if gymPath not in sys.path:
    sys.path.append(gymPath)
if gymPath + '/practice' not in sys.path:
    sys.path.append(gymPath + '/practice')
from lib import plotting
from gym import wrappers
def make_epsilon_greedy_policy(Q, epsilon, nA):
    """
    Creates an epsilon-greedy policy based on a given Q-function and epsilon.

    Args:
        Q: A dictionary that maps state -> action-values.
            Each value is a numpy array of length nA (see below).
        epsilon: The probability of selecting a random action. Float between 0 and 1.
        nA: Number of actions in the environment.

    Returns:
        A function that takes an observation as an argument and returns
        the probabilities for each action in the form of a numpy array of length nA.
    """
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn
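# Illustrative sketch: with nA = 4, epsilon = 0.2, and Q[s] = [0.1, 0.9, 0.3, 0.0],
# policy_fn(s) returns [0.05, 0.85, 0.05, 0.05] -- every action keeps
# epsilon / nA = 0.05 of the probability mass and the greedy action (index 1)
# receives the remaining 1 - epsilon = 0.8 on top of its share.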
def q_learning(env, num_episodes, discount_factor=1.0):
    """
    Q-Learning algorithm: Off-policy TD control. Finds the optimal greedy policy
    while following an epsilon-greedy policy.

    Args:
        env: OpenAI Gym environment.
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.

    Returns:
        A tuple (Q, stats).
        Q is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    # Number of times each state-action pair has been sampled.
    cnt = defaultdict(lambda: np.zeros(env.action_space.n))
    # Keeps track of useful statistics.
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))
    # grndttl = 0.
    for i_episode in range(num_episodes):
        # Probability that the chosen action will be a random (exploratory) one.
        epsilon = 1. / (10 + i_episode)
        # The policy we're following.
        policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)
        # Print out which episode we're on; useful for debugging.
        if (i_episode + 1) % 500 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes))
            sys.stdout.flush()
        # print epsilon
        t = 0
        total_reward = 0.
        s = env.reset()
        # env.render()
        while True:
            a = np.random.choice(np.arange(env.action_space.n), p=policy(s))
            cnt[s][a] += 1.0
            # TD learning rate: 1 / N(s, a), a step size that decays with each visit.
            alpha = 1. / cnt[s][a]
            sPrime, r, d, _ = env.step(a)
            # env.render()
            # Reshape the terminal reward: +1 if the episode ended with a positive
            # reward (reached the goal), -1 otherwise; intermediate steps get 0.
            rr = 0.
            if d:
                rr = 1 if r > 0. else -1
            else:
                rr = 0
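            # Tabular Q-learning update:
            #   Q(s, a) <- Q(s, a) + alpha * (rr + discount_factor * max_a' Q(s', a') - Q(s, a))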
            Q[s][a] = Q[s][a] + alpha * (rr + (discount_factor * np.max(Q[sPrime])) - Q[s][a])
            total_reward += r
            t += 1
            s = sPrime
            if d:
                break
        stats.episode_lengths[i_episode] = t
        stats.episode_rewards[i_episode] = total_reward
        # grndttl += total_reward
    return Q, stats
env = gym.make('FrozenLake-v0')
env = wrappers.Monitor(env, '/home/rabi/Home_Drive/Research_Project/RL/gym/practice/Toy_Text_Gym/fl-ex1', force = True)
num_epi = 2000
Q, stats = q_learning(env, num_epi)
best_hundred = -100000.0
# Get an idea of the average reward over the best hundred consecutive episodes before submitting to Gym.
for i in range(num_epi - 100):
    best_hundred = max(best_hundred, np.sum(stats.episode_rewards[i : i + 100]))
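# FrozenLake-v0 is commonly considered solved at an average reward of 0.78 over
# 100 consecutive episodes, so this number gives a rough pre-submission check.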
print "Avg. reward for best 100 eps:", best_hundred / 100.
plotting.plot_episode_stats(stats)