-
-
Save pwasiewi/25f3c160dba13ce62ae3c7eae88d633d to your computer and use it in GitHub Desktop.
FrozenLake-v0 solution using tabular Q-learning with no discounting (discount factor = 1.0).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib | |
import numpy as np | |
import sys | |
import os | |
from collections import defaultdict | |
import gym | |
# Make the enclosing "gym" checkout (and its "practice" sub-directory)
# importable, so that project-local packages can be found when this script is
# launched from anywhere inside that tree.
cwd = os.getcwd()
_gym_idx = cwd.find("gym")
# str.find returns -1 when "gym" does not occur in the working directory.
# Slicing with that value would produce a meaningless two-character prefix,
# so in that case no path is derived and sys.path is left untouched.
gymPath = cwd[:_gym_idx + len("gym")] if _gym_idx != -1 else None
if gymPath is not None:
    for _extra_path in (gymPath, os.path.join(gymPath, 'practice')):
        if _extra_path not in sys.path:
            sys.path.append(_extra_path)
from lib import plotting | |
from gym import wrappers | |
def make_epsilon_greedy_policy(Q, epsilon, nA):
    """
    Creates an epsilon-greedy policy based on a given Q-function and epsilon.

    Args:
        Q: A mapping from state -> action-values.
            Each value is a numpy array of length nA (see below).
        epsilon: The probability of selecting a random action; a float
            between 0 and 1.
        nA: Number of actions in the environment.

    Returns:
        A function that takes the observation as an argument and returns
        the probabilities for each action in the form of a numpy array of
        length nA.
    """
    def policy_fn(observation):
        # Every action receives an equal share of the exploration mass.
        probs = np.ones(nA, dtype=float) * epsilon / nA
        # The greedy action (first maximum on ties, per np.argmax) receives
        # the remaining probability mass on top of its exploration share.
        best_action = np.argmax(Q[observation])
        probs[best_action] += (1.0 - epsilon)
        return probs

    return policy_fn
def q_learning(env, num_episodes, discount_factor=1.0):
    """
    Q-Learning algorithm: off-policy TD control. Learns action values for the
    greedy policy while following an epsilon-greedy behaviour policy.

    The environment reward is replaced by a shaped reward for the update:
    +1 when an episode ends with a positive reward, -1 when it ends without
    one, and 0 on every non-final step. The statistics returned still record
    the environment's own rewards.

    Args:
        env: OpenAI Gym environment. Its `reset()` must return the initial
            state and `step(a)` must return a 4-tuple
            (next_state, reward, done, info).
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma, the time discount factor applied to the
            bootstrapped value of the next state.

    Returns:
        A tuple (Q, stats).
        Q is the learned action-value function, a defaultdict mapping
        state -> numpy array of action values.
        stats is an EpisodeStats object with two numpy arrays for
        episode_lengths and episode_rewards.
    """
    nA = env.action_space.n

    # The action-value function: state -> array of per-action values.
    Q = defaultdict(lambda: np.zeros(nA))
    # Number of times each state-action pair has been sampled.
    cnt = defaultdict(lambda: np.zeros(nA))

    # Keeps track of useful statistics.
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        # Chance that the action will be a random one; decays with episodes.
        epsilon = 1. / (10 + i_episode)
        # The behaviour policy for this episode. It reads Q lazily, so it
        # tracks the updates made during the episode.
        policy = make_epsilon_greedy_policy(Q, epsilon, nA)

        # Print out which episode we're on, useful for debugging.
        if (i_episode + 1) % 500 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes))
            sys.stdout.flush()

        t = 0
        total_reward = 0.
        s = env.reset()
        while True:
            a = np.random.choice(np.arange(nA), p=policy(s))
            cnt[s][a] += 1.0
            # TD learning rate: 1 / visit count of this state-action pair.
            alpha = 1. / cnt[s][a]
            sPrime, r, d, _ = env.step(a)

            # Shaped reward: only the final transition carries a signal.
            if d:
                rr = 1 if r > 0. else -1
            else:
                rr = 0

            # NOTE(review): the target bootstraps from Q[sPrime] even when
            # `d` is set. States that are never acted from keep an all-zero
            # row, so this adds nothing for them -- confirm that is also the
            # intent if a wrapper can end an episode in a non-terminal state.
            td_target = rr + (discount_factor * np.max(Q[sPrime]))
            Q[s][a] = Q[s][a] + alpha * (td_target - Q[s][a])

            total_reward += r
            t += 1
            s = sPrime
            if d:
                break

        stats.episode_lengths[i_episode] = t
        stats.episode_rewards[i_episode] = total_reward

    return Q, stats
# Train a tabular Q-learning agent on FrozenLake and report how well it did.
env = gym.make('FrozenLake-v0')
# NOTE(review): the monitor output directory is an absolute, machine-specific
# path; adjust it before running on another machine.
env = wrappers.Monitor(env, '/home/rabi/Home_Drive/Research_Project/RL/gym/practice/Toy_Text_Gym/fl-ex1', force=True)
num_epi = 2000
Q, stats = q_learning(env, num_epi)
# Close the monitored environment so its recorded results are flushed to disk
# before the (blocking) plots are shown.
env.close()

best_hundred = -100000.0
# Get an idea about the avg of the best hundred episodes before submitting on
# gym. The "+ 1" includes the final window that ends on the last episode; the
# max() keeps at least one (possibly shorter) window when fewer than 100
# episodes were run.
for i in range(max(num_epi - 100, 0) + 1):
    best_hundred = max(best_hundred, np.sum(stats.episode_rewards[i:i + 100]))
print("Avg. reward for best 100 eps: {}".format(best_hundred / 100.))
plotting.plot_episode_stats(stats)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment