# pwasiewi/frozenLake_v0_using_qLearning.py

## frozenLake_v0_using_qLearning.py
import matplotlib
import numpy as np
import sys
import os
from collections import defaultdict
import gym

# Derive the root of the enclosing "gym" directory from the current working
# directory and expose it (plus its "practice" sub-directory) on sys.path,
# presumably so that the project-local ``lib`` package can be imported.
cwd = os.getcwd()
_gym_idx = cwd.find("gym")
# str.find returns -1 when "gym" does not occur in the path; slicing with
# that value would yield the meaningless prefix cwd[:2].  Fall back to the
# working directory itself in that case.
gymPath = cwd[:_gym_idx + len("gym")] if _gym_idx != -1 else cwd
for _extra_path in (gymPath, gymPath + '/practice'):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

from lib import plotting


from gym import wrappers


def make_epsilon_greedy_policy(Q, epsilon, nA):
    """
    Build an epsilon-greedy action selector on top of a Q-table.

    Args:
        Q: Mapping from state to action-values; each value is a numpy
            array of length nA.  It is read on every call, so later
            updates to Q are reflected by the returned policy.
        epsilon: Probability mass spread uniformly over all actions
            (float between 0 and 1).
        nA: Number of actions in the environment.

    Returns:
        A function that maps an observation to a numpy array of length nA
        holding the probability of selecting each action.
    """
    def policy_fn(observation):
        # Every action receives an equal share of the exploration mass ...
        action_probs = np.full(nA, epsilon, dtype=float) / nA
        # ... and the greedy action (first maximiser of Q) gets the rest.
        greedy = np.argmax(Q[observation])
        action_probs[greedy] += 1.0 - epsilon
        return action_probs
    return policy_fn

def q_learning(env, num_episodes, discount_factor=1.0):
    """
    Tabular Q-Learning (off-policy TD control).

    Acts epsilon-greedily, with epsilon set to 1 / (10 + episode index),
    while bootstrapping every update from the greedy value of the next
    state.  The step size for a state-action pair is 1 / (number of times
    that pair has been taken so far).

    The update does not use the raw reward.  It uses a shaped one: +1 when
    the episode ends with a positive reward, -1 when it ends otherwise and
    0 on every non-terminal step.  The returned statistics record the raw
    environment rewards.

    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for.
        discount_factor: Discount applied to the bootstrapped next-state value.

    Returns:
        A tuple (Q, stats).
        Q maps state -> numpy array of learned action values.
        stats is an EpisodeStats object with two numpy arrays for
        episode_lengths and episode_rewards.
    """

    def fresh_row():
        # One zero per action, created lazily the first time a state is seen.
        return np.zeros(env.action_space.n)

    # Learned action values: state -> array indexed by action.
    Q = defaultdict(fresh_row)

    # How often each state-action pair was taken; drives the step size.
    visits = defaultdict(fresh_row)

    # Per-episode bookkeeping handed back to the caller.
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        episode_no = i_episode + 1
        # Behaviour policy for this episode; exploration shrinks over time.
        behaviour = make_epsilon_greedy_policy(
            Q, 1. / (10 + i_episode), env.action_space.n)

        # Progress report every 500 episodes.
        if episode_no % 500 == 0:
            print("\rEpisode {}/{}.".format(episode_no, num_episodes))
            sys.stdout.flush()

        steps = 0
        raw_return = 0.
        state = env.reset()
        done = False
        while not done:
            candidates = np.arange(env.action_space.n)
            action = np.random.choice(candidates, p=behaviour(state))

            # Count the visit first so the very first update uses step size 1.
            visits[state][action] += 1.0
            step_size = 1. / visits[state][action]

            next_state, reward, done, _ = env.step(action)

            # Shaped reward used only inside the update.
            if not done:
                shaped = 0
            elif reward > 0.:
                shaped = 1
            else:
                shaped = -1

            td_target = shaped + (discount_factor * np.max(Q[next_state]))
            Q[state][action] = Q[state][action] + step_size * (td_target - Q[state][action])

            raw_return += reward
            steps += 1
            state = next_state

        stats.episode_lengths[i_episode] = steps
        stats.episode_rewards[i_episode] = raw_return

    return Q, stats

# Script driver: train a Q-learning agent on FrozenLake, report the best
# 100-episode average reward and plot the per-episode statistics.
env = gym.make('FrozenLake-v0')
# Wrap the environment so episodes are recorded into the given directory.
# NOTE(review): force=True presumably clears earlier recordings there - confirm.
env = wrappers.Monitor(env, '/home/rabi/Home_Drive/Research_Project/RL/gym/practice/Toy_Text_Gym/fl-ex1', force = True)
num_epi = 2000
Q, stats = q_learning(env, num_epi)
# Training is finished; close the wrapped environment so it can release its
# resources before the (potentially blocking) plotting call below.
env.close()

best_hundred = -100000.0
# Get an idea about the avg of the best hundred episodes before submitting on gym.
# The "+ 1" includes the final window that starts at num_epi - 100; without it
# the last episode would never be part of any window.
for i in range(num_epi - 100 + 1):
    best_hundred = max(best_hundred, np.sum(stats.episode_rewards[i : i + 100]))
# str.format keeps the output text unchanged while being valid on both
# Python 2 and Python 3 (the print statement form is Python 2 only).
print("Avg. reward for best 100 eps: {}".format(best_hundred / 100.))

plotting.plot_episode_stats(stats)
	import matplotlib
	import numpy as np
	import sys
	import os
	from collections import defaultdict
	import gym

	# Derive the root of the enclosing "gym" directory from the current working
	# directory and expose it (plus its "practice" sub-directory) on sys.path,
	# presumably so that the project-local ``lib`` package can be imported.
	cwd = os.getcwd()
	_gym_idx = cwd.find("gym")
	# str.find returns -1 when "gym" does not occur in the path; slicing with
	# that value would yield the meaningless prefix cwd[:2].  Fall back to the
	# working directory itself in that case.
	gymPath = cwd[:_gym_idx + len("gym")] if _gym_idx != -1 else cwd
	for _extra_path in (gymPath, gymPath + '/practice'):
	    if _extra_path not in sys.path:
	        sys.path.append(_extra_path)

	from lib import plotting


	from gym import wrappers


	def make_epsilon_greedy_policy(Q, epsilon, nA):
	    """
	    Creates an epsilon-greedy policy based on a given Q-function and epsilon.

	    Args:
	        Q: A dictionary that maps from state -> action-values.
	            Each value is a numpy array of length nA (see below).
	        epsilon: The probability to select a random action, a float
	            between 0 and 1.
	        nA: Number of actions in the environment.

	    Returns:
	        A function that takes the observation as an argument and returns
	        the probabilities for each action in the form of a numpy array
	        of length nA.
	    """
	    def policy_fn(observation):
	        # Spread epsilon uniformly over all actions.
	        A = np.ones(nA, dtype=float) * epsilon / nA
	        # The greedy action (first maximiser of Q) gets the remaining mass.
	        best_action = np.argmax(Q[observation])
	        A[best_action] += (1.0 - epsilon)
	        return A
	    return policy_fn

	def q_learning(env, num_episodes, discount_factor=1.0):
	    """
	    Q-Learning algorithm: Off-policy TD control. Learns action values for
	    the greedy policy while following an epsilon-greedy policy.

	    Epsilon is 1 / (10 + episode index) and the step size for a
	    state-action pair is 1 / (number of times that pair has been taken).
	    The update uses a shaped reward (+1 when the episode ends with a
	    positive reward, -1 when it ends otherwise, 0 on non-terminal steps);
	    the returned statistics record the raw environment rewards.

	    Args:
	        env: OpenAI environment.
	        num_episodes: Number of episodes to run for.
	        discount_factor: Discount applied to the next-state value.

	    Returns:
	        A tuple (Q, stats).
	        Q is a dictionary mapping state -> numpy array of action values.
	        stats is an EpisodeStats object with two numpy arrays for
	        episode_lengths and episode_rewards.
	    """

	    # The learned action-value function: state -> array indexed by action.
	    Q = defaultdict(lambda: np.zeros(env.action_space.n))

	    # Number of times each state-action pair was sampled.
	    cnt = defaultdict(lambda: np.zeros(env.action_space.n))

	    # Keeps track of useful statistics.
	    stats = plotting.EpisodeStats(
	        episode_lengths=np.zeros(num_episodes),
	        episode_rewards=np.zeros(num_episodes))

	    for i_episode in range(num_episodes):
	        # Chance that the action will be a random one.
	        epsilon = 1. / (10 + i_episode)
	        # The policy we're following.
	        policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)
	        # Print out which episode we're on, useful for debugging.
	        if (i_episode + 1) % 500 == 0:
	            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes))
	            sys.stdout.flush()

	        t = 0
	        total_reward = 0.
	        s = env.reset()
	        while True:
	            a = np.random.choice(np.arange(env.action_space.n), p = policy(s))

	            # Count the visit first so the first update uses step size 1.
	            cnt[s][a] += 1.0
	            # TD learning rate.
	            alpha = 1. / cnt[s][a]

	            sPrime, r, d, _ = env.step(a)
	            # Shaped reward used only inside the update.
	            if d:
	                rr = 1 if r > 0. else -1
	            else:
	                rr = 0

	            Q[s][a] = Q[s][a] + alpha * (rr + (discount_factor * np.max(Q[sPrime])) - Q[s][a])

	            total_reward += r
	            t += 1
	            s = sPrime

	            if d:
	                break

	        stats.episode_lengths[i_episode] = t
	        stats.episode_rewards[i_episode] = total_reward

	    return Q, stats

	# Script driver: train a Q-learning agent on FrozenLake, report the best
	# 100-episode average reward and plot the per-episode statistics.
	env = gym.make('FrozenLake-v0')
	# Wrap the environment so episodes are recorded into the given directory.
	# NOTE(review): force=True presumably clears earlier recordings there - confirm.
	env = wrappers.Monitor(env, '/home/rabi/Home_Drive/Research_Project/RL/gym/practice/Toy_Text_Gym/fl-ex1', force = True)
	num_epi = 2000
	Q, stats = q_learning(env, num_epi)
	# Training is finished; close the wrapped environment so it can release
	# its resources before the (potentially blocking) plotting call below.
	env.close()

	best_hundred = -100000.0
	# Get an idea about the avg of the best hundred episodes before submitting on gym.
	# The "+ 1" includes the final window that starts at num_epi - 100.
	for i in range(num_epi - 100 + 1):
	    best_hundred = max(best_hundred, np.sum(stats.episode_rewards[i : i + 100]))
	# str.format keeps the output text unchanged while being valid on both
	# Python 2 and Python 3 (the print statement form is Python 2 only).
	print("Avg. reward for best 100 eps: {}".format(best_hundred / 100.))

	plotting.plot_episode_stats(stats)