@DataWraith · Created December 11, 2016
SARSA implementation for the OpenAI gym Frozen Lake environment
import gym
import numpy as np

# This is a straightforward implementation of SARSA for the FrozenLake OpenAI
# Gym testbed. I wrote it mostly to make myself familiar with the OpenAI gym;
# the SARSA algorithm was implemented pretty much from the Wikipedia page alone.
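
# For reference, the SARSA update rule that the training loop below applies:
#
#   Q(s, a) <- Q(s, a) + alpha * (r + gamma * Q(s', a') - Q(s, a))
#
# where (s, a) is the previous state-action pair, r the reward received, and
# (s', a') the state and action that follow.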
env = gym.make("FrozenLake-v0")

def choose_action(observation):
    # Greedy action selection. Exploration comes from the optimistic
    # initialization of the Q-table (all values start at 1) and from the
    # stochastic "slippery" dynamics of the environment.
    return np.argmax(q_table[observation])

alpha = 0.4    # learning rate
gamma = 0.999  # discount factor

# One Q-value per (state, action) pair: 16 states on the 4x4 lake, 4 actions.
q_table = {s: [1.0, 1.0, 1.0, 1.0] for s in range(16)}

# Rewards of the most recent 100 episodes, used as a running score.
score = []

for i in range(10000):
    observation = env.reset()
    action = choose_action(observation)

    prev_observation = None
    prev_action = None

    for t in range(2500):
        observation, reward, done, info = env.step(action)
        action = choose_action(observation)

        if prev_observation is not None:
            q_old = q_table[prev_observation][prev_action]

            if done:
                # Terminal state: there is no successor Q-value to bootstrap from.
                q_new = q_old + alpha * (reward - q_old)
            else:
                q_new = q_old + alpha * (reward + gamma * q_table[observation][action] - q_old)

            q_table[prev_observation][prev_action] = q_new

        prev_observation = observation
        prev_action = action

        if done:
            # Keep a circular buffer of the last 100 episode rewards.
            if len(score) < 100:
                score.append(reward)
            else:
                score[i % 100] = reward

            print("Episode {} finished after {} timesteps with r={}. Running score: {}".format(i, t, reward, np.mean(score)))
            break
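
To see how well the learned greedy policy actually performs, a minimal evaluation sketch could look like the following. It is not part of the original gist; it assumes the training loop above has finished and reuses its `env`, `q_table`, and `choose_action`. FrozenLake yields a reward of 1.0 only on reaching the goal, so summing each episode's final reward counts successful episodes.

# Play 1000 episodes greedily and report the fraction that reach the goal.
successes = 0.0
for _ in range(1000):
    observation = env.reset()
    done = False
    while not done:
        observation, reward, done, info = env.step(choose_action(observation))
    successes += reward

print("Greedy success rate: {:.2%}".format(successes / 1000.0))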