@kurnianggoro
Created March 10, 2017 09:09
'''
Problem description:
https://gym.openai.com/envs/FrozenLake-v0
The agent should move from the starting point (S) to the goal (G) safely.
The environment is a 4x4 grid of tiles:
F means frozen (safe), H means hole (unsafe)
example:
SFFF
FHFH
FFFH
HFFG
reward at every step is 0, except when entering G (1)
moves:
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3
note: the ice is slippery, so the agent can move in an unintended direction;
the probability of slipping is 2/3
'''
import gym
from gym import wrappers
import numpy as np
CREATE_REPORT = 0  # set to 1 to record results for submission to OpenAI Gym
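
# Reward shaping: the environment's own reward is 0 everywhere and 1 only on
# reaching G, which is a very sparse learning signal. The helper below reshapes
# it for the Q-update only (the unshaped reward is still used for scoring):
# a large penalty for ending in a hole, a large bonus for reaching the goal,
# and a small living reward for every other step.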
def modify_reward(reward, done):
    if done and reward == 0:
        return -100.0  # episode ended without reward: the agent fell into a hole
    elif done:
        return 50.0    # episode ended with reward: the agent reached the goal
    else:
        return 1.0     # small living reward for every safe step
#make the environment
env = gym.make('FrozenLake-v0')
#save the log (for submission)
logfile = '../openai_gym/forezenlake'
if CREATE_REPORT:
    env = wrappers.Monitor(env, logfile, force=True)
# for reproducibility
env.seed(0)
np.random.seed(0)
print(env.observation_space.n, env.action_space.n)
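
# (Illustrative note, not part of the original gist.) Assuming the usual gym
# toy-text internals, the slip dynamics described in the docstring can be
# inspected through the environment's transition table:
#   env.unwrapped.P[state][action] -> list of (probability, next_state, reward, done)
# e.g. env.unwrapped.P[0][2] should show the intended move with probability 1/3
# and the two perpendicular moves sharing the remaining 2/3.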
#Initialize table with all zeros
Q = np.zeros([env.observation_space.n, env.action_space.n])
#set parameters
lr = .05  # learning rate (was .85)
y = .99   # discount factor
num_episodes = 5000
#create list to save the total rewards per episode
rList = []
#play episodes
first = 0  # records the index of the first successful episode
for i in range(num_episodes):
    # reset the environment and get a new observation (the first state)
    s = env.reset()
    rAll = 0  # total (unshaped) reward of this episode
    done = False
    j = 0  # current timestep
    # the Q-table learning algorithm
    while not done:  # alternatively: j < 99, to cap the episode at 100 timesteps
        # env.render()
        j += 1  # increase the timestep
        # choose an action by greedily picking from the Q-table;
        # noise is added for exploration and shrinks as the episode index grows
        a = np.argmax(Q[s, :] + np.random.randn(1, env.action_space.n) * (1. / (i + 1)))
        # get the new state and reward from the environment by doing the action
        s_new, r, done, _ = env.step(a)
        r_mod = modify_reward(r, done)
        # update the Q-table with the new knowledge
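        #   Q(s, a) <- Q(s, a) + lr * (r_mod + y * max_a' Q(s_new, a') - Q(s, a))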
        Q[s, a] = Q[s, a] + lr * (r_mod + y * np.max(Q[s_new, :]) - Q[s, a])
        # logging
        rAll += r
        # quit if the agent reaches the goal or falls into a hole
        if done:
            if first == 0 and r != 0.0:
                first = i
                print(i)
            break
        # set the new state for the next iteration
        s = s_new
    # outside the while loop
    rList.append(rAll)
print "Score over time: " + str(sum(rList)/num_episodes)
print "Final Q-Table Values"
print np.argmax(Q,1)
env.close()
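
# FrozenLake-v0 counts as solved when the average reward over 100 consecutive
# episodes reaches 0.78 (the OpenAI criterion referenced below), so compute a
# 100-episode moving average of the per-episode rewards collected above.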
def moving_average(x, n=100):
    x = x.cumsum()
    return (x[n:] - x[:-n]) / n
ma = moving_average(np.asarray(rList))
print "Best 100-episode average reward was %f." % ma.max()
solved = len(np.where(ma >= .78)[0])>0 # criteria set by openai
if solved:
print "Solved after %d episodes." % np.where(ma >= .78)[0][0]
else:
print "unsolved!"
#upload the result
if CREATE_REPORT:
    key = ''
    gym.upload(logfile, api_key=key)