FrozenLake Q-network test (TensorFlow)
import gym
import numpy as np
import tensorflow as tf
env = gym.make('FrozenLake-v0')
tf.reset_default_graph()
# establish the feed-forward part of the network used to choose actions
inputs1 = tf.placeholder(shape=[1, 16], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([16, 4], 0, .01))
Quot = tf.matmul(inputs1, W)
predict = tf.argmax(Quot, 1)
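# with a one-hot state vector as input, tf.matmul(inputs1, W) simply selects row s of W,
# so this single-layer network behaves like a learned 16x4 Q-table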
# loss: sum of squared differences between the target and predicted Q values
nextQ = tf.placeholder(shape=[1, 4], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Quot))
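# the target fed into nextQ matches the network output except at the chosen action,
# so each gradient step nudges a single Q value toward the Bellman target r + gamma * max Q(s')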
trainer = tf.train.GradientDescentOptimizer(learning_rate=.1)
update_model = trainer.minimize(loss)
# train the network
init = tf.global_variables_initializer()
# learning parameters
gamma = .99
e = .1
num_episodes = 2000
# create lists to contain total rewards and steps per episode
j_list = []
r_list = []
with tf.Session() as sess:
    sess.run(init)
    for i in range(num_episodes):
        # reset environment and get first observation
        s, r_all, d, j = env.reset(), 0, False, 0
        # the Q-network
        while j < 99:
            j += 1
            # choose an action greedily (with e chance of random action)
            # from the Q-network
            a, allQ = sess.run([predict, Quot], feed_dict={inputs1: np.identity(16)[s:s + 1]})
            if np.random.rand(1) < e:
                a[0] = env.action_space.sample()
            # get new state and reward from environment
            s1, r, d, _ = env.step(a[0])
            # obtain Q' values by feeding the new state through the network
            Q1 = sess.run(Quot, feed_dict={inputs1: np.identity(16)[s1:s1 + 1]})
            # obtain maxQ' and set target value for chosen action
            maxQ1 = np.max(Q1)
            targetQ = allQ
            targetQ[0, a[0]] = r + gamma * maxQ1
            # train network using target and predicted Q values
            _, W1 = sess.run([update_model, W], feed_dict={inputs1: np.identity(16)[s:s + 1],
                                                           nextQ: targetQ})
            r_all += r
            s = s1
            if d:
                # reduce chance of random actions as we train the model
                e = 1. / ((i / 50) + 10)
                break
        j_list.append(j)
        r_list.append(r_all)
print('Percent of successful episodes: ' + str(100 * sum(r_list) / num_episodes) + '%')
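This script targets the TensorFlow 1.x graph API (tf.placeholder, tf.Session) and the pre-0.26 gym interface, where env.reset() returns only the observation and env.step() returns (obs, reward, done, info). A minimal sketch of how the same graph code can still be run on a TensorFlow 2.x install through the v1 compat layer follows; the 'FrozenLake-v1' environment id is an assumption for newer gym releases that dropped 'FrozenLake-v0'.

# minimal sketch, assuming TensorFlow 2.x with the v1 compat layer and gym < 0.26
import gym
import numpy as np
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()         # restore graph mode, placeholders, and sessions
env = gym.make('FrozenLake-v1')  # assumed id; 'FrozenLake-v0' is unavailable in newer gym
tf.reset_default_graph()
# ... remainder of the script above is unchanged ...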