FrozenLake Q-network test (TensorFlow)

A minimal Q-learning agent for OpenAI Gym's FrozenLake-v0, with the Q-table represented as a single weight matrix trained by gradient descent. Written against the TensorFlow 1.x and gym 0.x APIs.
import gym
import numpy as np
import tensorflow as tf

env = gym.make('FrozenLake-v0')
tf.reset_default_graph()

# establish the feed-forward part of the network used to choose actions
inputs1 = tf.placeholder(shape=[1, 16], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([16, 4], 0, .01))
Quot = tf.matmul(inputs1, W)
predict = tf.argmax(Quot, 1)
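# because the input is a one-hot encoding of the state, the matmul simply
# selects one row of W: W acts as a 16x4 Q-table, one row per state and one
# column per action; e.g. state s = 3 enters as np.identity(16)[3:4], so
# Quot equals the fourth row of W and predict returns argmax_a Q(3, a)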

# loss is the sum of squared differences between the target and the predicted Q values
nextQ = tf.placeholder(shape=[1, 4], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Quot))
trainer = tf.train.GradientDescentOptimizer(learning_rate=.1)
update_model = trainer.minimize(loss)
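# each update regresses the output toward the Bellman target
#   target = r + gamma * max_a' Q(s', a')
# placed in nextQ at the slot of the action actually taken; the other
# entries carry the network's own predictions and contribute zero error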

# train the network
init = tf.global_variables_initializer()

# learning parameters
gamma = .99
e = .1
num_episodes = 2000

# create lists to contain total rewards and steps per episode
j_list = []
r_list = []

with tf.Session() as sess:
    sess.run(init)
    for i in range(num_episodes):
        # reset environment and get first observation
        s, r_all, d, j = env.reset(), 0, False, 0
        # step through the episode on the Q-network (at most 99 steps)
        while j < 99:
            j += 1
            # choose an action greedily (with e chance of random action)
            # from the Q-network
            a, allQ = sess.run([predict, Quot], feed_dict={inputs1: np.identity(16)[s:s + 1]})
            if np.random.rand(1) < e:
                a[0] = env.action_space.sample()
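            # with probability e the greedy action is overwritten by a random
            # one (epsilon-greedy exploration); e is annealed after each
            # finished episode below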
            # get new state and reward from environment
            s1, r, d, _ = env.step(a[0])
            # obtain Q' values by feeding the new state through the network
            Q1 = sess.run(Quot, feed_dict={inputs1: np.identity(16)[s1:s1 + 1]})
            # obtain maxQ' and set target value for chosen action
            maxQ1 = np.max(Q1)
            targetQ = allQ
            targetQ[0, a[0]] = r + gamma * maxQ1
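            # targetQ agrees with allQ everywhere except the chosen action, so
            # the squared error (and the gradient) is nonzero only for Q(s, a);
            # allQ is a fresh array from sess.run each step, so the in-place
            # write above is safe despite the aliasing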
            # train network using target and predicted Q values
            _, W1 = sess.run([update_model, W], feed_dict={inputs1: np.identity(16)[s:s + 1],
                                                           nextQ: targetQ})
            r_all += r
            s = s1
            if d:
                # reduce chance of random actions as we train the model:
                # e decays from 0.1 at episode 0 toward 1/50 = 0.02 by episode 2000
                e = 1. / ((i / 50) + 10)
                break
        j_list.append(j)
        r_list.append(r_all)

print('Percent of successful episodes: ' + str(100 * sum(r_list) / num_episodes) + '%')
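
For a quick sanity check of the learned policy, a minimal greedy-rollout sketch such as the one below can be appended inside the `with` block, after the training loop, while `sess` is still open. It is not part of the original gist, and the episode count of 5 is an arbitrary choice.

    # sketch: roll out a few purely greedy episodes (no random actions)
    for _ in range(5):
        s, d, steps, r = env.reset(), False, 0, 0
        while not d and steps < 99:
            a = sess.run(predict, feed_dict={inputs1: np.identity(16)[s:s + 1]})
            s, r, d, _ = env.step(a[0])
            steps += 1
        print('greedy episode: %d steps, reward %.0f' % (steps, r))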