FrozenLake Q-network test (TensorFlow)

A minimal Q-learning agent for OpenAI Gym's FrozenLake-v0, with the Q-table represented as a single weight matrix trained by gradient descent. Written against the TensorFlow 1.x and gym 0.x APIs.
import gym
import numpy as np
import tensorflow as tf

env = gym.make('FrozenLake-v0')
tf.reset_default_graph()

# establish the feed-forward part of the network used to choose actions
inputs1 = tf.placeholder(shape=[1, 16], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([16, 4], 0, .01))
Quot = tf.matmul(inputs1, W)
predict = tf.argmax(Quot, 1)
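# because the input is a one-hot encoding of the state, the matmul simply
# selects one row of W: W acts as a 16x4 Q-table, one row per state and one
# column per action; e.g. state s = 3 enters as np.identity(16)[3:4], so
# Quot equals the fourth row of W and predict returns argmax_a Q(3, a)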

# loss is the sum of squared differences between the target and the predicted Q values
nextQ = tf.placeholder(shape=[1, 4], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Quot))
trainer = tf.train.GradientDescentOptimizer(learning_rate=.1)
update_model = trainer.minimize(loss)
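# each update regresses the output toward the Bellman target
#   target = r + gamma * max_a' Q(s', a')
# placed in nextQ at the slot of the action actually taken; the other
# entries carry the network's own predictions and contribute zero error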

# train the network
init = tf.global_variables_initializer()

# learning parameters
gamma = .99
e = .1
num_episodes = 2000

# create lists to contain total rewards and steps per episode
j_list = []
r_list = []

with tf.Session() as sess:
    sess.run(init)
    for i in range(num_episodes):
        # reset environment and get first observation
        s, r_all, d, j = env.reset(), 0, False, 0
        # step through the episode on the Q-network (at most 99 steps)
        while j < 99:
            j += 1
            # choose an action greedily (with e chance of random action)
            # from the Q-network
            a, allQ = sess.run([predict, Quot], feed_dict={inputs1: np.identity(16)[s:s + 1]})
            if np.random.rand(1) < e:
                a[0] = env.action_space.sample()
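            # with probability e the greedy action is overwritten by a random
            # one (epsilon-greedy exploration); e is annealed after each
            # finished episode below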
            # get new state and reward from environment
            s1, r, d, _ = env.step(a[0])
            # obtain Q' values by feeding the new state through the network
            Q1 = sess.run(Quot, feed_dict={inputs1: np.identity(16)[s1:s1 + 1]})
            # obtain maxQ' and set target value for chosen action
            maxQ1 = np.max(Q1)
            targetQ = allQ
            targetQ[0, a[0]] = r + gamma * maxQ1
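            # targetQ agrees with allQ everywhere except the chosen action, so
            # the squared error (and the gradient) is nonzero only for Q(s, a);
            # allQ is a fresh array from sess.run each step, so the in-place
            # write above is safe despite the aliasing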
            # train network using target and predicted Q values
            _, W1 = sess.run([update_model, W], feed_dict={inputs1: np.identity(16)[s:s + 1],
                                                           nextQ: targetQ})
            r_all += r
            s = s1
            if d:
                # reduce chance of random actions as we train the model:
                # e decays from 0.1 at episode 0 toward 1/50 = 0.02 by episode 2000
                e = 1. / ((i / 50) + 10)
                break
        j_list.append(j)
        r_list.append(r_all)

print('Percent of successful episodes: ' + str(100 * sum(r_list) / num_episodes) + '%')
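
For a quick sanity check of the learned policy, a minimal greedy-rollout sketch such as the one below can be appended inside the `with` block, after the training loop, while `sess` is still open. It is not part of the original gist, and the episode count of 5 is an arbitrary choice.

    # sketch: roll out a few purely greedy episodes (no random actions)
    for _ in range(5):
        s, d, steps, r = env.reset(), False, 0, 0
        while not d and steps < 99:
            a = sess.run(predict, feed_dict={inputs1: np.identity(16)[s:s + 1]})
            s, r, d, _ = env.step(a[0])
            steps += 1
        print('greedy episode: %d steps, reward %.0f' % (steps, r))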