Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
from __future__ import division
import gym
import numpy as np
import tflearn
from tflearn.data_utils import to_categorical
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn import lstm, embedding
from tflearn.layers.estimator import regression
import random
from collections import deque
# Size of the network output; also the exclusive upper bound when sampling a
# random action and the length of each training target vector.
NUM_ACTIONS = 2
# Size of the network input (passed to create_model as n_inputs).
NUM_STATES = 4
# Maximum number of transitions kept in the replay buffer; also used as the
# batch size for each model.fit call.
MAX_REPLAY_STATES = 100
# Number of games (outer-loop iterations) played during training.
NUM_GAMES_TRAIN = 200
def create_model(n_inputs, n_outputs):
    """Build the fully connected network used to estimate Q-values.

    The network has three hidden ReLU layers (128, 256 and 128 units) and a
    linear output layer trained with mean-square error using Adam.

    The output must be linear with a regression loss because the training
    targets built by the caller are Q-value estimates of the form
    ``reward + gamma * max(Q(next_state))``. These are unbounded and do not
    sum to one, so a softmax output with categorical cross-entropy cannot
    represent them.

    Args:
        n_inputs: Number of features in one observation.
        n_outputs: Number of outputs (one Q-value per action).

    Returns:
        A ``tflearn.DNN`` wrapping the network, logging to the ``logs``
        TensorBoard directory.
    """
    network = input_data(shape=[None, n_inputs])
    network = fully_connected(network, 128, activation='relu')
    network = fully_connected(network, 256, activation='relu')
    network = fully_connected(network, 128, activation='relu')
    # One unbounded value per action: this is a regression head, not a
    # probability distribution over actions.
    network = fully_connected(network, n_outputs, activation='linear')
    network = regression(
        network,
        optimizer='adam',
        loss='mean_square',
    )
    model = tflearn.DNN(
        network,
        max_checkpoints=0,
        tensorboard_verbose=0,
        tensorboard_dir='logs',
    )
    return model
# ---------------------------------------------------------------------------
# Training script: epsilon-greedy Q-learning on CartPole with a small replay
# buffer. After every environment step the network is refit for one epoch on
# the whole buffer.
#
# NOTE(review): the original indentation of this script was lost; the block
# nesting below (training on every step, epsilon decay once per game) is the
# reconstruction that matches the statement order -- confirm against the
# author's intent.
# ---------------------------------------------------------------------------
env = gym.make('CartPole-v0')
model = create_model(NUM_STATES, NUM_ACTIONS)

# Bounded replay memory. With maxlen set, appending to a full deque discards
# the oldest entry, which replaces the manual "popleft when too long" check.
replay = deque(maxlen=MAX_REPLAY_STATES)

gamma = 0.9   # discount applied to the best predicted next-state value
epsilon = 1   # probability of taking a random action

for number_game in range(NUM_GAMES_TRAIN):
    observation = env.reset()
    reward_game = 0
    print('[+] Game ' + str(number_game))

    while True:
        env.render()

        # Epsilon-greedy action selection on the *current* observation.
        if random.random() < epsilon:
            action = np.random.randint(0, NUM_ACTIONS)
        else:
            action = np.argmax(model.predict([observation])[0])

        new_state, reward, done, info = env.step(action)
        reward_game += reward

        # Tuple layout: (next_state, reward, action, done, previous_state).
        replay.append((new_state, reward, action, done, observation))

        # Advance the agent's view of the environment. Without this the
        # policy keeps acting on the initial observation and every stored
        # transition records a stale previous state.
        observation = new_state

        # Predict Q-values for all stored states in two batched calls
        # instead of two single-sample calls per transition.
        old_states = [transition[4] for transition in replay]
        next_states = [transition[0] for transition in replay]
        old_qs = model.predict(old_states)
        next_qs = model.predict(next_states)

        X_train = old_states
        Y_train = []
        # Loop variables are deliberately prefixed with rep_ so they do not
        # overwrite the live step variables (reward, action, done, ...).
        for transition, old_q, next_q in zip(replay, old_qs, next_qs):
            _, rep_reward, rep_action, rep_done, _ = transition
            # Start from the current prediction so only the taken action's
            # value contributes to the loss.
            update_target = np.array(old_q, dtype=float)
            if rep_done:
                # Terminal transition: there is no future value to bootstrap.
                update_target[rep_action] = rep_reward
            else:
                update_target[rep_action] = rep_reward + gamma * np.max(next_q)
            Y_train.append(update_target)

        model.fit(
            X_train, Y_train,
            validation_set=0,
            n_epoch=1,
            batch_size=MAX_REPLAY_STATES,
            shuffle=True,
            show_metric=False,
            snapshot_step=200,
            snapshot_epoch=False,
            run_id='carpole_rl'
        )

        if done or reward_game > 200:
            break

    print("[+] Game " + str(number_game) + " Reward " + str(reward_game))

    # Linearly reduce exploration by 0.001 per game, never below ~0.1.
    if epsilon > 0.1:
        epsilon -= 0.001

# NOTE(review): the monitor is never started in this script, and newer gym
# releases no longer expose env.monitor -- confirm against the installed gym.
env.monitor.close()
model.save('model.tfl')
#gym.upload(
# '/tmp/cartpole-experiment-1',
# writeup = 'https://gist.github.com/gdb/',
# api_key = ''
#)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment