@GuilhermeGSousa
Created June 19, 2018 20:13
Implementation of a DQN to solve the CartPole environment
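The agent below learns by regressing its Q-network onto the bootstrapped Q-learning target for each stored transition. As an illustrative sketch (not part of the original gist; the helper name td_target is only an example), the target for a single transition could be computed as:

import numpy as np

def td_target(reward, done, next_q_values, gamma=0.99):
    # Bellman target: the reward alone on terminal steps,
    # otherwise reward + gamma * max_a' Q(s', a')
    return reward if done else reward + gamma * float(np.max(next_q_values))

The replay method of the DQN class computes exactly this quantity per transition before fitting the network.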
import time
import random
import gym
import math
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
class DQN():
    def __init__(self, env, max_env_steps=None, gamma=0.99,
                 epsilon=1.0, epsilon_min=0.01, epsilon_log_decay=0.8,
                 alpha=0.05, alpha_decay=0.01, batch_size=128, quiet=False):
        self.env = env
        self.memory = []
        self.gamma = gamma                      # discount factor
        self.epsilon = epsilon                  # initial exploration rate
        self.epsilon_min = epsilon_min          # floor on exploration rate
        self.epsilon_decay = epsilon_log_decay  # log-decay rate for epsilon
        self.alpha = alpha                      # learning rate
        self.alpha_decay = alpha_decay          # learning-rate decay
        self.batch_size = batch_size
        self.quiet = quiet

        # Q-network: 4 observations in, one Q-value per action out
        self.model = Sequential()
        self.model.add(Dense(24, input_dim=4, activation='tanh'))
        self.model.add(Dense(48, activation='tanh'))
        self.model.add(Dense(2, activation='linear'))
        self.model.compile(loss='mse', optimizer=Adam(lr=self.alpha, decay=self.alpha_decay))

    def remember(self, state, action, reward, next_state, done):
        # Store a transition for later replay
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, step=None):
        if step is not None:
            # Epsilon-greedy action selection with a log-decayed epsilon
            epsilon = max(self.epsilon_min,
                          min(self.epsilon, 1.0 - math.log10((step + 1) * self.epsilon_decay)))
            return self.env.action_space.sample() if (np.random.random() <= epsilon) else np.argmax(self.model.predict(state))
        else:
            # Greedy action, used once training is done
            return np.argmax(self.model.predict(state))

    def replay(self):
        # Fit the network on the stored transitions, then clear the memory
        x_batch, y_batch = [], []
        np.random.shuffle(self.memory)
        batches = []
        for i in range(0, len(self.memory), self.batch_size):
            batches.append(self.memory[i:i + self.batch_size])
        for b in batches:
            for state, action, reward, next_state, done in b:
                # Q-learning target: r if terminal, else r + gamma * max_a' Q(s', a')
                y_target = self.model.predict(state)
                y_target[0][action] = reward if done else reward + self.gamma * np.max(self.model.predict(next_state)[0])
                x_batch.append(state[0])
                y_batch.append(y_target[0])
        self.model.fit(np.array(x_batch), np.array(y_batch), batch_size=len(x_batch), verbose=0)
        self.memory = []
if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    env.reset()

    MAX_STEPS = 2000
    STEP_TIME = 0.03
    N_EP = 5000
    COMBO_WIN_EP = 100

    agent = DQN(env)
    scores = []
    good_tries_combo = 0

    for e in range(N_EP):
        state = np.reshape(env.reset(), [1, 4])
        score = 0
        for t in range(MAX_STEPS):
            action = agent.act(state, e)
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, 4])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break

        # CartPole-v0 considers a score of 195+ a successful episode
        if score >= 195:
            good_tries_combo += 1
        else:
            good_tries_combo = 0

        scores.append(score)
        mean_score = np.mean(scores[-100:])
        if mean_score >= 195:
            print('Solved after {} episodes.'.format(e - 100))
            break
        if e % 100 == 0:
            print('[Episode {}] - Mean survival time over last 100 episodes was {} ticks.'.format(e, mean_score))

        agent.replay()
    ## Play Solved
    while True:
        input("Press enter to show magic")
        state = np.reshape(env.reset(), [1, 4])
        while True:
            action = agent.act(state)
            state, _, done, _ = env.step(action)
            state = np.reshape(state, [1, 4])
            env.render()
            time.sleep(STEP_TIME)
            if done:
                break
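Not part of the original gist: a minimal sketch, assuming the Keras model built above, of how the trained network could be saved to disk and reloaded for later playback (the file name cartpole_dqn.h5 is only an example):

from keras.models import load_model

# Persist the trained Q-network (filename is illustrative)
agent.model.save('cartpole_dqn.h5')

# Later, restore it and keep acting greedily
model = load_model('cartpole_dqn.h5')
state = np.reshape(env.reset(), [1, 4])
action = int(np.argmax(model.predict(state)))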