@Gananath
Created April 30, 2019 05:24
Gym CartPole DQN in TensorFlow Keras
## Author: Gananath R
## Deep Q-Network for gym in tensorflow keras
from collections import deque
import tensorflow as tf
import numpy as np
import random
import gym
k = tf.keras
tf.set_random_seed(2019)
np.random.seed(2019)
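# Note (not in the original gist): this script targets TF 1.x-era Keras.
# On TensorFlow 2.x the equivalents are tf.random.set_seed(2019) and
# k.optimizers.Adam(learning_rate=0.001) instead of lr=0.001.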
# Epsilon-greedy action selection: explore with probability exploration_rate,
# otherwise act greedily with respect to the Q-network.
def act(state, exploration_rate):
    if np.random.rand() <= exploration_rate:
        return random.randrange(n_actions)
    act_values = model.predict(state)
    return np.argmax(act_values[0])
# Store a transition in the replay buffer.
def remember(state, action, reward, next_state, done):
    memory.append((state, action, reward, next_state, done))
# Sample a minibatch from the replay buffer and fit the network towards the
# Q-learning target: Q(s, a) <- r + gamma * max_a' Q(s', a').
def replay(sample_batch_size):
    if len(memory) < sample_batch_size:
        return
    sample_batch = random.sample(memory, sample_batch_size)
    for state, action, reward, next_state, done in sample_batch:
        target = reward
        if not done:
            target = reward + gamma * np.amax(model.predict(next_state)[0])
        target_f = model.predict(state)
        target_f[0][action] = target
        model.fit(state, target_f, epochs=1, verbose=0)
def average(prev_mean, new_value, count):
    # incremental running mean: https://math.stackexchange.com/a/750517/23624
    return (prev_mean * (count - 1) + new_value) / count
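# Illustrative sanity check (not in the original gist): feeding rewards 1 and 3
# one at a time should reproduce their plain mean of 2.0.
assert average(average(0, 1, 1), 3, 2) == 2.0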
# environment
env = gym.make('CartPole-v0')
env.seed(2019)

# hyperparameters
sample_batch_size = 32
episodes = 300
exploration_rate = 1
exploration_min = 0.01
exploration_decay = 0.995
gamma = 0.95
memory = deque(maxlen=500)
n_actions = env.action_space.n
n_states = env.observation_space.shape[0]

# Q-network: state vector in, one Q-value per action out
model = k.models.Sequential()
model.add(k.layers.Dense(5, input_dim=n_states, activation='relu'))
model.add(k.layers.Dense(5, activation='relu'))
model.add(k.layers.Dense(n_actions, activation='linear'))
model.compile(loss='mse', optimizer=k.optimizers.Adam(lr=0.001))
# model.summary()
model.input_shape   # (None, 4) -- displayed only in an interactive session
model.output_shape  # (None, 2)
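# For reference (not printed by the original script): model.summary() on this
# architecture shows three Dense layers with 25 + 30 + 12 = 67 trainable
# parameters for CartPole's 4 state inputs and 2 actions.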
avg_reward = 0
# episodes start at 1 so `episode` can double as the running-mean count
for episode in range(1, episodes):
    state = env.reset()
    state = np.reshape(state, [1, n_states])
    done = False
    total_reward = 0
    while not done:
        action = act(state, exploration_rate)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, n_states])
        remember(state, action, reward, next_state, done)
        # per-step updates were replaced by experience replay below
        # target = reward + gamma * np.amax(model.predict(next_state)[0])
        # target_f = model.predict(state)
        # target_f[0][action] = target
        # model.fit(state, target_f, epochs=1, verbose=0)
        state = next_state
        total_reward = reward + total_reward
    avg_reward = average(avg_reward, total_reward, episode)
    if done:  # always True here; train once per episode
        replay(sample_batch_size)
    if episode % 20 == 0:
        print("Episode: " + str(episode) + " Total Reward: " + str(total_reward) +
              " Avg Reward: " + str(avg_reward))
    if exploration_rate > exploration_min:
        exploration_rate *= exploration_decay
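# --- Optional: greedy evaluation of the trained policy ----------------------
# A minimal sketch, not part of the original gist: it reuses env, model and
# act() from above and runs a few episodes with exploration effectively
# disabled (exploration_rate=0) to see how the learned Q-network behaves.
for eval_episode in range(3):
    state = np.reshape(env.reset(), [1, n_states])
    done = False
    eval_reward = 0
    while not done:
        action = act(state, 0)  # pick argmax Q(s, a)
        state, reward, done, _ = env.step(action)
        state = np.reshape(state, [1, n_states])
        eval_reward += reward
    print("Eval episode: " + str(eval_episode) + " Reward: " + str(eval_reward))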