Gym CartPole DQN in TensorFlow Keras
## Author: Gananath R
## Deep Q-Network for gym in tensorflow keras
from collections import deque
import tensorflow as tf
import numpy as np
import random
import gym

k = tf.keras
tf.set_random_seed(2019)
np.random.seed(2019)
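
# Epsilon-greedy policy: explore with probability exploration_rate,
# otherwise act greedily on the network's predicted Q-values.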
def act(state, exploration_rate):
    if np.random.rand() <= exploration_rate:
        return random.randrange(n_actions)
    act_values = model.predict(state)
    return np.argmax(act_values[0])

def remember(state, action, reward, next_state, done):
    memory.append((state, action, reward, next_state, done))
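
# Experience replay: train on a random minibatch of stored transitions,
# regressing Q(s, a) toward the Bellman target r + gamma * max_a' Q(s', a').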
def replay(sample_batch_size):
    if len(memory) < sample_batch_size:
        return
    sample_batch = random.sample(memory, sample_batch_size)
    for state, action, reward, next_state, done in sample_batch:
        target = reward
        if not done:
            target = reward + gamma * np.amax(model.predict(next_state)[0])
        target_f = model.predict(state)
        target_f[0][action] = target
        model.fit(state, target_f, epochs=1, verbose=0)
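
# Incremental (running) mean: mean_n = ((n - 1) * mean_{n-1} + x_n) / n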
def average(prev_mean, new_value, count):
    # https://math.stackexchange.com/a/750517/23624
    return (prev_mean * (count - 1) + new_value) / count

# environment and hyperparameters
env = gym.make('CartPole-v0')
env.seed(2019)

sample_batch_size = 32
episodes = 300
exploration_rate = 1.0
exploration_min = 0.01
exploration_decay = 0.995
gamma = 0.95                # discount factor
memory = deque(maxlen=500)  # replay buffer
n_actions = env.action_space.n
n_states = env.observation_space.shape[0]
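
# Q-network: maps a state (4 floats for CartPole) to one Q-value per action.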
model = k.models.Sequential()
model.add(k.layers.Dense(5, input_dim=n_states, activation='relu'))
model.add(k.layers.Dense(5, activation='relu'))
model.add(k.layers.Dense(n_actions, activation='linear'))
model.compile(loss='mse', optimizer=k.optimizers.Adam(lr=0.001))
# model.summary()
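
# Training loop: one episode per iteration; store transitions as they happen,
# then learn from a replay minibatch after each episode and decay exploration.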
avg_reward = 0
for episode in range(1, episodes + 1):
    state = env.reset()
    state = np.reshape(state, [1, n_states])
    done = False
    total_reward = 0
    while not done:
        action = act(state, exploration_rate)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, n_states])
        remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
    # episode finished: update the running average, learn, decay exploration
    avg_reward = average(avg_reward, total_reward, episode)
    replay(sample_batch_size)
    if episode % 20 == 0:
        print("Episode: " + str(episode) + " Total Reward: " + str(total_reward) +
              " Avg Reward: " + str(avg_reward))
    if exploration_rate > exploration_min:
        exploration_rate *= exploration_decay
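
# --- Not part of the original gist: a minimal greedy-evaluation sketch. ---
# Assumes the trained `model`, `env`, and `n_states` defined above; it runs a
# few episodes with exploration disabled (pure argmax over predicted Q-values).
for test_episode in range(1, 6):
    state = np.reshape(env.reset(), [1, n_states])
    done = False
    test_reward = 0
    while not done:
        action = np.argmax(model.predict(state)[0])  # greedy action, no exploration
        state, reward, done, _ = env.step(action)
        state = np.reshape(state, [1, n_states])
        test_reward += reward
    print("Test episode " + str(test_episode) + " reward: " + str(test_reward))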