Last active
October 30, 2019 14:01
-
-
Save joannapurosto/2d2892fa051219982a7799568dbe043b to your computer and use it in GitHub Desktop.
Q-learning tutorial part 3 and Deep Learning
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from enums import * | |
import random | |
import tensorflow as tf | |
import numpy as np | |
class DeepGambler:
    """Q-learning agent that estimates Q-values with a small TensorFlow 1.x network.

    The agent observes one of 5 discrete game states (0-4) and chooses between
    two actions (FORWARD / BACKWARD, imported from the project's ``enums``
    module). State is fed to the network one-hot encoded; the network outputs
    one Q-value per action. Exploration decays linearly from
    ``exploration_rate`` to 0 over ``iterations`` calls to :meth:`update`.
    """

    def __init__(self, learning_rate=0.1, discount=0.95, exploration_rate=1.0, iterations=10000):
        self.learning_rate = learning_rate
        self.discount = discount  # How much we appreciate future reward over current
        # BUG FIX: the original assigned the literal 1.0 here, silently
        # ignoring the exploration_rate constructor argument.
        self.exploration_rate = exploration_rate
        self.exploration_delta = 1.0 / iterations  # Shift from exploration to exploitation
        # Input has five neurons, each represents a single game state (0-4)
        self.input_count = 5
        # Output is two neurons, each represents the Q-value for an action (FORWARD and BACKWARD)
        self.output_count = 2
        self.session = tf.Session()
        self.define_model()
        self.session.run(self.initializer)

    def define_model(self):
        """Build the TF graph: a 5-16-16-2 MLP, MSE loss, and SGD optimizer."""
        # Input is an array of 5 items (state one-hot).
        # Input is 2-dimensional, due to possibility of batched training data.
        # NOTE: In this example we assume no batching.
        self.model_input = tf.placeholder(dtype=tf.float32, shape=[None, self.input_count])
        # Two hidden layers of 16 neurons with sigmoid activation, weights
        # initialized to zero as in the tutorial.
        # BUG FIX: the original passed tf.constant_initializer with an
        # explicitly-shaped zeros array, and gave fc2 a (16, output_count)
        # array for its (16, 16) kernel — a shape mismatch.
        # tf.zeros_initializer() adapts to each kernel's shape.
        # NOTE(review): all-zero hidden weights keep the sigmoid units
        # symmetric; kept only because the original tutorial intends it.
        fc1 = tf.layers.dense(self.model_input, 16, activation=tf.sigmoid,
                              kernel_initializer=tf.zeros_initializer())
        fc2 = tf.layers.dense(fc1, 16, activation=tf.sigmoid,
                              kernel_initializer=tf.zeros_initializer())
        # Output is two values, Q for both possible actions FORWARD and BACKWARD.
        # Output is 2-dimensional, due to possibility of batched training data.
        # NOTE: In this example we assume no batching.
        self.model_output = tf.layers.dense(fc2, self.output_count)
        # This is for feeding training output (a.k.a. ideal target values).
        self.target_output = tf.placeholder(shape=[None, self.output_count], dtype=tf.float32)
        # Loss is the mean squared difference between current output and ideal target values.
        loss = tf.losses.mean_squared_error(self.target_output, self.model_output)
        # Optimizer adjusts weights to minimize loss, with the speed of learning_rate.
        self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(loss)
        # Initializer to set weights to their initial values.
        self.initializer = tf.global_variables_initializer()

    def get_Q(self, state):
        """Ask the model to estimate Q-values for a specific state (inference).

        Returns a 1-D array of ``output_count`` Q-values for ``state``.
        """
        # Model input: single state represented by an array of 5 items (one-hot).
        # Model output: array of Q values for that single state ([0] drops the batch dim).
        return self.session.run(self.model_output,
                                feed_dict={self.model_input: self.to_one_hot(state)})[0]

    def to_one_hot(self, state):
        """Turn a state index into a 2-D one-hot tensor.

        Example: 3 -> [[0, 0, 0, 1, 0]]
        """
        # Use input_count instead of the original hard-coded 5, so the
        # encoding stays consistent with the network's input layer.
        one_hot = np.zeros((1, self.input_count))
        one_hot[0, state] = 1
        return one_hot

    def get_next_action(self, state):
        """Choose the next action: exploit (greedy) or explore (random)."""
        if random.random() > self.exploration_rate:
            return self.greedy_action(state)
        else:
            return self.random_action()

    def greedy_action(self, state):
        """Return the action with the bigger Q-value, as estimated by the model."""
        # argmax picks the higher Q-value and returns the index (FORWARD=0, BACKWARD=1).
        return np.argmax(self.get_Q(state))

    def random_action(self):
        """Return FORWARD or BACKWARD with equal probability."""
        return FORWARD if random.random() < 0.5 else BACKWARD

    def train(self, old_state, action, reward, new_state):
        """Run one gradient step toward the Bellman target for (state, action)."""
        # Ask the model for the Q values of the old state (inference).
        old_state_Q_values = self.get_Q(old_state)
        # Ask the model for the Q values of the new state (inference).
        new_state_Q_values = self.get_Q(new_state)
        # Real Q value for the action we took. This is what we train towards.
        old_state_Q_values[action] = reward + self.discount * np.amax(new_state_Q_values)
        # Set up training data (batch of one).
        training_input = self.to_one_hot(old_state)
        target_output = [old_state_Q_values]
        training_data = {self.model_input: training_input, self.target_output: target_output}
        # Train.
        self.session.run(self.optimizer, feed_dict=training_data)

    def update(self, old_state, new_state, action, reward):
        """Learn from one transition, then decay the exploration rate."""
        # Train our model with the new data.
        self.train(old_state, action, reward, new_state)
        # Finally shift our exploration_rate toward zero (less gambling).
        if self.exploration_rate > 0:
            self.exploration_rate -= self.exploration_delta
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment