#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun May 28 15:21:00 2017
@author: diegslva
Here we try use Q-learning to solve CartPole-v0 with quantize states
That means we going bin each states so that this set of states will be
discrete and finite
We update our reward to -300 to make our model to not go to far
and restrict our area using bins
"""
from __future__ import print_function, division
from builtins import range
import gym
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
from gym import wrappers
from datetime import datetime
# turn list of integers into an int
# ex.
# build_state([1,2,3,4,5]) -> 12345
def build_state(features):
    """Concatenate the integer bin indices into a single integer state."""
    return int("".join(map(lambda feature: str(int(feature)), features)))
#end build_state

def to_bin(value, bins):
    """Return the index of the bin that value falls into."""
    return np.digitize(x=[value], bins=bins)[0]
#end to_bin
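
# Illustrative example (values chosen here, not from the original script):
# with bins = np.linspace(-0.4, 0.4, 9) the edges are -0.4, -0.3, ..., 0.4,
# so to_bin(0.05, bins) == 5, since 0.05 falls between edges 0.0 and 0.1.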
class FeatureTransformer:
    def __init__(self):
        # Note: to make this better you could look at how often each bin is
        # actually used while running this script.
        # It's not clear from the high/low values nor from sample() what
        # values we really expect here.
        self.cart_position_bins = np.linspace(-2.4, 2.4, 9)
        self.cart_velocity_bins = np.linspace(-2, 2, 9)      # true range is (-inf, inf)
        self.pole_angle_bins = np.linspace(-0.4, 0.4, 9)
        self.pole_velocity_bins = np.linspace(-3.5, 3.5, 9)  # true range is (-inf, inf)

    def transform(self, observation):
        # return an int encoding the four bin indices
        cart_pos, cart_vel, pole_angle, pole_vel = observation
        return build_state([
            to_bin(cart_pos, self.cart_position_bins),
            to_bin(cart_vel, self.cart_velocity_bins),
            to_bin(pole_angle, self.pole_angle_bins),
            to_bin(pole_vel, self.pole_velocity_bins)
        ])
#end FeatureTransformer
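
# Worked example of transform() with a hypothetical observation (not taken
# from the script): for observation [0.01, 0.1, 0.02, 0.05] each value falls
# into bin 5 of its binning above, so build_state([5, 5, 5, 5]) returns the
# integer state 5555, which indexes one row of the Q-table below.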
class Model:
    def __init__(self, env, feature_transformer):
        self.env = env
        self.feature_transformer = feature_transformer

        # 4 observation variables, 10 bins each -> 10**4 possible states
        num_states = 10**env.observation_space.shape[0]
        num_actions = env.action_space.n
        self.Q = np.random.uniform(low=-1, high=1, size=(num_states, num_actions))
    #end __init__

    def predict(self, s):
        x = self.feature_transformer.transform(s)
        return self.Q[x]
    #end predict

    def update(self, s, a, G):
        """Move Q(s, a) toward the target return G (learning rate 10e-3 == 0.01)."""
        x = self.feature_transformer.transform(s)
        self.Q[x, a] += 10e-3*(G - self.Q[x, a])
    #end update

    def sample_action(self, s, eps):
        # epsilon-greedy: explore with probability eps, otherwise act greedily
        if np.random.random() < eps:
            return self.env.action_space.sample()
        else:
            p = self.predict(s)
            return np.argmax(p)
    #end sample_action
#end Model
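
# Minimal usage sketch of Model (assumes the same gym 0.x API used in this
# script; it mirrors what play_one() does below):
#
#   env = gym.make('CartPole-v0')
#   model = Model(env, FeatureTransformer())
#   s = env.reset()
#   a = model.sample_action(s, eps=0.1)            # epsilon-greedy action
#   s2, r, done, info = env.step(a)
#   G = r + 0.9*np.max(model.predict(s2))          # one-step Q-learning target
#   model.update(s, a, G)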
def play_one(env, model, eps, gamma):
    """Play one episode and return the total (undiscounted) reward."""
    observation = env.reset()
    done = False
    totalreward = 0
    iters = 0
    while not done and iters < 10000:
        action = model.sample_action(observation, eps)
        prev_observation = observation
        observation, reward, done, info = env.step(action)

        # accumulate our rewards
        totalreward += reward

        # if the pole falls before reaching the 200-step limit,
        # penalize it with a reward of -300
        if done and iters < 199:
            reward = -300

        # update the model with the one-step Q-learning target
        G = reward + gamma*np.max(model.predict(observation))
        model.update(prev_observation, action, G)

        iters += 1
    return totalreward
#end play_one
def plot_running_avg(totalrewards):
    """Plot the running average of the last 100 rewards for a smoother view"""
    N = len(totalrewards)
    running_avg = np.empty(N)
    for t in range(N):
        running_avg[t] = totalrewards[max(0, t-100):(t+1)].mean()
    plt.plot(running_avg)
    plt.title("Running Average")
    plt.show()
#end plot_running_avg
if __name__ == '__main__':
    # initialize variables
    recording = True
    env = gym.make('CartPole-v0')
    ft = FeatureTransformer()
    model = Model(env, ft)

    # discount rate
    gamma = 0.9

    if recording:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 10000
    totalrewards = np.empty(N)
    for n in range(N):
        eps = 1.0/np.sqrt(n+1)
        totalreward = play_one(env, model, eps, gamma)
        totalrewards[n] = totalreward
        if n % 100 == 0:
            print("episode:", n, "total reward:", totalreward, "eps:", eps)
    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", totalrewards.sum())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plot_running_avg(totalrewards)

    # Submit to OpenAI Gym
    env.close()
    print("Uploading to gym...")
    gym.scoreboard.api_key = ""  # Put your key here
    print("Results: " + str(gym.upload(monitor_dir)))
#==============================================================================
#[2017-05-28 18:47:01,029] Making new env: CartPole-v0
# episode: 0 total reward: 29.0 eps: 1.0
# episode: 100 total reward: 30.0 eps: 0.099503719021
# ...
# episode: 9800 total reward: 200.0 eps: 0.010101010101
# episode: 9900 total reward: 200.0 eps: 0.0100498705962
# avg reward for last 100 episodes: 197.23
# total steps: 1834026.0
#==============================================================================