#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun May 28 15:21:00 2017
@author: diegslva

Here we use Q-learning to solve CartPole-v0 with quantized states.
We bin each of the four observation features so that the set of states
becomes discrete and finite.
We also override the reward with -300 when an episode ends early, which
penalizes letting the pole fall and keeps the cart inside the binned area.
"""
from __future__ import print_function, division
from builtins import range

import gym
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
from gym import wrappers
from datetime import datetime
# turn a list of integers into a single int
# e.g. build_state([1,2,3,4,5]) -> 12345
def build_state(features):
  """Concatenate the features' bin indices into a single integer state."""
  return int("".join(map(lambda feature: str(int(feature)), features)))
#end build_state
def to_bin(value, bins):
  """Return the index of the bin that value falls into."""
  return np.digitize(x=[value], bins=bins)[0]
#end to_bin
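# A quick sanity check of the binning (a hypothetical REPL session; the
# values follow from np.digitize's indexing, which returns 0 for anything
# below the first edge and len(bins) for anything at or above the last):
#   >>> bins = np.linspace(-2.4, 2.4, 9)   # 9 edges -> bin indices 0..9
#   >>> to_bin(0.0, bins)
#   5
#   >>> to_bin(-5.0, bins), to_bin(5.0, bins)
#   (0, 9)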
class FeatureTransformer:
  def __init__(self):
    # Note: to make this better you could look at how often each bin is
    # actually used while running this script.
    # Neither the observation space's high/low values nor sample() makes it
    # clear what ranges we should really expect here.
    self.cart_position_bins = np.linspace(-2.4, 2.4, 9)
    self.cart_velocity_bins = np.linspace(-2, 2, 9) # (-inf, inf)
    self.pole_angle_bins = np.linspace(-0.4, 0.4, 9)
    self.pole_velocity_bins = np.linspace(-3.5, 3.5, 9) # (-inf, inf)
  def transform(self, observation):
    # return an int encoding all four binned features
    cart_pos, cart_vel, pole_angle, pole_vel = observation
    return build_state([
        to_bin(cart_pos, self.cart_position_bins),
        to_bin(cart_vel, self.cart_velocity_bins),
        to_bin(pole_angle, self.pole_angle_bins),
        to_bin(pole_vel, self.pole_velocity_bins)
    ])
#end FeatureTransformer
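# Hypothetical usage of the transformer: the all-zero "upright" observation
# lands in the middle bin (index 5) of every feature, so it encodes to 5555:
#   ft = FeatureTransformer()
#   ft.transform([0.0, 0.0, 0.0, 0.0])  # -> 5555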
class Model:
  def __init__(self, env, feature_transformer):
    self.env = env
    self.feature_transformer = feature_transformer

    # 10 bins per observation feature -> 10**4 possible quantized states
    num_states = 10**env.observation_space.shape[0]
    num_actions = env.action_space.n
    self.Q = np.random.uniform(low=-1, high=1, size=(num_states, num_actions))
  #end __init__
  def predict(self, s):
    x = self.feature_transformer.transform(s)
    return self.Q[x]
  #end predict
  def update(self, s, a, G):
    """Nudge Q(s, a) toward the return G (a gradient-descent step on the squared error)."""
    x = self.feature_transformer.transform(s)
    self.Q[x, a] += 1e-2*(G - self.Q[x, a])  # learning rate 1e-2
  #end update
  def sample_action(self, s, eps):
    # epsilon-greedy: explore with probability eps, otherwise act greedily
    if np.random.random() < eps:
      return self.env.action_space.sample()
    else:
      p = self.predict(s)
      return np.argmax(p)
  #end sample_action
#end Model
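# For CartPole-v0 this gives a modest table: 4 features with 10 bins each
# yield 10**4 quantized states, and the action space has 2 actions
# (push left / push right), so Q has shape (10000, 2). A hypothetical check:
#   m = Model(gym.make('CartPole-v0'), FeatureTransformer())
#   m.Q.shape  # -> (10000, 2)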
def play_one(env, model, eps, gamma):
  """Play one episode
  @return: totalreward
  """
  observation = env.reset()
  done = False
  totalreward = 0
  iters = 0
  while not done and iters < 10000:
    action = model.sample_action(observation, eps)
    prev_observation = observation
    observation, reward, done, info = env.step(action)

    # accumulate the real rewards for reporting
    totalreward += reward

    # if the episode ends before the 200-step limit (the pole fell),
    # override the reward with a -300 penalty
    if done and iters < 199:
      reward = -300

    # update the model with the Q-learning target G = r + gamma * max_a' Q(s', a')
    G = reward + gamma*np.max(model.predict(observation))
    model.update(prev_observation, action, G)

    iters += 1

  return totalreward
#end play_one
def plot_running_avg(totalrewards):
  """Plot the running average over the last 100 episodes for a smoother view"""
  N = len(totalrewards)
  running_avg = np.empty(N)
  for t in range(N):
    running_avg[t] = totalrewards[max(0, t-100):(t+1)].mean()
  plt.plot(running_avg)
  plt.title("Running Average")
  plt.show()
#end plot_running_avg
if __name__ == '__main__':
  # initialize variables
  recording = True
  env = gym.make('CartPole-v0')
  ft = FeatureTransformer()
  model = Model(env, ft)

  # discount rate
  gamma = 0.9

  if recording:
    filename = os.path.basename(__file__).split('.')[0]
    monitor_dir = './' + filename + '_' + str(datetime.now())
    env = wrappers.Monitor(env, monitor_dir)

  N = 10000
  totalrewards = np.empty(N)
  for n in range(N):
    eps = 1.0/np.sqrt(n+1)
    totalreward = play_one(env, model, eps, gamma)
    totalrewards[n] = totalreward
    if n % 100 == 0:
      print("episode:", n, "total reward:", totalreward, "eps:", eps)

  print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
  print("total steps:", totalrewards.sum())

  plt.plot(totalrewards)
  plt.title("Rewards")
  plot_running_avg(totalrewards)

  # Submit to OpenAI Gym (the scoreboard/upload API only exists in older gym releases)
  env.close()
  if recording:
    print("Uploading to gym...")
    gym.scoreboard.api_key = "" # Put your key here
    print("Results: " + str(gym.upload(monitor_dir)))
#==============================================================================
# [2017-05-28 18:47:01,029] Making new env: CartPole-v0
# episode: 0 total reward: 29.0 eps: 1.0
# episode: 100 total reward: 30.0 eps: 0.099503719021
# ...
# episode: 9800 total reward: 200.0 eps: 0.010101010101
# episode: 9900 total reward: 200.0 eps: 0.0100498705962
# avg reward for last 100 episodes: 197.23
# total steps: 1834026.0
#==============================================================================
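#==============================================================================
# A minimal sketch (not part of the original run above) of watching the
# learned greedy policy after training, on a fresh unmonitored environment
# and using the same old-style gym step API as the script:
#
#   env = gym.make('CartPole-v0')
#   observation = env.reset()
#   done = False
#   while not done:
#       env.render()
#       action = np.argmax(model.predict(observation))
#       observation, reward, done, info = env.step(action)
#==============================================================================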