@tomykaira
Created February 4, 2019 10:23
# Monte Carlo (virtual kodoku)
import gym
import numpy as np
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import random
import math
import multiprocessing
init_notebook_mode(connected=True)
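# Overview: learn a linear action value q(s, a) = (theta @ s) * a for CartPole, with
# a in {+1, -1} mapped to the env actions 1 and 0. Completed episodes are replayed
# backwards and theta is nudged toward the discounted Monte Carlo return of each
# visited state. Agents that die too early are scrapped and re-initialized, which is
# presumably the "virtual kodoku" of the title.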
alpha = 0.01   # base learning rate
gamma = 0.50   # discount factor for the Monte Carlo return
decay = 0.001  # learning-rate decay per recorded update
theta = np.random.uniform(low=0, high=1, size=(4))  # weights of the linear Q function
env = gym.make('CartPole-v1')
data = []
theta_log = []      # [episode, theta snapshot, episode length]
avg_delta_qs = []   # mean squared update size per episode, drives the lr decay
theta_log.append([0, theta.copy(), 0])
turns = []          # training episode lengths
ans_turns = []      # evaluation episode lengths
def avg(arr):
    return sum(arr) / len(arr)
def q(state, action):
    # Linear action value: action is +1 or -1, so it flips the sign of theta @ state.
    return (theta @ state) * action
def update_qtable(history):
    global theta
    # Learning rate decays with the number of updates performed so far.
    lr = alpha * (1. / (1. + decay * len(avg_delta_qs)))
    total_reward_t = 0
    delta_qs = []
    # Walk the episode backwards, accumulating the discounted return of each step.
    for i in range(len(history)-1, 0, -1):
        ith = history[i]
        total_reward_t = gamma * total_reward_t + ith[2]
        old_q = q(ith[0], ith[1])
        delta_q = lr * (total_reward_t - old_q)
        delta_qs.append(delta_q * delta_q)
        for j in range(4):
            theta[j] += delta_q * ith[0][j]
    avg_delta_q = avg(delta_qs)
    avg_delta_qs.append(avg_delta_q)
    #if len(avg_delta_qs) % 500 == 0:
    #    print(avg_delta_q)
    #if len(avg_delta_qs) > 500 and avg(avg_delta_qs[-50:-1]) < avg_delta_q:
    #    return True
    #else:
    #    return False
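# The loop above implements, for each visited step t (walking backwards, stopping
# before index 0):
#   G_t   = r_t + gamma * G_{t+1}
#   theta <- theta + lr * (G_t - q(s_t, a_t)) * s_t
# i.e. the linear Q value is regressed toward the observed discounted return,
# using the raw state vector as the update direction.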
def make(state, episode):
    # eps-greedy
    if episode == -1:
        epsilon = 0
    else:
        epsilon = 0.5 * (1 / (episode + 1))
    if random.random() < epsilon:
        return 1 if random.randrange(2) == 1 else -1
    else:
        return 1 if q(state, 1) >= q(state, -1) else -1
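# In make() above, exploration follows epsilon = 0.5 / (episode + 1): 0.5 on the first
# episode, about 0.005 by episode 100, so the policy becomes greedy quickly. Calling
# make(state, -1) forces epsilon = 0 and is used for evaluation.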
def learn():
    global theta
    theta = np.random.uniform(low=0, high=1, size=(4))
    i = 0
    while i < 5000:
        obs = env.reset()
        history = []
        turn = 0
        action = make(obs, i)
        while True:
            next_obs, reward, done, _ = env.step(1 if action == 1 else 0)
            turn += 1
            # no reward tweak because it is monte carlo
            if done:
                if turn < min(490, i / 10):
                    reward = -500
                else:
                    reward = 1000
            else:
                reward = 0
            history.append([obs, action, reward])
            next_action = make(next_obs, i)
            obs = next_obs
            action = next_action
            if done:
                if turn < min(100, i / 50):
                    # Episode ended too early: discard this agent and restart
                    # from fresh random weights, resetting the episode counter.
                    theta = np.random.uniform(low=0, high=1, size=(4))
                    i = 0
                else:
                    update_qtable(history)
                    theta_log.append([i+1, theta.copy(), turn])
                    turns.append(turn)
                    i += 1
                break
    return [theta.copy(), avg(turns[-50:-1])]
results = []
for i in range(1):
    results.append(learn())
# Keep the theta whose training run had the longest recent episodes.
theta = max(results, key=lambda x: x[1])[0]
epsilon = 0
# Evaluate the greedy policy (episode == -1, no exploration) for 50 episodes.
for i in range(50):
    obs = env.reset()
    turn = 0
    action = make(obs, -1)
    while True:
        next_obs, reward, done, _ = env.step(1 if action == 1 else 0)
        turn += 1
        next_action = make(next_obs, -1)
        obs = next_obs
        action = next_action
        if done:
            ans_turns.append(turn)
            break
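# Plot the four components of theta and the episode length (scaled by 1/50) against
# the episode index recorded in theta_log.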
def plot_log():
    plt = []
    for i in range(4):
        plt.append(go.Scatter(x=[x[0] for x in theta_log], y=[x[1][i] for x in theta_log]))
    plt.append(go.Scatter(x=[x[0] for x in theta_log], y=[x[2] / 50 for x in theta_log]))
    iplot(plt)
plot_log()
print([theta, turn, avg(ans_turns)])
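# `run` is used by the multiprocessing block below but is not defined anywhere in this
# gist. A minimal sketch, assuming each worker is simply meant to train one agent and
# report the result through its queue (hypothetical stand-in, not the original helper):
def run(que):
    que.put(learn())  # learn() returns [theta, avg length of recent training episodes]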
if __name__ == '__main__':
    qs = []
    ps = []
    for i in range(10):
        que = multiprocessing.Queue()
        p = multiprocessing.Process(target=run, args=(que,))
        p.start()
        ps.append(p)
        qs.append(que)
    for p in ps:
        p.join()
    for que in qs:
        print(que.get())
    print("done")