Skip to content

Instantly share code, notes, and snippets.

@blin00
Last active February 13, 2017 01:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save blin00/ace75cd8e86ccf0d1b1f44962a2ed442 to your computer and use it in GitHub Desktop.
Save blin00/ace75cd8e86ccf0d1b1f44962a2ed442 to your computer and use it in GitHub Desktop.
Super rough Q-learning for CartPole-v0.
#!/usr/bin/python3
import numpy as np
import gym
from gym import wrappers
import random
from collections import defaultdict
def process(obs, scale=10):
    """Discretize a continuous observation into a hashable Q-table key.

    Each component is multiplied by ``scale`` and rounded to the nearest
    integer, so with the default ``scale=10`` values are bucketed on a
    grid of width 0.1.

    Args:
        obs: Iterable of numbers (e.g. the observation returned by the
            environment).
        scale: Number of buckets per unit; larger values give a finer
            grid. Defaults to 10.

    Returns:
        A tuple of ints, one per component of ``obs``.
    """
    # int() keeps the key made of plain Python ints even when round()
    # hands back a non-int numeric type for the input elements.
    return tuple(int(round(scale * x)) for x in obs)
# Tabular Q-learning training loop for CartPole-v0.
#
# Observations are discretized by process() and used as keys into a
# Q-table; actions are chosen epsilon-greedily with a decaying exploration
# rate, and the table is updated with a decaying learning rate.

NUM_EPISODES = 10000  # episodes to train for; also the schedule denominator
MAX_STEPS = 1000      # upper bound on steps taken in a single episode

qvals = defaultdict(lambda: [0, 0])
# entries are [action0reward, action1reward]
discount = 0.99
env = gym.make('CartPole-v0')
env = wrappers.Monitor(env, '/tmp/cartpole')
try:
    for i in range(NUM_EPISODES):
        # Exploration rate falls linearly from 0.5 and reaches 0 halfway
        # through training.
        explore = max(0, 0.5 - i / NUM_EPISODES)
        # Learning rate follows the same ramp but never drops below 0.1.
        learn = max(0.1, 0.5 - i / NUM_EPISODES)
        new_obs = process(env.reset())
        for _ in range(MAX_STEPS):
            obs = new_obs
            if random.random() < explore:
                action = random.randint(0, 1)
            else:
                # Cast so the action is a plain int rather than a NumPy
                # integer when used as a list index / passed to the env.
                action = int(np.argmax(qvals[obs]))
            new_obs, reward, done, _info = env.step(action)
            new_obs = process(new_obs)
            # Q-learning update: blend the old estimate with the one-step
            # bootstrapped target.
            # NOTE(review): the target bootstraps from new_obs even when
            # `done` is set; confirm whether terminal transitions should
            # use `reward` alone.
            target = reward + discount * max(qvals[new_obs])
            qvals[obs][action] = (1 - learn) * qvals[obs][action] + learn * target
            if done:
                break
finally:
    # Always release the environment, even if training is interrupted.
    # Presumably this is also what lets the Monitor wrapper finalize its
    # output under /tmp/cartpole -- verify against the installed gym.
    env.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment