import gym
import numpy as np
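
# NOTE (assumption about the toolchain): this script targets the classic gym
# API from around 2018, where gym.make('Pendulum-v0') exists and env.step
# returns a 4-tuple (observation, reward, done, info). Newer gym/gymnasium
# releases renamed the environment to Pendulum-v1 and changed step() to a
# 5-tuple, so an older gym release is needed to run this file unmodified.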

# Hyperparameters
# Environment reference: https://github.com/openai/gym/wiki/Pendulum-v0
num_episodes = 2000
num_trials = 1000
num_states = 3        # observation dimensions: [cos(theta), sin(theta), theta_dot]
num_actions = 160     # number of discrete torque levels over the [-2, 2] action range
num_digitized = 20    # bins per observation dimension
cos_threshold = 0.91  # mean cos(theta) over 100 episodes required to call it solved

def bins(clip_min, clip_max):
    # Interior bin edges for np.digitize: num_digitized bins over [clip_min, clip_max]
    return np.linspace(clip_min, clip_max, num_digitized + 1)[1:-1]
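
# For example (illustrative values): with num_digitized = 20, bins(-1.0, 1.0)
# returns the 19 interior edges [-0.9, -0.8, ..., 0.9], so np.digitize maps
# any value in [-1, 1] to an integer bin index in 0..19.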

def digitize_state(observation):
    # Pendulum-v0 observations arrive as [cos(theta), sin(theta), theta_dot]
    cos_theta, sin_theta, theta_dot = observation
    digitized = [
        np.digitize(cos_theta, bins=bins(-1.0, 1.0)),
        np.digitize(sin_theta, bins=bins(-1.0, 1.0)),
        np.digitize(theta_dot, bins=bins(-8.0, 8.0)),
    ]
    # Combine the three bin indices into a single base-num_digitized state id
    return sum(x * (num_digitized ** i) for i, x in enumerate(digitized))
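
# Worked example (illustrative indices): bin indices (3, 15, 7) encode to
# 3 + 15*20 + 7*400 = 3103, one of num_digitized**num_states = 8000 states.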

def update_q_table(observation, action, reward, next_observation):
    alpha = 0.6   # learning rate
    gamma = 0.99  # discount factor
    state = digitize_state(observation)
    state_next = digitize_state(next_observation)
    max_q_next = np.max(q_table[state_next])
    q_table[state][action] = (1 - alpha) * q_table[state][action] + alpha * (reward + gamma * max_q_next)
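
# update_q_table implements the standard tabular Q-learning rule:
#   Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))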

def decide_action(observation, episode):
    # epsilon-greedy: explore with probability epsilon, otherwise act greedily
    state = digitize_state(observation)
    epsilon = max(0.0, 0.4 - 0.0016 * episode)
    if epsilon <= np.random.uniform(0, 1):
        action = np.argmax(q_table[state])
    else:
        action = np.random.choice(num_actions)
    return action
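
# The schedule decays epsilon linearly from 0.4 at episode 0 down to 0 at
# episode 250 (0.4 / 0.0016 = 250); afterwards the policy is purely greedy.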

def get_bonus(cos_theta):
    # Reward shaping: bonus near upright, matching penalty near hanging down
    bonus = 0
    if cos_theta > 0.98:
        bonus = (cos_theta - 0.98) * 5000
    if cos_theta < -0.98:
        bonus = (cos_theta + 0.98) * 5000
    return bonus
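
# cos(theta) > 0.98 corresponds to |theta| < acos(0.98) ~= 0.2 rad (~11.5
# degrees), so the bonus fires only very near the upright position and the
# symmetric penalty fires only very near the hanging position.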

# Initialization
env = gym.make('Pendulum-v0')
q_table = np.random.uniform(low=0, high=1, size=(num_digitized ** num_states, num_actions))  # 8000 x 160
episode_cos = np.full(100, -1.0)  # rolling window of per-episode mean cos(theta)
is_solved = False

for episode in range(num_episodes):
    observation = env.reset()
    cos_values = np.full(num_trials, -1.0)
    for t in range(num_trials):
        # Render the final run once solved, plus every 250th episode as a progress check
        if is_solved or episode % (num_episodes // 8) == 0:
            env.render()
        action = decide_action(observation, episode)
        # Map the discrete action index 0..159 onto a torque in [-2.0, 2.0)
        next_observation, reward, done, info = env.step([4.0 * (action / num_actions) - 2.0])
        cos_values[t] = next_observation[0]  # observation[0] is cos(theta)
        reward += get_bonus(cos_values[t])
        update_q_table(observation, action, reward, next_observation)
        observation = next_observation
    if is_solved:
        # One fully rendered episode has been shown after solving; stop here
        break
    # Check the solved requirement: mean cos(theta) over the last 100 episodes
    episode_cos[episode % episode_cos.size] = np.mean(cos_values)
    print("Episode {0} finished. The average cos(theta) is {1}.".format(episode + 1, np.mean(episode_cos)))
    if np.mean(episode_cos) > cos_threshold:
        print("Episode {0}: trained the agent successfully!".format(episode + 1))
        is_solved = True
env.close()
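
# Sanity check on the action mapping (illustrative): index 0 gives torque
# -2.0, index 80 gives 0.0, and index 159 gives 1.975, i.e. a grid with step
# 4 / 160 = 0.025 covering [-2.0, 2.0) that never emits the maximum torque.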