@ctmakro
Created January 17, 2017 06:55
DDPG, train 64*50 every 50 steps, 1/f^2 noise added
from __future__ import print_function
# Deep Deterministic Policy Gradient Method
# David Silver et al.
# implemented in plain Keras, by Qin Yongliang
# 2017 01 13
# heavily optimized for speed, lots of numpy flowed into tensorflow
# 2017 01 14
'''
summary

0. s for state, a for action, r for reward,
   q for 'action quality', i.e. the expectation of the sum of discounted future rewards.

1. you have 2 networks, Mr. actor and Mr. critic
   - Mr. actor generates actions: a = actor(s)
   - Mr. critic scores (state,action) pairs: q = critic(s,a)
   > in the literature, Mr. actor is the function mu(s), Mr. critic is the function Q(s,a)

2. you improve Mr. critic by using the Bellman equation, or what they call TD-learning
   - Q(s1,a1) := r1 + gamma * Q(s2,a2) where a2 = actor(s2)
   - train Mr. critic to predict the calculated Q(s1,a1) given s1 and a1,
     using gradient descent and MSE loss.

3. after that, improve Mr. actor by gradient ascent w.r.t. Q(s,a)
   - a1_maybe = actor(s1), q1_maybe = critic(s1,a1_maybe)
   - therefore q1_maybe = critic(s1,actor(s1)). we want to increase q1_maybe!
   - so figure out the gradient of q1_maybe w.r.t. the actor's parameters,
     using tf.gradients() or by composing Keras Models (as done here, to keep things clean)
   - then do gradient ascent to increase the q-value of Mr. actor's actions

4. to stabilize the whole learning process:
   - sample training examples randomly from a replay memory
   - use 'target' networks that are copies of the actor and critic,
     whose weights gradually shift towards the weights of the real actor and critic,
     to reduce self-correlation/oscillation (well, if you know control theory)
   - add noise to the actor's output at the beginning of learning,
     to turn deterministic actions into probabilistic ones
   - that's basically it

5. now go master the game of Gym
'''
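# --- illustration (not part of the original gist) ---
# the two update rules from the summary above, written out in plain python
# (argument names are illustrative; the real versions live inside the Keras
# models defined further down):
def _td_target_sketch(r1, isdone, q2, gamma=0.99):
    # point 2: Bellman target for the critic. on terminal steps the target is just r1.
    return r1 + (1. - isdone) * gamma * q2

def _soft_update_sketch(theta, theta_target, tau=0.001):
    # point 4: target network weights drift slowly towards the online weights.
    return [w * tau + wt * (1. - tau) for w, wt in zip(theta, theta_target)]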
'''
personal tricks:
check the Residual Dense Unit, it works!
'''
# gym boilerplate
import numpy as np
import gym
from gym import wrappers
from gym.spaces import Discrete, Box
# keras boilerplate: the simplest way to neural networking
from keras.models import *
from keras.layers import *
from keras.optimizers import *
import keras
from math import *
import random
import keras.backend as K
import time
# from winfrey import wavegraph
from rpm import rpm # replay memory implementation
from noise import one_fsq_noise
def bn(i):
    # batch normalization is disabled for now: the early return skips the line below.
    return i
    return BatchNormalization(mode=1)(i)
def relu(i):
    return Activation('relu')(i)
# residual dense unit
def resdense(idim,odim):
    def unit(i):
        mdim = max(4,int(idim/4),int(odim/4))
        if idim==odim:
            ident = i
            i = bn(i)
            i = relu(i)
            i = Dense(mdim)(i)
            i = bn(i)
            i = relu(i)
            i = Dense(odim)(i)
        else:
            i = bn(i)
            i = relu(i)
            ident = i
            i = Dense(mdim)(i)
            i = bn(i)
            i = relu(i)
            i = Dense(odim)(i)
            ident = Dense(odim)(ident)
        out = merge([ident,i],mode='sum')
        return out
    return unit
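# --- illustration (not part of the original gist) ---
# how resdense() is meant to be composed, in the same Keras 1.x functional
# style as the networks below (layer sizes here are made up):
#   x = Input(shape=(24,))
#   h = Dense(128)(x)
#   h = resdense(128,128)(h)  # dims match: identity skip connection
#   h = resdense(128,64)(h)   # dims differ: the skip goes through a Dense projection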
def softmax(x):
    """Compute softmax values for each set of scores in x."""
    ex = np.exp(x)
    return ex / np.sum(ex, axis=0)
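# --- illustration (not part of the original gist) ---
# how the discrete branch of play() below turns a probability vector into an
# action index: cumulative sum, then take the first bin that exceeds a uniform
# random number.
def _sample_discrete_action_sketch(probabilities):
    csprob = np.cumsum(probabilities)
    return (csprob > np.random.rand()).argmax()
# e.g. _sample_discrete_action_sketch(softmax(np.array([1., 2., 3.])))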
class nnagent(object):
    def __init__(self,
                 observation_space,
                 action_space,
                 stack_factor=1,
                 discount_factor=.99, # gamma
                 optimizer=RMSprop(),
                 train_skip_every=1,
                 ):
        self.rpm = rpm(1000000) # 1M history
        self.render = True
        self.noise_source = one_fsq_noise()
        self.train_counter = 0
        self.train_skip_every = train_skip_every
        self.observation_stack_factor = stack_factor

        self.inputdims = observation_space.shape[0] * self.observation_stack_factor
        # assume observation_space is continuous

        self.is_continuous = True if isinstance(action_space,Box) else False

        if self.is_continuous: # if action space is continuous
            low = action_space.low
            high = action_space.high
            num_of_actions = action_space.shape[0]

            self.action_bias = high/2. + low/2.
            self.action_multiplier = high - self.action_bias
            # say high,low -> [2,7], then bias -> 4.5
            # mult = 2.5. then [-1,1] multiplies 2.5 + bias 4.5 -> [2,7]

            def clamper(actions):
                return np.clip(actions,a_max=action_space.high,a_min=action_space.low)
            self.clamper = clamper
        else:
            num_of_actions = action_space.n
            self.action_bias = .5
            self.action_multiplier = .5 # map (-1,1) into (0,1)

            def clamper(actions):
                return np.clip(actions,a_max=1.,a_min=0.)
            self.clamper = clamper

        self.outputdims = num_of_actions
        self.discount_factor = discount_factor
        self.optimizer = optimizer

        ids,ods = self.inputdims,self.outputdims
        self.actor, self.frozen_actor = self.create_actor_network(ids,ods)
        self.critic, self.frozen_critic = self.create_critic_network(ids,ods)

        print('inputdims:{}, outputdims:{}'.format(ids,ods))
        print('actor network:')
        self.actor.summary()
        print('critic network:')
        self.critic.summary()

        # target networks: identical copies of actor and critic
        self.actor_target,self.frozen_actor_target = self.create_actor_network(ids,ods)
        self.critic_target, self.frozen_critic_target = self.create_critic_network(ids,ods)

        self.replace_weights(tau=1.)

        self.create_q1_target_model()
        self.create_actor_trainer()
        self.create_critic_trainer()
    def create_actor_trainer(self):
        # now the dirty part: the actor trainer --------------------------------
        # an explanation of this part is written in the train() method
        s_given = Input(shape=(self.inputdims,))
        a1_maybe = self.actor(s_given)
        q1_maybe = self.frozen_critic([s_given,a1_maybe])
        # frozen weight version of critic. so we can train only the actor

        actor_trainer = Model(input=s_given,output=q1_maybe)

        # use negative of q1_maybe as loss (so we can maximize q by minimizing the loss)
        def neg_q1(y_true,y_pred):
            return - y_pred # neat!

        actor_trainer.compile(optimizer=self.optimizer,loss=neg_q1)
        self.actor_trainer = actor_trainer
        # dirty part ended -----------------------------------------------------
    # (gradually) replace target network weights with online network weights
    def _replace_weights(self,tau=0.001):
        theta_a,theta_c = self.actor.get_weights(),self.critic.get_weights()
        theta_a_targ,theta_c_targ = self.actor_target.get_weights(),self.critic_target.get_weights()

        # mixing factor tau : we gradually shift the weights...
        theta_a_targ = [theta_a[i]*tau + theta_a_targ[i]*(1-tau) for i in range(len(theta_a))]
        theta_c_targ = [theta_c[i]*tau + theta_c_targ[i]*(1-tau) for i in range(len(theta_c))]

        self.actor_target.set_weights(theta_a_targ)
        self.critic_target.set_weights(theta_c_targ)
    # the method above uses numpy; how can we flow it in tensorflow?
    def replace_weights(self,tau=0.001):
        if not hasattr(self,'wflow'):
            self.wflow = self.weights_flow()
        flow = self.wflow
        tau = np.array([tau],dtype='float32')
        flow([tau,0])
    def weights_flow(self):
        # define the weight replacing op
        theta_a,theta_c = self.actor.weights,self.critic.weights
        theta_a_targ,theta_c_targ = self.actor_target.weights,self.critic_target.weights

        tau_place = K.placeholder(shape=(1,))

        ops = []
        for i,w in enumerate(theta_a_targ):
            ops += [theta_a_targ[i].assign(theta_a[i]*tau_place + theta_a_targ[i]*(1-tau_place))]
        for i,w in enumerate(theta_c_targ):
            ops += [theta_c_targ[i].assign(theta_c[i]*tau_place + theta_c_targ[i]*(1-tau_place))]

        flow = K.function([tau_place],ops)
        return flow
    # a = actor(s) : predict actions given state
    def create_actor_network(self,inputdims,outputdims):
        inp = Input(shape=(inputdims,))
        i = inp
        i = Dense(128)(i)
        i = resdense(128,128)(i)
        # i = resdense(128,128)(i)
        i = relu(bn(i))
        i = Dense(outputdims)(i)

        if self.is_continuous:
            # map into (-1,1)
            i = Activation('tanh')(i)
            # map into action_space
            i = Lambda(lambda x:x * self.action_multiplier + self.action_bias)(i)
        else:
            # map into (0,1)
            i = Activation('softmax')(i)

        out = i
        model = Model(input=inp,output=out)

        # now we create a frozen_model that uses the same layers,
        # with weights frozen when trained.
        frozen_model = Model(input=inp,output=out)
        frozen_model.trainable = False
        return model,frozen_model
    # q = critic(s,a) : predict q given state and action
    def create_critic_network(self,inputdims,actiondims):
        inp = Input(shape=(inputdims,))
        act = Input(shape=(actiondims,))
        i = merge([inp,act],mode='concat')

        i = Dense(128)(i)
        i = resdense(128,128)(i)
        # i = resdense(128,128)(i)
        i = relu(bn(i))
        i = Dense(1)(i)

        out = i
        model = Model(input=[inp,act],output=out)

        # now we create a frozen_model that uses the same layers,
        # with weights frozen when trained.
        frozen_model = Model(input=[inp,act],output=out)
        frozen_model.trainable = False
        return model,frozen_model
    def create_q1_target_model(self):
        # this part is for performance optimization
        # for an explanation of this part, please check train()
        s2i = Input(shape=(self.inputdims,))
        a2i = self.frozen_actor_target(s2i)
        q2i = self.frozen_critic_target([s2i,a2i])

        r1i = Input(shape=(1,))
        isdonei = Input(shape=(1,))

        def calc_q1_target(x):
            [r1i,isdonei,q2i] = x
            return r1i + (1-isdonei) * self.discount_factor * q2i
        def calc_output_shape(input_shapes):
            return input_shapes[0]

        q1_target = merge([r1i,isdonei,q2i],mode=calc_q1_target,output_shape=calc_output_shape)

        q1_target_model = Model(input=[s2i,r1i,isdonei],output=q1_target)
        self.q1_target_model = q1_target_model
    def create_critic_trainer(self):
        # this part is also for performance optimization...
        qtm = self.q1_target_model
        qtm.trainable = False

        s1i = Input(shape=(self.inputdims,))
        s2i = Input(shape=(self.inputdims,))
        a1i = Input(shape=(self.outputdims,))
        r1i = Input(shape=(1,))
        isdonei = Input(shape=(1,))

        q1t = qtm([s2i,r1i,isdonei])
        crit = self.critic([s1i,a1i])

        def mse(x):
            return (x[0]-x[1])**2
        def calc_output_shape(input_shapes):
            return input_shapes[0] # shape of r1i

        loss = merge([q1t,crit],mode=mse,output_shape=calc_output_shape)

        def thru(y_true,y_pred):
            return y_pred

        model = Model(input=[s1i,a1i,r1i,isdonei,s2i],output=loss)
        model.compile(loss=thru,optimizer=self.optimizer)
        self.critic_trainer = model
    def train(self,verbose=1):
        memory = self.rpm
        critic,frozen_critic = self.critic,self.frozen_critic
        actor = self.actor
        batch_size = 64
        total_size = batch_size * self.train_skip_every
        epochs = 1

        self.train_counter+=1
        self.train_counter %= self.train_skip_every
        if self.train_counter != 0: # train only every few steps
            return

        if memory.size() > total_size:
            # if enough samples in memory:
            # sample a minibatch randomly from memory
            [s1,a1,r1,isdone,s2] = memory.sample_batch(total_size)
            # print(s1.shape,a1.shape,r1.shape,isdone.shape,s2.shape)

            if False: # the following is optimized away but kept for clarity.
                # a2_targ = actor_targ(s2) : what will you do in s2, Mr. old actor?
                a2 = self.actor_target.predict(s2)

                # q2_targ = critic_targ(s2,a2) : how good is action a2, Mr. old critic?
                q2 = self.critic_target.predict([s2,a2])

                # what if we combine the 2 above to improve performance?
                s2i = Input(shape=(self.inputdims,))
                a2i = self.actor_target(s2i)
                q2i = self.critic_target([s2i,a2i])

                # if a2 is q2-good, then what should q1 be?
                # use the Bellman equation! (recursive definition of q-values)
                # if not the last step of the episode:
                #     q1 = r1 + gamma * q2
                # else:
                #     q1 = r1
                q1_target = r1 + (1-isdone) * self.discount_factor * q2

                # but, what if we combine all of the above to improve performance?
                r1i = Input(shape=(1,))
                isdonei = Input(shape=(1,))
                def calc_q1_target(x):
                    [r1i,isdonei,q2i] = x
                    return r1i + (1-isdonei) * self.discount_factor * q2i
                def calc_output_shape(input_shapes):
                    return input_shapes[0]
                q1_target = merge([r1i,isdonei,q2i],mode=calc_q1_target,output_shape=calc_output_shape)
                q1_target_model = Model(input=[s2i,r1i,isdonei],output=q1_target)
            else:
                # q1_target_model is already built in create_q1_target_model()
                # q1_target = self.q1_target_model.predict([s2,r1,isdone])
                # all of the above was optimized away...
                critic_trainer = self.critic_trainer

            # critic.fit([s1,a1],
            #     q1_target,
            #     batch_size=batch_size,
            #     nb_epoch=epochs,
            #     verbose=verbose,
            #     shuffle=False
            #     )

            critic_trainer.fit([s1,a1,r1,isdone,s2],
                np.zeros((total_size,1)), # dummy target label
                batch_size=batch_size,
                nb_epoch=epochs,
                verbose=verbose,
                shuffle=False
                )

            # now the critic can predict a more accurate q given s and a,
            # thanks to the Bellman equation, and David Silver.

            # with a better critic, we can now improve our actor!

            if False: # the following part is optimized away. left here for explanation purposes.
                # a1_pred = actor(s1) : what will you do in s1, Mr. actor?
                a1_maybe = actor.predict(s1)

                # this action may not be optimal. now let's ask the critic:
                # what do you think of Mr. actor's action on s1, Mr. better critic?
                q1_maybe = critic.predict([s1,a1_maybe])

                # what should we do to the actor, to increase q1_maybe?
                # well, calculate the gradient of q1_maybe w.r.t. the actor's parameters,
                # then do gradient ascent.

                # so let's build a model that trains the actor to output higher q1_maybe values
                s_given = Input(shape=(self.inputdims,))
                a1_maybe = actor(s_given)
                q1_maybe = frozen_critic([s_given,a1_maybe])
                # frozen weight version of critic. so we only train the actor

                actor_trainer = Model(input=s_given,output=q1_maybe)

                # use negative of q1_maybe as loss (so we can maximize q by minimizing the loss)
                def neg_q1(y_true,y_pred):
                    return - y_pred # neat!

                actor_trainer.compile(optimizer=self.optimizer,loss=neg_q1)
            else:
                # the actor_trainer is already initialized in create_actor_trainer()
                actor_trainer = self.actor_trainer

            actor_trainer.fit(s1,
                np.zeros((total_size,1)), # dummy target label
                batch_size=batch_size,
                nb_epoch=epochs,
                verbose=verbose,
                shuffle=False
                )

            # now both the actor and the critic have improved.
            self.replace_weights(tau=0.001 * self.train_skip_every)

        else:
            pass
            # print('# not enough samples, not training')
    def feed_one(self,tup):
        self.rpm.add(tup)
    # gymnastics
    def play(self,env,max_steps=-1,realtime=False,noise_level=0.): # play 1 episode
        timer = time.time()
        max_steps = max_steps if max_steps > 0 else 50000
        steps = 0
        total_reward = 0
        render = self.render

        # stack a little history to ensure the markov property
        # LSTM will definitely be used here in the future...
        # global que # python 2 quirk
        self.que = np.zeros((self.inputdims,),dtype='float32') # rolling buffer of recent observations

        def quein(observation):
            # global que # python 2 quirk
            length = len(observation)
            self.que[0:-length] = self.que[length:] # left shift
            self.que[-length:] = np.array(observation)

        def quecopy():
            return self.que.copy()

        # what the agent sees as state is a stack of history observations.
        observation = env.reset()
        quein(observation) # quein o1

        while steps <= max_steps:
            steps +=1

            thisque = quecopy() # s1
            action = self.act(thisque) # a1

            if self.is_continuous:
                # add noise to our actions, since our policy is deterministic by nature
                exploration_noise = self.noise_source.one((self.outputdims,),noise_level)
                exploration_noise *= self.action_multiplier
                # print(exploration_noise,exploration_noise.shape)
                action += exploration_noise
                action = self.clamper(action)
                action_out = action
            else:
                # discretize our actions
                probabilities = action
                csprob = np.cumsum(probabilities)
                action_index = (csprob > np.random.rand()).argmax()
                action_out = action_index

            # o2, r1,
            observation, reward, done, _info = env.step(action_out)

            # d1
            isdone = 1 if done else 0
            total_reward += reward

            quein(observation) # quein o2
            nextque = quecopy() # s2

            # feed into replay memory
            self.feed_one((thisque,action,reward,isdone,nextque)) # s1,a1,r1,isdone,s2

            if render and (steps%10==0 or realtime==True): env.render()
            if done:
                break

            verbose = 2 if steps==1 else 0
            self.train(verbose=verbose)

        # print('episode done in',steps,'steps',time.time()-timer,'second total reward',total_reward)
        totaltime = time.time()-timer
        print('episode done in {} steps in {:.2f} sec, {:.4f} sec/step, got reward :{:.2f}'.format(
            steps,totaltime,totaltime/steps,total_reward
        ))
        return
    # one step of action, given observation
    def act(self,observation):
        actor,critic = self.actor,self.critic
        obs = np.reshape(observation,(1,len(observation)))
        actions = actor.predict(obs)
        q = critic.predict([obs,actions])[0]

        # shift and scale the curves below for display in loggraph() only
        disp_actions = (actions[0]-self.action_bias) / self.action_multiplier
        disp_actions = disp_actions * 5 + np.arange(self.outputdims) * 12.0 + 30

        noise = self.noise_source.ask() * 5 - np.arange(self.outputdims) * 12.0 - 30

        self.loggraph(np.hstack([disp_actions,noise,q]))
        return actions[0]
    def loggraph(self,waves):
        pass
        # if not hasattr(self,'wavegraph'):
        #     def rn():
        #         r = np.random.uniform()
        #         return 0.2+r*0.4
        #     colors = []
        #     for i in range(len(waves)-1):
        #         color = [rn(),rn(),rn()]
        #         colors.append(color)
        #     colors.append([0.2,0.5,0.9])
        #     self.wavegraph = wavegraph(len(waves),'actions/noises/Q',np.array(colors))
        # wg = self.wavegraph
        # wg.one(waves.reshape((-1,)))
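# --- illustration (not part of the original gist) ---
# the 'frozen model' trick used by create_actor_network()/create_critic_network()
# above: two Models built over the same layers share weights, and flagging one
# as trainable = False lets the actor trainer backpropagate through the critic
# without updating it. a minimal sketch with made-up layer sizes:
def _frozen_model_sketch():
    s = Input(shape=(3,))
    h = Dense(2)(s)
    live = Model(input=s, output=h)    # this copy gets trained
    frozen = Model(input=s, output=h)  # same layers, same weights...
    frozen.trainable = False           # ...but excluded from weight updates
    return live, frozen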
class playground(object):
    def __init__(self,envname):
        self.envname = envname
        env = gym.make(envname)
        self.env = env
        self.monpath = './experiment-'+self.envname

    def wrap(self):
        from gym import wrappers
        self.env = wrappers.Monitor(self.env,self.monpath,force=True)

    def up(self):
        self.env.close()
        gym.upload(self.monpath, api_key='sk_ge0PoVXsS6C5ojZ9amTkSA')
# p = playground('LunarLanderContinuous-v2')
p = playground('Pendulum-v0')
# p = playground('MountainCar-v0')
# p = playground('BipedalWalker-v2')
e = p.env
agent = nnagent(
    e.observation_space,
    e.action_space,
    discount_factor=.99,
    stack_factor=1,
    optimizer=RMSprop(lr=1e-4),
    train_skip_every=50,
)
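# note (not in the original gist): with train_skip_every=50, train() only does
# real work once every 50 environment steps, and then fits a 64*50-transition
# batch sampled from the replay memory -- hence the title of this gist.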
def r(ep):
    agent.render = False
    e = p.env
    noise_level = 1.
    for i in range(ep):
        noise_level *= .95
        noise_level = max(1e-8,noise_level - 1e-4)
        print('ep',i,'/',ep,'noise_level',noise_level)
        agent.play(e,max_steps=-1,noise_level=noise_level)
def test():
    e = p.env
    agent.render = True
    agent.play(e,realtime=True,max_steps=-1,noise_level=1e-11)
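# --- illustration (not part of the original gist) ---
# the exploration schedule used by r() above: multiplicative decay plus a small
# linear term, clipped to a tiny positive floor so the noise never reaches zero.
def _noise_schedule_sketch(episodes=5, noise_level=1.):
    levels = []
    for _ in range(episodes):
        noise_level *= .95
        noise_level = max(1e-8, noise_level - 1e-4)
        levels.append(noise_level)
    return levels  # e.g. [0.9499, 0.9023, ...]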
# ---------------------------------------------------------------------------
# noise.py (imported above as `from noise import one_fsq_noise`)
# ---------------------------------------------------------------------------
import numpy as np

class one_fsq_noise(object):
    def __init__(self):
        self.buffer = np.array([0.])

    def one(self,size,noise_level=1.):
        # draw one gaussian
        g = np.random.normal(loc=0.,scale=noise_level,size=size)
        if self.buffer.shape != size:
            self.buffer = np.zeros(size,dtype='float32')
        self.buffer += g
        # high pass a little
        self.buffer *= .98
        return self.buffer.copy()

    def ask(self):
        return self.buffer.copy()

# 1/f^2 noise: http://hal.in2p3.fr/in2p3-00024797/document
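# --- illustration (not part of the original gist) ---
# quick demo of the process above: integrating white gaussian noise with a 0.98
# decay gives slowly drifting (roughly 1/f^2) exploration noise rather than
# independent jumps from step to step.
if __name__ == '__main__':
    n = one_fsq_noise()
    samples = [float(n.one((1,), noise_level=1.)[0]) for _ in range(5)]
    print(samples)  # successive values drift instead of resetting each step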
# ---------------------------------------------------------------------------
# rpm.py (imported above as `from rpm import rpm`)
# ---------------------------------------------------------------------------
from collections import deque
import numpy as np
import random

# replay buffer per http://pemami4911.github.io/blog/2016/08/21/ddpg-rl.html
class rpm(object):
    # replay memory
    def __init__(self,buffer_size):
        self.buffer_size = buffer_size
        self.count = 0
        self.buffer = deque()

    def add(self, tup):
        experience = tup
        if self.count < self.buffer_size:
            self.buffer.append(experience)
            self.count += 1
        else:
            self.buffer.popleft()
            self.buffer.append(experience)

    def size(self):
        return self.count

    def sample_batch(self, batch_size):
        '''
        batch_size specifies the number of experiences to add
        to the batch. If the replay buffer has less than batch_size
        elements, simply return all of the elements within the buffer.
        Generally, you'll want to wait until the buffer has at least
        batch_size elements before beginning to sample from it.
        '''
        batch = []

        if self.count < batch_size:
            batch = random.sample(self.buffer, self.count)
        else:
            batch = random.sample(self.buffer, batch_size)

        # transpose: a list of experience tuples becomes one array per field
        item_count = len(batch[0])
        res = []
        for i in range(item_count):
            k = np.array([item[i] for item in batch])
            if len(k.shape)==1: k = k.reshape(k.shape+(1,))
            res.append(k)
        return res
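# --- illustration (not part of the original gist) ---
# tiny usage sketch of rpm: each experience tuple becomes one row per field,
# and scalar fields (reward, isdone) come back with shape (batch, 1).
if __name__ == '__main__':
    m = rpm(100)
    for _ in range(10):
        m.add((np.zeros(3), np.zeros(1), 0.0, 0, np.zeros(3)))
    s1, a1, r1, isdone, s2 = m.sample_batch(4)
    print(s1.shape, r1.shape, isdone.shape)  # (4, 3) (4, 1) (4, 1)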