import chainer
from chainer import cuda, Function, gradient_check, Variable, optimizers, serializers, utils, flag
from chainer import Link, Chain, ChainList
import chainer.functions as F
import chainer.links as L

import numpy as np
import collections


class DeepQModel(Chain):
    """A two-layer MLP that maps a state vector to one Q-value per action."""

    def __init__(self, isz, osz):
        super(DeepQModel, self).__init__(
            mid=L.Linear(isz, 64),   # the first (hidden) linear layer
            out=L.Linear(64, osz),   # the feed-forward output layer
        )

    def reset_state(self):
        # Linear layers are stateless, so there is nothing to reset;
        # kept as a no-op for compatibility with recurrent variants.
        pass

    def __call__(self, x):
        h = F.relu(self.mid(x))
        y = self.out(h)
        return y


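# Helper sketch of what tv() below provides: it wraps a NumPy array as a
# float32 Chainer Variable (Chainer 1.x API). With the default v=flag.OFF the
# Variable takes part in backprop; passing v=flag.ON (used later for the
# target-network forward pass) marks it volatile, so no computational graph
# is built and only its .data is meant to be read.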
def tv(x, v=flag.OFF):
    return Variable(x.astype('float32'), volatile=v)


class DeepQ_Descrete():

    def __init__(self, isz, n_a):

        self.s_buff = collections.deque([], 1)  # number of recent states treated as one big MDP state

        self.n_a = n_a        # number of actions
        self.isz = isz        # size of the input vector
        self.training = True  # is training?

        # online network Q and target network Qp, fed the concatenated state buffer
        self.Q = DeepQModel(isz * self.s_buff.maxlen, n_a)
        self.Qp = DeepQModel(isz * self.s_buff.maxlen, n_a)

        self.Qp.copyparams(self.Q)

        self.pr = 0.0  # probability of choosing a random action

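        # Note on the loss defined below: given a batch of states X, targets Y
        # and the actions A that were actually taken, it evaluates Q(X), picks
        # the Q-value of the taken action in each row with F.select_item, and
        # returns the mean squared error against Y (the targets built in train()).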
        class MSE(Chain):
            def __init__(self):
                super(MSE, self).__init__()

            def __call__(self, X, Y, A, Q):
                P = Q(X)
                P = F.select_item(P, Variable(np.array(A).astype('int32')))
                return F.mean_squared_error(Y, P)

        if self.training:
            self.d = 0.999           # discount factor
            self.idx = 0             # step counter
            self.upd = 0             # update counter
            self.batch = 16          # batch size (games per batch)
            self.batches = 64        # number of update batches per call to train()
            self.random_exp = 128.0  # the larger the value, the more random exploration is done

            self.pr = 1.0  # probability of choosing a random action

            self.loss = MSE()
            self.opt = optimizers.Adam(alpha=0.001)
            self.opt.setup(self.Q)
            # self.opt.add_hook(chainer.optimizer.GradientClipping(1.0))

            self.Q.zerograds()
            self.r_buff = collections.deque([], 5000)  # number of games to keep in the replay buffer
            self.r = []  # replay array for one game; kept separate in case of multithreading later

    def reset(self):
        # if training, flush the finished game into the replay buffer,
        # then zero the state buffer

        if self.training:
            # add the game array to the replay buffer
            if self.r:
                self.r_buff.append(self.r)
            # empty the per-game replay array
            self.r = []

        # zero the input (state) buffer
        for _ in range(self.s_buff.maxlen):
            self.s_buff.append(np.zeros((1, self.isz)))

    def get_mdp_obs(self, obs):
        # add observation to the buffer, concatenate the buffer into one vector
        self.s_buff.append(obs)
        cc = np.column_stack(self.s_buff)

        return cc

    def next(self, obs):

        # add observation to the buffer
        mdp_obs = self.get_mdp_obs(obs)

        x = Variable(mdp_obs.astype('float32'))
        pa = self.Q(x)  # Q-values for each action

        # choose the greedy action, or a random one with probability self.pr
        a = np.argmax(pa.data) if np.random.rand() > self.pr else np.random.randint(0, self.n_a)

        if self.training:
            self.r.append([mdp_obs, a])  # save observation and action
            self.idx += 1

        return a

    def feedback(self, reward):
        # associate feedback with the most recent action
        self.r[-1].append(reward)

    def train(self, par=None):

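        # Exploration schedule: pr starts at 1.0 and decays toward the 0.02
        # floor as the step counter grows. With random_exp = 128 it stays at
        # 1.0 for roughly the first 130 steps, is about 0.52 after 255 steps
        # and about 0.12 after ~1280 steps.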
        self.pr = min(1.0, 0.02 + self.random_exp / (self.idx + 1.0))
        self.upd = self.upd + 1

        # periodically sync the target network Qp with the online network Q
        if self.upd % 30 == 0:
            self.Qp.copyparams(self.Q)

        # save the current game buffer, if any
        if self.r:
            self.r_buff.append(self.r)
            self.r = []

        # total reward per game, and game indices sorted by it; SI is not used below
        R = [sum([r for x, a, r in reversed(game)]) for game in self.r_buff]
        R = zip(xrange(len(R)), R)
        R.sort(key=lambda p: p[1])
        SI = [r[0] for r in R]

        tot = 0

        # generate update batches from the replay buffer
        for repeat in range(self.batches):

            X = []
            A = []
            Y = []

            ln = len(self.r_buff)

            # sample a batch of games without replacement
            I = np.random.choice(ln, min(ln, self.batch), replace=False)

            XQ = []

            # collect every state of the sampled games, most recent first
            for i in I:
                game = self.r_buff[i]
                for x, a, r in reversed(game):
                    XQ.append(x)

            # evaluate the target network on all collected states at once;
            # volatile=ON since only the numeric values are needed
            XQ = tv(np.row_stack(XQ), v=flag.ON)

            Qmax = F.max(self.Qp(XQ), axis=1)
            Qmax = Qmax.data

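            # Q-learning targets are built by walking each sampled game
            # backwards: the last transition's target is just its reward,
            # and every earlier transition's target is r + d * max_a Qp(s'),
            # where s' is the state processed one step before it in this
            # reversed walk. The walk uses the same order as XQ above, so
            # Qmax[idx] lines up with the state appended at position idx.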
            idx = 0

            for i in I:
                game = self.r_buff[i]
                q_max = 0.0

                for x, a, r in reversed(game):

                    y = q_max + r

                    X.append(x)
                    Y.append(y)
                    A.append(a)

                    # update q_max with the discounted value of the state just
                    # processed; it becomes the bootstrap term for the previous
                    # (earlier) transition in the next iteration
                    q_max = self.d * Qmax[idx]
                    idx += 1

            X = tv(np.row_stack(X))
            Y = tv(np.squeeze(np.row_stack(Y)))

            self.Q.zerograds()
            loss = self.loss(X, Y, A, self.Q)

            # update the parameters of the agent
            loss.backward()
            self.opt.update()

            tot += self.batch

            if tot > len(self.r_buff):
                pass


import gym

buff = collections.deque([], 100)  # running window of the last 100 episode returns
env = gym.make('Pendulum-v0')
env.monitor.start("pendulum-home", force=True)

MAX_STEPS = env.spec.timestep_limit
na = 11  # number of discrete actions the continuous torque is split into
actor = DeepQ_Descrete(env.observation_space.shape[0], na)

for episode in xrange(3000):

    actor.reset()
    observation = env.reset()
    buff.append(0)

    for t in xrange(MAX_STEPS):

        action = actor.next([observation])
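        # map the discrete action index to a torque in [-2, 2]; with na = 11,
        # actions 0..10 become torques -2.0, -1.6, ..., 1.6, 2.0 (steps of 0.4)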
        m = (action - na/2) * (4.0 / (na - 1))
        observation, reward, done, info = env.step(np.array([m]))
        buff[-1] += reward

        # env.render()

        actor.feedback(reward)

        if done:
            break

    actor.train()
    print buff[-1], "avg. reward:", np.mean(buff), "iter:", episode

env.monitor.close()