Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
"""classic Acrobot task"""
##Updated by Rajiv Shah to get the pendulum to balance for an extended time
from gym import core, spaces
from gym.utils import seeding
import numpy as np
from numpy import sin, cos, pi
import time
__copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy"
__credits__ = ["Alborz Geramifard", "Robert H. Klein", "Christoph Dann",
"William Dabney", "Jonathan P. How"]
__license__ = "BSD 3-Clause"
__author__ = "Christoph Dann <cdann@cdann.de>"
# SOURCE:
# https://github.com/rlpy/rlpy/blob/master/rlpy/Domains/Acrobot.py
class AcrobotEnv(core.Env):
"""
Acrobot is a 2-link pendulum with only the second joint actuated
Intitially, both links point downwards. The goal is to swing the
end-effector at a height at least the length of one link above the base.
Both links can swing freely and can pass by each other, i.e., they don't
collide when they have the same angle.
**STATE:**
The state consists of the two rotational joint angles and their velocities
[theta1 theta2 thetaDot1 thetaDot2]. An angle of 0 corresponds to corresponds
to the respective link pointing downwards (angles are in world coordinates).
**ACTIONS:**
The action is either applying +1, 0 or -1 torque on the joint between
the two pendulum links.
.. note::
The dynamics equations were missing some terms in the NIPS paper which
are present in the book. R. Sutton confirmed in personal correspondance
that the experimental results shown in the paper and the book were
generated with the equations shown in the book.
However, there is the option to run the domain with the paper equations
by setting book_or_nips = 'nips'
**REFERENCE:**
.. seealso::
R. Sutton: Generalization in Reinforcement Learning:
Successful Examples Using Sparse Coarse Coding (NIPS 1996)
.. seealso::
R. Sutton and A. G. Barto:
Reinforcement learning: An introduction.
Cambridge: MIT press, 1998.
.. warning::
This version of the domain uses the Runge-Kutta method for integrating
the system dynamics and is more realistic, but also considerably harder
than the original version which employs Euler integration,
see the AcrobotLegacy class.
"""
metadata = {
'render.modes': ['human', 'rgb_array'],
'video.frames_per_second' : 15
}
dt = 0.2
LINK_LENGTH_1 = 1. # [m] #was 1
LINK_LENGTH_2 = 1. # [m]
LINK_MASS_1 = 1. #: [kg] mass of link 1
LINK_MASS_2 = 1. #: [kg] mass of link 2
LINK_COM_POS_1 = 0.5 #: [m] position of the center of mass of link 1
LINK_COM_POS_2 = 0.5 #: [m] position of the center of mass of link 2
LINK_MOI = 1. #: moments of inertia for both links
MAX_VEL_1 = 4/9 * np.pi ##was 4
MAX_VEL_2 = 9/9 * np.pi #was 9
AVAIL_TORQUE = [-1., 0., +1]
torque_noise_max = 0.
#: use dynamics equations from the nips paper or the book
book_or_nips = "book"
action_arrow = None
domain_fig = None
actions_num = 3
def __init__(self):
self.viewer = None
high = np.array([1.0, 1.0, 1.0, 1.0, self.MAX_VEL_1, self.MAX_VEL_2])
low = -high
self.observation_space = spaces.Box(low, high)
self.action_space = spaces.Discrete(3)
self._seed()
self.steps_beyond_done = 0
self.rewardx = -1
def _seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]
def _reset(self):
self.state = self.np_random.uniform(low=-0.1, high=0.1, size=(4,))
return self._get_ob()
def _step(self, a):
s = self.state
torque = self.AVAIL_TORQUE[a]
if self.steps_beyond_done == None:
self.steps_beyond_done == 0
#self.steps_beyond_done == 1
# Add noise to the force action
if self.torque_noise_max > 0:
torque += self.np_random.uniform(-self.torque_noise_max, self.torque_noise_max)
# Now, augment the state with our force action so it can be passed to
# _dsdt
s_augmented = np.append(s, torque)
ns = rk4(self._dsdt, s_augmented, [0, self.dt])
# only care about final timestep of integration returned by integrator
ns = ns[-1]
ns = ns[:4] # omit action
# ODEINT IS TOO SLOW!
# ns_continuous = integrate.odeint(self._dsdt, self.s_continuous, [0, self.dt])
# self.s_continuous = ns_continuous[-1] # We only care about the state
# at the ''final timestep'', self.dt
ns[0] = wrap(ns[0], -pi, pi)
ns[1] = wrap(ns[1], -pi, pi)
ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1)
ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2)
self.state = ns
#terminal = self._terminal()
#reward = bool(-np.cos(s[0]) - np.cos(s[1] + s[0])
#if reward > 1:
# terminal
self.rewardx = (-np.cos(s[0]) - np.cos(s[1] + s[0])) ##Swung height is calculataed
if self.rewardx < .5:
reward = -1.
self.steps_beyond_done = 0
if (self.rewardx > .5 and self.rewardx < .8):
reward = -0.8
self.steps_beyond_done = 0
if self.rewardx > .8:
reward = -0.6
if self.rewardx < .9:
self.steps_beyond_done = 0
if self.rewardx > 1:
reward = -0.4
#print (self.rewardx)
self.steps_beyond_done += 1
#print (self.steps_beyond_done)
if self.steps_beyond_done > 4:
reward = -0.2
if self.steps_beyond_done > 8:
reward = -0.1
#print (self.rewardx)
#print (self.steps_beyond_done)
if self.steps_beyond_done > 20:
reward = -0.05
if self.steps_beyond_done > 60:
reward = 0.
#print (self.rewardx)
print (self.steps_beyond_done)
terminal = self._terminal()
#reward = -1. if not terminal else 0.
return (self._get_ob(), reward, terminal, {})
def _get_ob(self):
s = self.state
return np.array([cos(s[0]), np.sin(s[0]), cos(s[1]), sin(s[1]), s[2], s[3]])
def _terminal(self):
#s = self.state
sreward = self.rewardx
sd = self.steps_beyond_done ##last change
#reward1 = (-np.cos(s[0]) - np.cos(s[1] + s[0]))
#if self.steps_beyond_done == 2:
# print ("terminal - 2")
#return bool(-np.cos(s[0]) - np.cos(s[1] + s[0]) > 1.)
return bool(sd > 60)
def _dsdt(self, s_augmented, t):
m1 = self.LINK_MASS_1
m2 = self.LINK_MASS_2
l1 = self.LINK_LENGTH_1
lc1 = self.LINK_COM_POS_1
lc2 = self.LINK_COM_POS_2
I1 = self.LINK_MOI
I2 = self.LINK_MOI
g = 0 #was 9.8
a = s_augmented[-1]
s = s_augmented[:-1]
theta1 = s[0]
theta2 = s[1]
dtheta1 = s[2]
dtheta2 = s[3]
d1 = m1 * lc1 ** 2 + m2 * \
(l1 ** 2 + lc2 ** 2 + 2 * l1 * lc2 * np.cos(theta2)) + I1 + I2
d2 = m2 * (lc2 ** 2 + l1 * lc2 * np.cos(theta2)) + I2
phi2 = m2 * lc2 * g * np.cos(theta1 + theta2 - np.pi / 2.)
phi1 = - m2 * l1 * lc2 * dtheta2 ** 2 * np.sin(theta2) \
- 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * np.sin(theta2) \
+ (m1 * lc1 + m2 * l1) * g * np.cos(theta1 - np.pi / 2) + phi2
if self.book_or_nips == "nips":
# the following line is consistent with the description in the
# paper
ddtheta2 = (a + d2 / d1 * phi1 - phi2) / \
(m2 * lc2 ** 2 + I2 - d2 ** 2 / d1)
else:
# the following line is consistent with the java implementation and the
# book
ddtheta2 = (a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1 ** 2 * np.sin(theta2) - phi2) \
/ (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1)
ddtheta1 = -(d2 * ddtheta2 + phi1) / d1
return (dtheta1, dtheta2, ddtheta1, ddtheta2, 0.)
def _render(self, mode='human', close=False):
if close:
if self.viewer is not None:
self.viewer.close()
self.viewer = None
return
from gym.envs.classic_control import rendering
s = self.state
if self.viewer is None:
self.viewer = rendering.Viewer(500,500)
self.viewer.set_bounds(-2.2,2.2,-2.2,2.2)
p1 = [-self.LINK_LENGTH_1 *
np.cos(s[0]), self.LINK_LENGTH_1 * np.sin(s[0])]
p2 = [p1[0] - self.LINK_LENGTH_2 * np.cos(s[0] + s[1]),
p1[1] + self.LINK_LENGTH_2 * np.sin(s[0] + s[1])]
xys = np.array([[0,0], p1, p2])[:,::-1]
thetas = [s[0]-np.pi/2, s[0]+s[1]-np.pi/2]
self.viewer.draw_line((-2.2, 0), (2.2, 0))
for ((x,y),th) in zip(xys, thetas):
l,r,t,b = 0, 1, .1, -.1
jtransform = rendering.Transform(rotation=th, translation=(x,y))
link = self.viewer.draw_polygon([(l,b), (l,t), (r,t), (r,b)])
link.add_attr(jtransform)
link.set_color(0,.8, .8)
circ = self.viewer.draw_circle(.1)
circ.set_color(.8, .8, 0)
circ.add_attr(jtransform)
return self.viewer.render(return_rgb_array = mode=='rgb_array')
def wrap(x, m, M):
"""
:param x: a scalar
:param m: minimum possible value in range
:param M: maximum possible value in range
Wraps ``x`` so m <= x <= M; but unlike ``bound()`` which
truncates, ``wrap()`` wraps x around the coordinate system defined by m,M.\n
For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.
"""
diff = M - m
while x > M:
x = x - diff
while x < m:
x = x + diff
return x
def bound(x, m, M=None):
"""
:param x: scalar
Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR*
have m as length 2 vector, bound(x,m, <IGNORED>) returns m[0] <= x <= m[1].
"""
if M is None:
M = m[1]
m = m[0]
# bound x between min (m) and Max (M)
return min(max(x, m), M)
def rk4(derivs, y0, t, *args, **kwargs):
"""
Integrate 1D or ND system of ODEs using 4-th order Runge-Kutta.
This is a toy implementation which may be useful if you find
yourself stranded on a system w/o scipy. Otherwise use
:func:`scipy.integrate`.
*y0*
initial state vector
*t*
sample times
*derivs*
returns the derivative of the system and has the
signature ``dy = derivs(yi, ti)``
*args*
additional arguments passed to the derivative function
*kwargs*
additional keyword arguments passed to the derivative function
Example 1 ::
## 2D system
def derivs6(x,t):
d1 = x[0] + 2*x[1]
d2 = -3*x[0] + 4*x[1]
return (d1, d2)
dt = 0.0005
t = arange(0.0, 2.0, dt)
y0 = (1,2)
yout = rk4(derivs6, y0, t)
Example 2::
## 1D system
alpha = 2
def derivs(x,t):
return -alpha*x + exp(-t)
y0 = 1
yout = rk4(derivs, y0, t)
If you have access to scipy, you should probably be using the
scipy.integrate tools rather than this function.
"""
try:
Ny = len(y0)
except TypeError:
yout = np.zeros((len(t),), np.float_)
else:
yout = np.zeros((len(t), Ny), np.float_)
yout[0] = y0
i = 0
for i in np.arange(len(t) - 1):
thist = t[i]
dt = t[i + 1] - thist
dt2 = dt / 2.0
y0 = yout[i]
k1 = np.asarray(derivs(y0, thist, *args, **kwargs))
k2 = np.asarray(derivs(y0 + dt2 * k1, thist + dt2, *args, **kwargs))
k3 = np.asarray(derivs(y0 + dt2 * k2, thist + dt2, *args, **kwargs))
k4 = np.asarray(derivs(y0 + dt * k3, thist + dt, *args, **kwargs))
yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4)
return yout
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment