#!/usr/bin/env python2.7
import numpy as np
import gym


def value(observation):
    """
    This value function is just the (negative) Lagrangian:

        value(a, ȧ) = -L(u, u̇, a, ȧ)

    Here, 'a' is our angular coordinate and 'u' is the longitudinal
    coordinate, i.e.

        (u, u̇, a, ȧ) = observation

    The Lagrangian can be approximated by a harmonic oscillator:

        L(u, u̇, a, ȧ) ≈ ȧ^2 - m * a^2

    The parameter 'm' must be positive and large enough to confine the
    dynamics to small values of 'ȧ' and 'a'. In other words, the harmonic
    well must be sufficiently steep. The precise value doesn't matter too
    much; that is, it doesn't require fine-tuning. We'll pick m = 1000.

    We can make this simplifying assumption for the following two reasons:

    1) The angular potential is approximated by:

           V(a) ∝ -cos(a) = -1 + 1/2 * a^2 + O(a^4)

       Also, we ignore any additive constants in L(u, u̇, a, ȧ), because
       they don't contribute to the dynamics.

    2) We could (or should) really add a potential term for u, e.g. u^2,
       which would prevent the cart from moving off the screen. As it turns
       out, though, this doesn't happen, because the average movement in the
       longitudinal direction (i.e. the average u̇) remains small as long as
       the actions 0 (move left) and 1 (move right) are more or less
       balanced.
    """
    u, u_dot, a, a_dot = observation
    return -a_dot ** 2 + 1000 * a ** 2
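

# A quick numerical sanity check of the small-angle expansion used in the
# docstring above, -cos(a) ≈ -1 + 1/2 * a^2 + O(a^4). This check is an
# illustration added for clarity and is not part of the original controller;
# the tolerance 1e-5 is an arbitrary but sufficient bound for |a| <= 0.1,
# where the O(a^4) remainder is at most roughly a^4 / 24 ≈ 4.2e-6.
for _a in (0.01, 0.05, 0.1):
    assert abs(-np.cos(_a) - (-1.0 + 0.5 * _a ** 2)) < 1e-5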


def predict_new_observation(observation, action):
    """
    This is how we model the transition to the new state, i.e.

        (state, action) ↦ state_new

    We assume that we take time steps that are small enough that it
    suffices to model the transitions linearly, i.e. let

        shift = state_new - state_old

    then we assume that 'shift' is well approximated by:

        shift ≈ w * action + b

    Returns the new (shifted) state.
    """
    action = 2 * action - 1  # center around the origin
    shift = w * action + b
    return observation + shift
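

# An explanatory note (added for clarity, not part of the original gist):
# the SGD update in the main loop below follows from the squared prediction
# error of this linear transition model,
#
#     E = 1/2 * ||(w * â + b) - shift||^2,    where â = 2 * action - 1,
#
# whose gradients are dE/dw = error * â and dE/db = error, with
# error = (w * â + b) - shift. This is exactly what the updates
# 'w -= alpha * error * (2 * action - 1)' and 'b -= alpha * error' implement.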


def Q(observation, action):
    """One-step lookahead: the value of the predicted next state."""
    observation = predict_new_observation(observation, action)
    return value(observation)


# learning rate
alpha = 0.5
alpha_annealing = 0.9

# initialize transition-model weights
w = np.random.normal(scale=1e-4, size=4)
b = np.random.normal(scale=1e-4, size=4)

# number of time steps and episodes
t_max = 200
n_episodes = 200

# create environment
env = gym.make('CartPole-v0')
env = gym.wrappers.Monitor(env, 'experiments/CartPole-v0', force=True)

for episode in range(1, n_episodes + 1):

    # initialize environment
    observation = env.reset()
    action = env.action_space.sample()

    for t in range(1, t_max + 1):

        # pick optimal action
        action = np.argmax([Q(observation, 0), Q(observation, 1)])

        # before we get the new state observation, let's predict it first
        observation_predicted = predict_new_observation(observation, action)

        # renew state observation
        observation, _, done, _ = env.step(action)

        # update transition-model weights (simple SGD)
        error = observation_predicted - observation
        alpha *= alpha_annealing
        w -= alpha * error * (2 * action - 1)
        b -= alpha * error

        if done:
            if t < t_max:
                raise RuntimeError("Failed after {:d} time steps.".format(t))
            break

# wrap up
env.close()
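

# Optional post-run inspection (an illustrative sketch added here, not part
# of the original gist): the Monitor wrapper writes per-episode statistics
# as JSON files into the output directory. The exact file names are a gym
# implementation detail, so we glob for '*.stats.json' rather than assuming
# a fixed name; 'episode_lengths' is assumed to be the key used by gym's
# stats recorder.
import glob
import json

for path in glob.glob('experiments/CartPole-v0/*.stats.json'):
    with open(path) as f:
        stats = json.load(f)
    print('{}: first episode lengths: {}'.format(
        path, stats.get('episode_lengths', [])[:10]))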