Cart Pole Balancing
"""
Combination of Sutton's approach dividing space into boxes with simple
TD-learning algorithm (see basic_rl.py somewhere on gym portal).
Some simulation paramteres are hardcoded and learner is not universal.
"""
import numpy as np
import gym

# 50 degrees expressed in radians, used to discretize the pole's angular velocity
FIFTY_DEGREES_IN_RADIANS = 0.87266


class CartPoleLearner:

    def __init__(self, env):
        self.env = env

    def encode_state(self, state):
        """
        Converts a raw continuous state into one of the discrete states
        (see https://webdocs.cs.ualberta.ca/~sutton/book/code/pole.c).

        Args:
            state (list): A raw state, i.e. a list of x, x_dot, theta and theta_dot.

        Returns:
            box (int): A discrete state (box index).
        """
        x, x_dot, theta, theta_dot = state
        env = self.env
        x_limit, theta_limit = env.x_threshold, env.theta_threshold_radians
        half_theta_limit = theta_limit/2
        one_twelfth_theta_limit = theta_limit/12
        cart_in_limits = -x_limit < x < x_limit
        pole_in_limits = -theta_limit < theta < theta_limit
        if not cart_in_limits or not pole_in_limits:
            # the pole has fallen or the cart left the track: failure state
            return 0
        box = (1 if x < -0.8 else
               2 if x < 0.8 else
               3)
        if x_dot < -0.5:
            pass
        elif x_dot < 0.5:
            box += 3
        else:
            box += 6
        if theta < -half_theta_limit:
            pass
        elif theta < -one_twelfth_theta_limit:
            box += 9
        elif theta < 0:
            box += 18
        elif theta < one_twelfth_theta_limit:
            box += 27
        elif theta < half_theta_limit:
            box += 36
        else:
            box += 45
        if theta_dot < -FIFTY_DEGREES_IN_RADIANS:
            pass
        elif theta_dot < FIFTY_DEGREES_IN_RADIANS:
            box += 54
        else:
            box += 108
        return box
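
    # The discretization above yields 3 (cart position) x 3 (cart velocity)
    # x 6 (pole angle) x 3 (pole angular velocity) = 162 "live" boxes,
    # numbered 1..162, plus box 0 reserved for out-of-limits (failure) states,
    # which is why the learner below uses 163 discrete states in total.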

    def learn(self):
        """
        Solves the pole-balancing task using a basic TD (SARSA) algorithm.
        """
        def epsilon_greedy(state, q_values, eps):
            # pick the greedy action, but explore with probability eps
            a = np.argmax(q_values[state, :])
            if np.random.rand() < eps:
                a = np.random.randint(q_values.shape[1])
            return a

        mean, std = 0, 1         # initial Q-values are drawn from N(mean, std)
        episodes = 5000
        max_step = 1000
        eps = 0.75               # exploration rate
        alpha = 0.5              # learning rate
        gamma = 0.95             # discount factor
        eps_decay = 0.995
        ave_cumu_r = None        # running average of cumulative rewards
        history = []             # per-episode averages, kept for later inspection
        env = self.env
        n_s = 163                # 162 boxes plus the failure state 0
        n_a = env.action_space.n
        qs = mean + std * np.random.randn(n_s, n_a)

        for episode in range(episodes):
            cumu_r = 0
            curr_s = self.encode_state(env.reset())
            curr_a = epsilon_greedy(curr_s, qs, eps)
            for step in range(max_step):
                # env.render()
                raw_s, r, done, _ = env.step(curr_a)
                # core part: SARSA update
                # Q(s, a) <- Q(s, a) + alpha * (r + gamma * Q(s', a') - Q(s, a))
                next_s = self.encode_state(raw_s)
                next_a = epsilon_greedy(next_s, qs, eps)
                delta = r + gamma*qs[next_s, next_a] - qs[curr_s, curr_a]
                qs[curr_s, curr_a] += alpha * delta
                curr_s, curr_a = next_s, next_a
                # track a discounted cumulative reward, used below to decide
                # when to decrease epsilon
                cumu_r = r + gamma * cumu_r
                if done:
                    kappa = 0.01
                    if ave_cumu_r is None:
                        ave_cumu_r = cumu_r
                    else:
                        ave_cumu_r = kappa*cumu_r + (1 - kappa)*ave_cumu_r
                    print("Episode {} ended on step {} with average cumulative "
                          "reward: {}".format(episode, step, ave_cumu_r))
                    if cumu_r > ave_cumu_r:
                        eps *= eps_decay
                    history.append(ave_cumu_r)
                    break
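        # expose the learned Q-table and the reward history to the caller
        return qs, history


# A minimal illustrative sketch (the function name and signature are not part of
# Sutton's code) for trying out a learned Q-table: it replays a single episode
# following the purely greedy policy, e.g.
#   qs, history = learner.learn()
#   run_greedy(env, learner, qs, render=True)
def run_greedy(env, learner, qs, max_step=1000, render=False):
    state = learner.encode_state(env.reset())
    total_reward = 0
    for _ in range(max_step):
        if render:
            env.render()
        action = np.argmax(qs[state, :])   # always exploit, never explore
        raw_state, reward, done, _ = env.step(action)
        state = learner.encode_state(raw_state)
        total_reward += reward
        if done:
            break
    return total_reward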


def main():
    env = gym.make('CartPole-v0')
    learner = CartPoleLearner(env)
    learner.learn()


if __name__ == '__main__':
    main()