@tsvikas
Last active January 18, 2023 23:51
Gym debugging environments from Andy Jones's blog
"""
based on https://andyljones.com/posts/rl-debugging.html
Documentation is quoted from that blogpost.
The usual advice to people writing RL algorithms is to use a simple environment
like the classic control ones from the Gym. Thing is, these envs have the same
problem as looking at loss curves: at best they give you a noisy indicator, and
if the noisy indicator looks poor you don't know why it looks poor. They don't
localise errors.
Instead, construct environments that do localise errors.
"""
from abc import ABC

import gymnasium as gym
import numpy as np


class BasicEnv(gym.Env, ABC):
    """
    Base class for a basic env with a discrete action space and a 1d-numpy
    discrete observation space.

    The default observation is random, and there is no default reward.
    After reset(), info["history"] holds the full history of the previous
    episode.

    Tracks useful properties:
    self.timestep, self.observation_history, self.action_history,
    self.reward_history, self.terminated
    """

    def __init__(
        self,
        n_actions=1,
        n_observations=1,
        episode_len=1,
        obs_dtype=np.float32,
        reward_type=float,
    ):
        super().__init__()
        self.action_space = gym.spaces.Discrete(n_actions)
        # we use MultiDiscrete with size 1 to get a np.array with size 1
        self.observation_space = gym.spaces.MultiDiscrete([n_observations])
        self.n_actions = n_actions
        self.n_observations = n_observations
        self.episode_len = episode_len
        self.obs_dtype = obs_dtype
        self.reward_type = reward_type
        # initialize the episode state here, so the first reset() can report
        # an (empty) history instead of raising AttributeError
        self.timestep = 0
        self.observation_history = []
        self.action_history = []
        self.reward_history = []
        self.terminated = False

    @property
    def possible_observations(self):
        """List of all possible observations"""
        return [
            np.array([i]).astype(self.obs_dtype) for i in range(self.n_observations)
        ]

    def _get_obs(self, is_reset: bool):
        return self.observation_space.sample().astype(self.obs_dtype)

    def _get_reward(self, action):
        raise NotImplementedError

    def _is_terminated(self):
        return self.timestep == self.episode_len

    def _get_info(self):
        return {}

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.timestep = 0
        observation = self._get_obs(is_reset=True)
        info = self._get_info()
        # report the history of the episode that just ended
        info["history"] = {
            "observation": self.observation_history,
            "action": self.action_history,
            "reward": self.reward_history,
        }
        self.observation_history = [observation]
        self.action_history = []
        self.reward_history = []
        self.terminated = False
        return observation, info

    def step(self, action):
        if self.terminated:
            raise RuntimeError("don't step a terminated environment")
        self.timestep += 1
        observation = self._get_obs(is_reset=False)
        reward = self._get_reward(action)
        terminated = self._is_terminated()
        info = self._get_info()
        self.observation_history.append(observation)
        self.action_history.append(action)
        self.reward_history.append(reward)
        self.terminated = terminated
        return observation, reward, terminated, False, info

    def render(self):
        pass
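

# --- Illustrative addition (not in the original gist) ------------------------
# A minimal rollout sketch showing the reset()/step() contract of BasicEnv.
# The `policy` argument is a hypothetical callable mapping observation -> action;
# by default, actions are sampled at random from the action space.
def rollout(env, policy=None):
    """Run one episode and return the list of rewards collected."""
    observation, info = env.reset()
    rewards = []
    terminated = False
    while not terminated:
        action = env.action_space.sample() if policy is None else policy(observation)
        observation, reward, terminated, truncated, info = env.step(action)
        rewards.append(reward)
    return rewards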


class ConstRewardEnv(BasicEnv):
    """
    +1 reward every time

    This isolates the value network.
    If my agent can't learn that the value of the only observation it ever sees
    is 1, there's a problem with the value loss calculation or the optimizer.

    Expected value (obs -> value for each action):
    [0] -> [1.0]
    """

    version = 0

    def __init__(self, n_actions=1, n_observations=1, episode_len=1):
        super().__init__(
            n_actions=n_actions, n_observations=n_observations, episode_len=episode_len
        )

    def _get_reward(self, action):
        return self.reward_type(1.0)


class ObservedRewardEnv(BasicEnv):
    """
    obs-dependent reward every time

    If my agent can learn the value in ConstRewardEnv but not this one, meaning
    it can learn a constant reward but not a predictable one, it must be that
    backpropagation through my network is broken.

    Expected value (obs -> value for each action):
    [0] -> [0.0]
    [1] -> [1.0]
    """

    version = 0

    def __init__(self, n_actions=1, n_observations=2, episode_len=1):
        super().__init__(
            n_actions=n_actions, n_observations=n_observations, episode_len=episode_len
        )

    def _get_reward(self, action):
        # the reward depends on the observation the agent acted on;
        # index [0] extracts a scalar from the size-1 observation array
        return self.reward_type(self.observation_history[-1][0] != 0)


class FutureRewardEnv(BasicEnv):
    """
    two timesteps long, +1 reward at the end

    If my agent can learn the value in ObservedRewardEnv but not this one, it
    must be that my reward discounting is broken.

    Expected value (obs -> value for each action):
    [0] -> [1.0 * discount_rate]
    [1] -> [1.0]
    """

    version = 0

    def __init__(self, n_actions=1, episode_len=2):
        super().__init__(
            n_actions=n_actions, n_observations=episode_len, episode_len=episode_len
        )

    def _get_obs(self, is_reset: bool):
        # observe the current timestep instead of a random observation
        return np.array([self.timestep], dtype=self.obs_dtype)

    def _get_reward(self, action):
        # +1 only on the final step of the episode
        return self.reward_type(self._is_terminated())


class ActionRewardEnv(BasicEnv):
    """
    action-dependent reward

    The first env to exercise the policy. If my agent can't learn to pick the
    better action, there's something wrong with either my advantage
    calculations, my policy loss or my policy update.
    That's three things, but it's easy to work out by hand the expected values
    for each one and check that the values produced by your actual code line up
    with them.

    Expected value (obs -> value for each action):
    [0] -> [0.0, 1.0]
    """

    version = 0

    def __init__(self, n_actions=2, n_observations=1, episode_len=1):
        super().__init__(
            n_actions=n_actions, n_observations=n_observations, episode_len=episode_len
        )

    def _get_reward(self, action):
        # action 1 is rewarded, action 0 is not
        return self.reward_type(action != 0)


class ActionObservationRewardEnv(BasicEnv):
    """
    action-and-obs dependent reward

    Now we've got a dependence on both obs and action. The policy and value
    networks interact here, so there are a couple of things to verify:
    that the policy network learns to pick the right action in each of the two
    states, and that the value network learns that the value of each state
    is +1.
    If everything's worked up until now, then if - for example - the value
    network fails to learn here, it likely means your batching process is
    feeding the value network stale experience.

    Expected value (obs -> value for each action):
    [0] -> [0.0, 1.0]
    [1] -> [1.0, 0.0]
    """

    version = 0

    def __init__(self, n_actions=2, n_observations=2, episode_len=1):
        super().__init__(
            n_actions=n_actions, n_observations=n_observations, episode_len=episode_len
        )

    def _get_reward(self, action):
        # reward for "mismatching" the observation: obs 0 rewards action 1,
        # obs 1 rewards action 0 (index [0] extracts the scalar observation)
        return self.reward_type(action != self.observation_history[-1][0])
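

# --- Illustrative addition (not in the original gist) ------------------------
# The probe docstrings suggest working out the expected values by hand and
# checking your agent against them. This hypothetical helper estimates the
# empirical mean first-step reward per (observation, action) pair under a
# random policy; for the one-step probes above it should approach the expected
# values listed in each docstring.
def empirical_values(env, n_episodes=1000):
    """Return {(obs, action): mean reward of the first step} from random play."""
    totals, counts = {}, {}
    for _ in range(n_episodes):
        observation, info = env.reset()
        action = env.action_space.sample()
        _, reward, _, _, _ = env.step(action)
        key = (int(observation[0]), int(action))
        totals[key] = totals.get(key, 0.0) + reward
        counts[key] = counts.get(key, 0) + 1
    return {key: totals[key] / counts[key] for key in totals}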


for env in [
    ConstRewardEnv,
    ObservedRewardEnv,
    FutureRewardEnv,
    ActionRewardEnv,
    ActionObservationRewardEnv,
]:
    gym.register(id=f"gym_probes/{env.__name__}-v{env.version}", entry_point=env)
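

# --- Illustrative addition (not in the original gist) ------------------------
# A quick smoke test: make each registered probe env and run one random episode
# with the rollout() helper above. The env checker is disabled because these
# envs return float observations inside a MultiDiscrete space, which the
# passive checker may warn about.
if __name__ == "__main__":
    for env_cls in [
        ConstRewardEnv,
        ObservedRewardEnv,
        FutureRewardEnv,
        ActionRewardEnv,
        ActionObservationRewardEnv,
    ]:
        env_id = f"gym_probes/{env_cls.__name__}-v{env_cls.version}"
        probe_env = gym.make(env_id, disable_env_checker=True)
        print(f"{env_id}: rewards={rollout(probe_env)}")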