@tsvikas
Last active January 18, 2023 23:51
Gym debugging environments from Andy Jones's blog
"""
based on https://andyljones.com/posts/rl-debugging.html
Documentation is quoted from that blogpost.
The usual advice to people writing RL algorithms is to use a simple environment
like the classic control ones from the Gym. Thing is, these envs have the same
problem as looking at loss curves: at best they give you a noisy indicator, and
if the noisy indicator looks poor you don't know why it looks poor. They don't
localise errors.
Instead, construct environments that do localise errors.
"""
from abc import ABC

import gymnasium as gym
import numpy as np


class BasicEnv(gym.Env, ABC):
    """
    Base class for a basic env with a discrete action space and a 1d-numpy
    discrete observation space.

    The default observation is random, and there is no default reward.
    After reset(), info["history"] holds the full history of the previous
    episode.

    Tracks useful properties:
    self.timestep, self.observation_history, self.action_history,
    self.reward_history, self.terminated
    """

    def __init__(
        self,
        n_actions=1,
        n_observations=1,
        episode_len=1,
        obs_dtype=np.float32,
        reward_type=float,
    ):
        super().__init__()
        self.action_space = gym.spaces.Discrete(n_actions)
        # we use MultiDiscrete with size 1 to get a np.array with size 1
        self.observation_space = gym.spaces.MultiDiscrete([n_observations])
        self.n_actions = n_actions
        self.n_observations = n_observations
        self.episode_len = episode_len
        self.obs_dtype = obs_dtype
        self.reward_type = reward_type
        # initialize the episode state here, so the first reset() can report
        # an (empty) history instead of raising AttributeError
        self.timestep = 0
        self.observation_history = []
        self.action_history = []
        self.reward_history = []
        self.terminated = False

    @property
    def possible_observations(self):
        """List of all possible observations"""
        return [
            np.array([i]).astype(self.obs_dtype) for i in range(self.n_observations)
        ]

    def _get_obs(self, is_reset: bool):
        return self.observation_space.sample().astype(self.obs_dtype)

    def _get_reward(self, action):
        raise NotImplementedError

    def _is_terminated(self):
        return self.timestep == self.episode_len

    def _get_info(self):
        return {}

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.timestep = 0
        observation = self._get_obs(is_reset=True)
        info = self._get_info()
        # report the history of the episode that just ended
        info["history"] = {
            "observation": self.observation_history,
            "action": self.action_history,
            "reward": self.reward_history,
        }
        self.observation_history = [observation]
        self.action_history = []
        self.reward_history = []
        self.terminated = False
        return observation, info

    def step(self, action):
        if self.terminated:
            raise RuntimeError("don't step a terminated environment")
        self.timestep += 1
        observation = self._get_obs(is_reset=False)
        reward = self._get_reward(action)
        terminated = self._is_terminated()
        info = self._get_info()
        self.observation_history.append(observation)
        self.action_history.append(action)
        self.reward_history.append(reward)
        self.terminated = terminated
        return observation, reward, terminated, False, info

    def render(self):
        pass
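

# --- Illustrative addition (not in the original gist) ------------------------
# A minimal rollout sketch showing the reset()/step() contract of BasicEnv.
# The `policy` argument is a hypothetical callable mapping observation -> action;
# by default, actions are sampled at random from the action space.
def rollout(env, policy=None):
    """Run one episode and return the list of rewards collected."""
    observation, info = env.reset()
    rewards = []
    terminated = False
    while not terminated:
        action = env.action_space.sample() if policy is None else policy(observation)
        observation, reward, terminated, truncated, info = env.step(action)
        rewards.append(reward)
    return rewards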


class ConstRewardEnv(BasicEnv):
    """
    +1 reward every time

    This isolates the value network.
    If my agent can't learn that the value of the only observation it ever sees
    is 1, there's a problem with the value loss calculation or the optimizer.

    Expected value (obs -> value for each action):
    [0] -> [1.0]
    """

    version = 0

    def __init__(self, n_actions=1, n_observations=1, episode_len=1):
        super().__init__(
            n_actions=n_actions, n_observations=n_observations, episode_len=episode_len
        )

    def _get_reward(self, action):
        return self.reward_type(1.0)


class ObservedRewardEnv(BasicEnv):
    """
    obs-dependent reward every time

    If my agent can learn the value in ConstRewardEnv but not this one, meaning
    it can learn a constant reward but not a predictable one, it must be that
    backpropagation through my network is broken.

    Expected value (obs -> value for each action):
    [0] -> [0.0]
    [1] -> [1.0]
    """

    version = 0

    def __init__(self, n_actions=1, n_observations=2, episode_len=1):
        super().__init__(
            n_actions=n_actions, n_observations=n_observations, episode_len=episode_len
        )

    def _get_reward(self, action):
        # the reward depends on the observation the agent acted on;
        # index [0] extracts a scalar from the size-1 observation array
        return self.reward_type(self.observation_history[-1][0] != 0)


class FutureRewardEnv(BasicEnv):
    """
    two timesteps long, +1 reward at the end

    If my agent can learn the value in ObservedRewardEnv but not this one, it
    must be that my reward discounting is broken.

    Expected value (obs -> value for each action):
    [0] -> [1.0 * discount_rate]
    [1] -> [1.0]
    """

    version = 0

    def __init__(self, n_actions=1, episode_len=2):
        super().__init__(
            n_actions=n_actions, n_observations=episode_len, episode_len=episode_len
        )

    def _get_obs(self, is_reset: bool):
        # observe the current timestep instead of a random observation
        return np.array([self.timestep], dtype=self.obs_dtype)

    def _get_reward(self, action):
        # +1 only on the final step of the episode
        return self.reward_type(self._is_terminated())


class ActionRewardEnv(BasicEnv):
    """
    action-dependent reward

    The first env to exercise the policy. If my agent can't learn to pick the
    better action, there's something wrong with either my advantage
    calculations, my policy loss or my policy update.
    That's three things, but it's easy to work out by hand the expected values
    for each one and check that the values produced by your actual code line up
    with them.

    Expected value (obs -> value for each action):
    [0] -> [0.0, 1.0]
    """

    version = 0

    def __init__(self, n_actions=2, n_observations=1, episode_len=1):
        super().__init__(
            n_actions=n_actions, n_observations=n_observations, episode_len=episode_len
        )

    def _get_reward(self, action):
        # action 1 is rewarded, action 0 is not
        return self.reward_type(action != 0)


class ActionObservationRewardEnv(BasicEnv):
    """
    action-and-obs dependent reward

    Now we've got a dependence on both obs and action. The policy and value
    networks interact here, so there are a couple of things to verify:
    that the policy network learns to pick the right action in each of the two
    states, and that the value network learns that the value of each state
    is +1.
    If everything's worked up until now, then if - for example - the value
    network fails to learn here, it likely means your batching process is
    feeding the value network stale experience.

    Expected value (obs -> value for each action):
    [0] -> [0.0, 1.0]
    [1] -> [1.0, 0.0]
    """

    version = 0

    def __init__(self, n_actions=2, n_observations=2, episode_len=1):
        super().__init__(
            n_actions=n_actions, n_observations=n_observations, episode_len=episode_len
        )

    def _get_reward(self, action):
        # reward for "mismatching" the observation: obs 0 rewards action 1,
        # obs 1 rewards action 0 (index [0] extracts the scalar observation)
        return self.reward_type(action != self.observation_history[-1][0])
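

# --- Illustrative addition (not in the original gist) ------------------------
# The probe docstrings suggest working out the expected values by hand and
# checking your agent against them. This hypothetical helper estimates the
# empirical mean first-step reward per (observation, action) pair under a
# random policy; for the one-step probes above it should approach the expected
# values listed in each docstring.
def empirical_values(env, n_episodes=1000):
    """Return {(obs, action): mean reward of the first step} from random play."""
    totals, counts = {}, {}
    for _ in range(n_episodes):
        observation, info = env.reset()
        action = env.action_space.sample()
        _, reward, _, _, _ = env.step(action)
        key = (int(observation[0]), int(action))
        totals[key] = totals.get(key, 0.0) + reward
        counts[key] = counts.get(key, 0) + 1
    return {key: totals[key] / counts[key] for key in totals}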


for env in [
    ConstRewardEnv,
    ObservedRewardEnv,
    FutureRewardEnv,
    ActionRewardEnv,
    ActionObservationRewardEnv,
]:
    gym.register(id=f"gym_probes/{env.__name__}-v{env.version}", entry_point=env)
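

# --- Illustrative addition (not in the original gist) ------------------------
# A quick smoke test: make each registered probe env and run one random episode
# with the rollout() helper above. The env checker is disabled because these
# envs return float observations inside a MultiDiscrete space, which the
# passive checker may warn about.
if __name__ == "__main__":
    for env_cls in [
        ConstRewardEnv,
        ObservedRewardEnv,
        FutureRewardEnv,
        ActionRewardEnv,
        ActionObservationRewardEnv,
    ]:
        env_id = f"gym_probes/{env_cls.__name__}-v{env_cls.version}"
        probe_env = gym.make(env_id, disable_env_checker=True)
        print(f"{env_id}: rewards={rollout(probe_env)}")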