Skip to content

Instantly share code, notes, and snippets.

@rlan
Last active Oct 6, 2021
Embed
What would you like to do?
A starter example that trains, checkpoints, and evaluates an RL algorithm in RLlib
"""
Reference: https://github.com/ray-project/ray/blob/f8a91c7fad248b1c7f81fd6d30191ac930a92bc4/rllib/examples/env/simple_corridor.py
Fixes:
ValueError: ('Observation ({}) outside given space ({})!', array([0.]), Box([0.], [999.], (1,), float32))
"""
import gym
from gym.spaces import Box, Discrete
import numpy as np
class SimpleCorridor(gym.Env):
    """Example of a custom env in which you have to walk down a corridor.

    The agent starts at ``corridor_start`` and must reach ``corridor_end``
    (action 0 = move left, action 1 = move right). Reward is 1 on reaching
    the end, 0 otherwise. You can configure the length of the corridor via
    the env config.
    """

    def __init__(self, config=None):
        # config keys: "corridor_length" (goal position, default 10) and
        # "corridor_start" (leftmost position, default 0).
        config = config or {}
        self.end_pos = config.get("corridor_length", 10)
        self.start_pos = config.get("corridor_start", 0)
        self.cur_pos = self.start_pos
        self.action_space = Discrete(2)
        # 1-D observation holding the current position, bounded by the
        # corridor ends (avoids "Observation outside given space" errors).
        self.observation_space = Box(
            self.start_pos, self.end_pos, shape=(1,), dtype=np.float32)

    def set_corridor_length(self, length):
        """Resize the corridor and rebuild the observation space to match."""
        self.end_pos = length
        self.observation_space = Box(
            self.start_pos, self.end_pos, shape=(1,), dtype=np.float32)
        print("Updated corridor length to {}".format(length))

    def reset(self):
        """Reset the agent to the start position; return the initial obs."""
        self.cur_pos = self.start_pos
        return np.full(self.observation_space.shape, self.cur_pos,
                       dtype=np.float32)

    def step(self, action):
        """Apply `action` and return (obs, reward, done, info).

        Movement is clamped to [start_pos, end_pos]. BUGFIX: the left-move
        guard previously tested the hard-coded constant 0 instead of
        self.start_pos, so a corridor with a negative start could never be
        walked below 0. Behavior is unchanged for the default start of 0.
        """
        assert action in [0, 1], action
        if action == 0:
            # Move left, never past the configured start of the corridor.
            self.cur_pos = max(self.cur_pos - 1.0, self.start_pos)
        elif action == 1:
            # Move right, never past the goal.
            self.cur_pos = min(self.cur_pos + 1.0, self.end_pos)
        done = self.cur_pos >= self.end_pos
        obs = np.full(self.observation_space.shape, self.cur_pos,
                      dtype=np.float32)
        # Sparse reward: 1 only when the goal is reached.
        return obs, 1 if done else 0, done, {}
import ray
from simple_corridor import SimpleCorridor
import ray.rllib.agents.ppo as ppo
# Trainer configuration: the env class plus the kwargs used to build it.
config = {
    "env": SimpleCorridor,
    "env_config": {
        "corridor_length": 5,
    },
}
# Stop criterion: a fixed, small number of training iterations.
stop = {
    "training_iteration": 3,
}

ray.init()

# Train. Passing metric/mode to tune.run (the documented API) lets the
# returned ExperimentAnalysis resolve `best_checkpoint` directly, instead
# of mutating its private `default_metric` / `default_mode` attributes
# after the fact.
results = ray.tune.run(
    "PPO",
    config=config,
    stop=stop,
    checkpoint_at_end=True,
    metric="episode_reward_mean",
    mode="max",
)

# Path to the checkpoint of the best trial under the chosen metric/mode.
checkpoint_path = results.best_checkpoint
print(checkpoint_path)

# Rebuild a trainer with the same config and restore the trained weights.
agent = ppo.PPOTrainer(
    config=config,
    env=SimpleCorridor,
)
agent.restore(checkpoint_path)

# Roll out one episode with the trained policy (explore=False for a
# deterministic action at each step).
env = SimpleCorridor(config=config["env_config"])
done = False
obs = env.reset()
step = 0
episode_reward = 0
while not done:
    action = agent.compute_action(obs, explore=False)
    obs, reward, done, info = env.step(action)
    episode_reward += reward
    print(step, action, obs, reward, done, info)
    step += 1
print(episode_reward)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment