WANDB_ENV_VAR = "WANDB_API_KEY"

# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/ppo/#ppo_pettingzoo_ma_ataripy
import os
import time
import random
import pickle
import argparse
import itertools
from uuid import uuid4
from dataclasses import dataclass
from distutils.util import strtobool
from collections import defaultdict, OrderedDict

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

torch.autograd.set_detect_anomaly(True)  # NOTE: anomaly detection slows autograd; useful for debugging only

from rating import OpenSkillRating

try:
    import pyspiel
    from open_spiel.python.rl_environment import Environment
except ImportError as e:
    raise ImportError("Please install the open_spiel python package.") from e

import pettingzoo
import supersuit as ss

from c4.ppo import update_model, eval_agent_against_archive, generate_data
from c4.c4_spiel_wrapper import OpenSpielCompatibleSkipHandleEnv


def parse_args():
    # fmt: off
    parser = argparse.ArgumentParser()
    parser.add_argument("--exp-name", type=str, default=os.path.splitext(os.path.basename(__file__))[0],
        help="the name of this experiment")
    parser.add_argument("--seed", type=int, default=654,
        help="seed of the experiment")
    parser.add_argument("--torch-deterministic", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="if toggled, `torch.backends.cudnn.deterministic=True`")
    parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="if toggled, cuda will be enabled by default")

    # Algorithm specific arguments
    parser.add_argument("--env-id", type=str, default="c4",
        help="the id of the environment")
    parser.add_argument("--total-timesteps", type=int, default=500000,
        help="total timesteps of the experiments")
    parser.add_argument("--learning-rate", type=float, default=5e-6,
        help="the learning rate of the optimizer")
    parser.add_argument("--num-envs", type=int, default=1,
        help="the number of parallel game environments")
    parser.add_argument("--num-steps", type=int, default=64,
        help="the number of steps to run in each environment per policy rollout")
    parser.add_argument("--anneal-lr", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="Toggle learning rate annealing for policy and value networks")
    parser.add_argument("--gamma", type=float, default=0.99,
        help="the discount factor gamma")
    parser.add_argument("--gae-lambda", type=float, default=0.95,
        help="the lambda for the generalized advantage estimation")
    parser.add_argument("--num-minibatches", type=int, default=1,
        help="the number of mini-batches")
    parser.add_argument("--update-epochs", type=int, default=1,
        help="the K epochs to update the policy")
    parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="Toggles advantages normalization")
    parser.add_argument("--clip-coef", type=float, default=0.5,
        help="the surrogate clipping coefficient")
    parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
        help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
    parser.add_argument("--ent-coef", type=float, default=0.01,
        help="coefficient of the entropy")
    parser.add_argument("--vf-coef", type=float, default=0.01,
        help="coefficient of the value function")
    parser.add_argument("--max-grad-norm", type=float, default=0.05,
        help="the maximum norm for the gradient clipping")
    parser.add_argument("--target-kl", type=float, default=None,
        help="the target KL divergence threshold")

    # novelty search specific arguments
    parser.add_argument("--n_eval_matches", type=int, default=1, help="number of eval matches against each opponent")
    # NOTE: the two thresholds below are logged in the training loop but were missing
    # from the parser; the defaults here are assumed placeholders.
    parser.add_argument("--mc_threshold", type=float, default=0.0, help="minimal-criterion threshold for archiving a policy")
    parser.add_argument("--novelty_threshold", type=float, default=0.0, help="novelty-score threshold for archiving a policy")

    # ranking specific arguments
    parser.add_argument("--mu", type=int, default=1000, help="mu for elo ranking")
    parser.add_argument("--anchor_mu", type=int, default=1500, help="anchor mu for elo ranking")
    parser.add_argument("--sigma", type=float, default=100/3, help="sigma for elo ranking")

    args = parser.parse_args()
    args.batch_size = int(args.num_envs * args.num_steps)
    args.minibatch_size = int(args.batch_size // args.num_minibatches)
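    # worked example with the defaults: num_envs=1 and num_steps=64 give
    # batch_size=64; num_minibatches=1 then gives minibatch_size=64, i.e. each
    # update epoch runs a single full-batch gradient step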
    # fmt: on
    return args


class InfToIntWrapper(pettingzoo.AECEnv):
    """Pass-through AEC wrapper that replaces `inf` entries in observations with 1,
    keeping the network inputs finite."""

    def __init__(self, env):
        super().__init__()
        self.env = env
        self.action_space = env.action_space
        self.observation_space = env.observation_space
        self.agent_iter = env.agent_iter
        self.metadata = env.metadata
        self.render = env.render

    def step(self, action):
        return self.env.step(action)

    def reset(self, **kwargs):
        return self.env.reset(**kwargs)

    def last(self, observe: bool = True):
        obs, reward, done, trunc, info = self.env.last(observe)  # forward `observe` to the wrapped env
        obs[obs == np.inf] = 1
        return obs, reward, done, trunc, info


def _get_action_name_from_id(action_id):
    # Connect Four columns are labelled "c0" through "c6".
    if 0 <= action_id <= 6:
        return f"c{action_id}"
    return None


def anneal_lr(optimizer, update, num_updates, learning_rate):
    """Anneals the learning rate linearly from `learning_rate` down toward 0."""
    frac = 1.0 - (update - 1.0) / num_updates
    lrnow = frac * learning_rate
    optimizer.param_groups[0]["lr"] = lrnow
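    # e.g. update=1 gives lr=learning_rate, and the final update (update=num_updates)
    # runs at lr=learning_rate/num_updates, so the rate approaches but never hits 0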


def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
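    # std=sqrt(2) is the usual orthogonal-init gain for the ReLU hidden layers in
    # Agent below; the heads override it (std=1.0 for the value head, std=0.01 for
    # the policy head so the initial policy is close to uniform)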
    return layer


class Agent(nn.Module):
    def __init__(self, envs):
        super().__init__()
        self.critic = nn.Sequential(
            layer_init(nn.Linear(envs.single_observation_space.shape[0], 512)),
            nn.ReLU(),
            layer_init(nn.Linear(512, 256)),
            nn.ReLU(),
            layer_init(nn.Linear(256, 1), std=1.0),
        )
        self.actor = nn.Sequential(
            layer_init(nn.Linear(envs.single_observation_space.shape[0], 512)),
            nn.ReLU(),
            layer_init(nn.Linear(512, 256)),
            nn.ReLU(),
            layer_init(nn.Linear(256, envs.single_action_space.n), std=0.01),
        )

    def get_value(self, x):
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        if x.ndim == 1:
            x = x.unsqueeze(0)
        logits = self.actor(x)
        probs = torch.distributions.Categorical(logits=logits)
        if action is None:
            action = probs.sample()
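        # for a (batch, obs_dim) input: action, log-prob, and entropy have shape
        # (batch,), the value is (batch, 1), and the logits (batch, n_actions) are
        # returned detached on CPU for downstream bookkeeping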
        return action, probs.log_prob(action), probs.entropy(), self.critic(x), logits.cpu().detach()


if __name__ == "__main__":
    import wandb

    args = parse_args()
    run_name = f"{args.exp_name}_{args.seed}"

    run = wandb.init(project="mapo",
                     entity="aadharna",
                     # sync_tensorboard=True,
                     config=vars(args),
                     name=run_name,
                     # monitor_gym=False,
                     save_code=True,
                     )

    time.sleep(4)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = args.torch_deterministic

    # device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
    device = torch.device('cpu')
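
    # Environment stack: load Connect Four from OpenSpiel, adapt it to the
    # PettingZoo AEC API, flatten the observations, append a one-hot agent
    # indicator, and clamp any `inf` observation entries to 1 (InfToIntWrapper).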
    env = pyspiel.load_game('connect_four')
    env = OpenSpielCompatibleSkipHandleEnv(env)
    env = ss.flatten_v0(env)
    env = ss.agent_indicator_v0(env, type_only=False)
    envs = InfToIntWrapper(env)
    envs.single_action_space = env.action_space('player_0')
    envs.single_observation_space = env.observation_space('player_0')
    envs.reset()

    agent = Agent(envs).to(device)
    greedy_agent = Agent(envs).to(device)
    greedy_agent.load_state_dict(agent.state_dict())
    random_agent = Agent(envs).to(device)
    random_agent.load_state_dict(agent.state_dict())
    agent_ids = ['main', 'main_v1']

    novelty_policy_map = {"main": agent.state_dict(),
                          "main_v1": random_agent.state_dict()}

    rating = OpenSkillRating(args.mu, args.anchor_mu, args.sigma)
    rating.add_policy('main')
    rating.add_policy('main_v1')
    rating.set_anchor(name='main_v1')
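    # 'main_v1' (the frozen copy of the initial weights) is pinned at anchor_mu,
    # presumably so 'main' is always rated against a fixed reference opponent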

    n_opponents = 1
    greedy_n_opponents = 1

    novelty_archive = OrderedDict()
    main_states_by_iter = OrderedDict()

    optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)
    greedy_optimizer = optim.Adam(greedy_agent.parameters(), lr=args.learning_rate, eps=1e-5)

    win_rate = 0
    greedy_win_rate = 0

    # TRY NOT TO MODIFY: start the game
    global_step = 0
    start_time = time.time()
    envs.reset()
    observation, _, _, _, _ = envs.last()  # read through the wrapper so inf entries are clamped
    next_obs = torch.from_numpy(observation).unsqueeze(0).float()
    next_done = torch.zeros(args.num_envs).to(device)
    num_updates = args.total_timesteps // args.batch_size
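    # with the defaults this is 500000 // 64 = 7812 updates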

    for update in range(1, num_updates + 1):
        # Annealing the rate if instructed to do so.
        if args.anneal_lr:
            anneal_lr(optimizer=optimizer, update=update, num_updates=num_updates,
                      learning_rate=args.learning_rate)

        (main_batch, opponent_batch, global_step, next_obs,
         next_done, batch_reward_main, batch_reward_opp, batch_opponents_scores) = generate_data(
            global_step, agent, random_agent, n_opponents, novelty_policy_map, args.num_steps, envs, device)

        storage = {'main': main_batch, 'opponent': opponent_batch,
                   'next_obs': next_obs, 'next_dones': next_done}

        save_policy_trigger = False
        novelty_reward_value = 0

        win_rate, action_stats = eval_agent_against_archive(agent, "main", random_agent, novelty_policy_map, rating,
                                                            args.n_eval_matches, envs, device)

        print(f"Iter={update}/{num_updates}: win-rate={round(win_rate, 4)}")

        loss, entropy_loss, pg_loss, v_loss, explained_var, approx_kl, meanclipfracs, old_approx_kl = update_model(
            agent, "main", optimizer, envs, storage, args.num_steps, n_opt_steps=args.update_epochs,
            minibatch_size=args.minibatch_size, gamma=args.gamma, gae_lambda=args.gae_lambda,
            clip_coef=args.clip_coef, norm_adv=args.norm_adv, clip_vloss=args.clip_vloss,
            max_grad_norm=args.max_grad_norm, ent_coef=args.ent_coef, vf_coef=args.vf_coef, target_kl=args.target_kl,
            device=device
        )

        #### save data to wandb ####

        # TRY NOT TO MODIFY: record rewards for plotting purposes
        wandb.log({"charts/learning_rate": optimizer.param_groups[0]['lr']}, step=update)
        wandb.log({"losses/value_loss": v_loss}, step=update)
        wandb.log({"losses/policy_loss": pg_loss}, step=update)
        wandb.log({"losses/entropy": entropy_loss}, step=update)
        wandb.log({"losses/old_approx_kl": old_approx_kl}, step=update)
        wandb.log({"losses/approx_kl": approx_kl}, step=update)
        wandb.log({"losses/clipfrac": meanclipfracs}, step=update)
        wandb.log({"losses/explained_variance": explained_var}, step=update)

        # custom saves
        wandb.log({"charts/puck/winrate": win_rate}, step=update)
        wandb.log({"charts/puck/league_size": len(novelty_policy_map)}, step=update)
        wandb.log({"charts/puck/novelty_score": novelty_reward_value}, step=update)
        wandb.log({"charts/puck/mc_threshold": args.mc_threshold}, step=update)
        wandb.log({"charts/puck/novelty_threshold": args.novelty_threshold}, step=update)

        # save the agents' reward mean / std / extrema
        wandb.log({"charts/puck/reward/main_mean": np.mean(batch_reward_main)}, step=update)
        wandb.log({"charts/puck/reward/main_std": np.std(batch_reward_main)}, step=update)
        wandb.log({"charts/puck/reward/main_max": np.max(batch_reward_main)}, step=update)
        wandb.log({"charts/puck/reward/main_min": np.min(batch_reward_main)}, step=update)
        wandb.log({"charts/puck/reward/oppo_mean": np.mean(batch_reward_opp)}, step=update)
        wandb.log({"charts/puck/reward/oppo_std": np.std(batch_reward_opp)}, step=update)

        # save the probability of each action
        for action_stat, prob in action_stats.items():
            if len(prob) > 1:
                continue
            wandb.log({f"charts/puck/action_{action_stat}": prob[0]}, step=update)

        # save the elo score of the main agent
        r = rating.ratings['main']
        wandb.log({"charts/elo/puck_v_novelty_archive": int(r.mu)}, step=update)

        # save policies / archives
        torch.save(agent.state_dict(), os.path.join(wandb.run.dir, "main_policy.pt"))
        with open(os.path.join(wandb.run.dir, "archives.pkl"), "wb") as f:
            pickle.dump({
                "novelty_archive": novelty_archive,
                "novelty_policy_map": novelty_policy_map,
            }, f)