"cell_type": "code",
"source": [
"import gym\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import pandas as pd\n",
"import imageio\n",
"import time\n",
"import numpy as np\n",
"import gym\n",
"from stable_baselines.common.vec_env import DummyVecEnv, VecVideoRecorder, SubprocVecEnv\n",
"from stable_baselines.ddpg.policies import CnnPolicy, MlpPolicy\n",
"from stable_baselines.common.policies import MlpLstmPolicy, CnnLstmPolicy, MlpPolicy\n",
"from stable_baselines import A2C, PPO2, SAC, TD3, TRPO, DDPG, ACER, ACKTR, SAC\n",
"from stable_baselines.common.evaluation import evaluate_policy\n",
"from stable_baselines.common import set_global_seeds\n",
"from stable_baselines.bench import Monitor\n",
"from stable_baselines.results_plotter import load_results, ts2xy"
"cell_type": "code",
"source": [
"cell_type": "code",
"source": [
"def make_env(env_id, rank, seed=0):\n",
" \"\"\"\n",
" Utility function for multiprocessed env.\n",
" \n",
" :param env_id: (str) the environment ID\n",
" :param num_env: (int) the number of environment you wish to have in subprocesses\n",
" :param seed: (int) the inital seed for RNG\n",
" :param rank: (int) index of the subprocess\n",
" \"\"\"\n",
" def _init():\n",
" env = gym.make(env_id)\n",
" env.seed(seed + rank)\n",
" return env\n",
" set_global_seeds(seed)\n",
" return _init\n",
"def evaluate(model, num_steps=1000):\n",
" \"\"\"\n",
" Evaluate a RL agent\n",
" :param model: (BaseRLModel object) the RL Agent\n",
" :param num_steps: (int) number of timesteps to evaluate it\n",
" :return: (float) Mean reward\n",
" \"\"\"\n",
" \n",
" episode_rewards = [[0.0] for _ in range(env.num_envs)]\n",
" obs = env.reset()\n",
" for i in range(num_steps):\n",
" # _states are only useful when using LSTM policies\n",
" actions, _states = model.predict(obs)\n",
" # here, action, rewards and dones are arrays\n",
" # because we are using vectorized env\n",
" obs, rewards, dones, info = env.step(actions)\n",
" # Stats\n",
" for i in range(env.num_envs):\n",
" episode_rewards[i][-1] += rewards[i]\n",
" if dones[i]:\n",
" episode_rewards[i].append(0.0)\n",
" mean_rewards = [0.0 for _ in range(env.num_envs)]\n",
" n_episodes = 0\n",
" for i in range(env.num_envs):\n",
" mean_rewards[i] = np.mean(episode_rewards[i]) \n",
" n_episodes += len(episode_rewards[i]) \n",
" # Compute mean reward\n",
" mean_reward = round(np.mean(mean_rewards), 1)\n",
" print(\"Mean reward:\", mean_reward, \"Num episodes:\", n_episodes)\n",
" return mean_reward"
"cell_type": "code",
"source": [
"# Create a CNN based policy and optimize it using PPO2.\n",
"# ppo_params = {\"gamma\" : 0.99,\n",
"# \"n_steps\" : 128,\n",
"# \"ent_coef\" : 0.01,\n",
"# \"learning_rate\" : 0.00025,\n",
"# \"vf_coef\" : 0.5,\n",
"# \"max_grad_norm\" : 0.5,\n",
"# \"lam\" : 0.95,\n",
"# \"nminibatches\" : 4,\n",
"# \"noptepochs\" : 4,\n",
"# \"cliprange\" :0.2,\n",
"# \"cliprange_vf\" : None,\n",
"# \"verbose\" : 1,\n",
"# \"tensorboard_log\" : None,\n",
"# \"_init_setup_model\" : True,\n",
"# \"policy_kwargs\" : None,\n",
"# \"full_tensorboard_log\" : False,\n",
"# \"seed\" : None,\n",
"# \"n_cpu_tf_sess\" : None\n",
"# }\n",
"# params=ppo_params, "
"cell_type": "code",
"source": [
"def train_save_agent(model, env, gif_name, time_steps=int(1e4), \n",
" save_gif=False):\n",
" #use the policy, environment and define params to compile the PPO2 model..\n",
"# model = model #PPO2(\"MlpPolicy\", env)#, **params)\n",
" s_time = time.time()\n",
" #Train the model\n",
" model.learn(total_timesteps=time_steps)\n",
" e_time = time.time()\n",
" tot_time = e_time - s_time\n",
" print(f\"Total Run-Time : , {tot_time : 0.3f} seconds\")\n",
" if save_gif:\n",
" ########### Record-GIF ###########\n",
" images = []\n",
" obs = model.env.reset()\n",
" img = model.env.render(mode='rgb_array')\n",
" gif_length = 500\n",
" for i in range(gif_length):\n",
" images.append(img)\n",
" action, _ = model.predict(obs)\n",
" obs, _, _ ,_ = model.env.step(action)\n",
" img = model.env.render(mode='rgb_array')\n",
" imageio.mimsave(f'{gif_name}-{timesteps}.gif', [np.array(img) for i, img in enumerate(images) if i%2 == 0],\n",
" fps=29)\n",
" \n",
" return model, tot_time"
"cell_type": "code",
"source": [
"all_algs = [\"A2C\", \"PPO2\", \"ACER\", \"ACKTR\"]\n",
"# Create log dir\n",
"import os\n",
"log_dir = \"/tmp/gym/\"\n",
"os.makedirs(log_dir, exist_ok=True)\n",
"env_list = [\"Pendulum-v0\", \"MountainCar-v0\", \"Acrobot-v1\", \"CartPole-v1\"]\n",
"timesteps = int(1e6)\n",
"num_cpu = 4 # Number of processes to use\n",
"game_df = pd.DataFrame()\n",
"for env_id in env_list:\n",
" print(f\"{env_id}......\")\n",
"# env = gym.make(env_id) #\n",
" # Logs will be saved in log_dir/monitor.csv\n",
"# env = Monitor(env, log_dir, allow_early_resets=True)\n",
"# env = DummyVecEnv([lambda: gym.make(env_id)])\n",
" env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])\n",
" \n",
" alg_detail_df = pd.DataFrame()\n",
" for alg in all_algs:\n",
" if env_id == \"Pendulum-v0\" and alg == 'ACER':\n",
" print(f\"{env_id, alg}\")\n",
" pass\n",
" \n",
" else:\n",
" print(f'{alg}.....') \n",
" model = eval(alg + \"('MlpPolicy', env)\")\n",
" tr_model, run_time = train_save_agent(model, env, alg, time_steps=timesteps, save_gif=False)\n",
" # mean_reward, std_reward = evaluate_policy(tr_model, tr_model.get_env(), n_eval_episodes=20)\n",
" mean_reward = evaluate(tr_model, num_steps=1000)\n",
" alg_detail_df = alg_detail_df.append([[env_id, alg, run_time, mean_reward]]) #, std_reward]])\n",
" print(f\"Mean Reward : {mean_reward} \") #\"| Std_Reward : {std_reward}\")\n",
" \n",
" game_df = game_df.append(alg_detail_df) #], axis=1)\n",
" \n",
"game_df.columns = ['Envir', 'Algorithm', 'Run_Time', 'Mean_Rewards'] #, 'Std_Rewards'] "
"output_type": "stream",
"text": [
"Total Run-Time : , 257.970 seconds\n",
"Mean reward: -551.4 Num episodes: 24\n",
"Mean Reward : -551.4 \n",
"Total Run-Time : , 338.824 seconds\n",
"Mean reward: -939.9 Num episodes: 24\n",
"Mean Reward : -939.9 \n",
"('Pendulum-v0', 'ACER')\n",
"Total Run-Time : , 242.154 seconds\n",
"Mean reward: -876.8 Num episodes: 24\n",
"Mean Reward : -876.8 \n",
"Total Run-Time : , 237.251 seconds\n",
"Mean reward: -166.7 Num episodes: 24\n",
"Mean Reward : -166.7 \n",
"Total Run-Time : , 690.426 seconds\n",
"Mean reward: -166.7 Num episodes: 24\n",
"Mean Reward : -166.7 \n",
"Total Run-Time : , 346.463 seconds\n",
"Mean reward: -166.7 Num episodes: 24\n",
"Mean Reward : -166.7 \n",
"Total Run-Time : , 234.560 seconds\n",
"Mean reward: -166.7 Num episodes: 24\n",
"Mean Reward : -166.7 \n",
"Total Run-Time : , 290.163 seconds\n",
"Mean reward: -89.6 Num episodes: 46\n",
"Mean Reward : -89.6 \n",
"Total Run-Time : , 273.282 seconds\n",
"Mean reward: -74.9 Num episodes: 53\n",
"Mean Reward : -74.9 \n",
"Total Run-Time : , 392.469 seconds\n",
"Mean reward: -77.8 Num episodes: 51\n",
"Mean Reward : -77.8 \n",
"Total Run-Time : , 253.496 seconds\n",
"Mean reward: -72.3 Num episodes: 55\n",
"Mean Reward : -72.3 \n",
"Total Run-Time : , 215.154 seconds\n",
"Mean reward: 333.3 Num episodes: 12\n",
"Mean Reward : 333.3 \n",
"Total Run-Time : , 189.328 seconds\n",
"Mean reward: 333.3 Num episodes: 12\n",
"Mean Reward : 333.3 \n",
"Total Run-Time : , 300.396 seconds\n",
"Mean reward: 333.3 Num episodes: 12\n",
"Mean Reward : 333.3 \n",
"Total Run-Time : , 189.950 seconds\n",
"Mean reward: 333.3 Num episodes: 12\n",
"Mean Reward : 333.3 \n"
"cell_type": "code",
"source": [
"# game_df.to_csv('Runtime_details.csv', index=False) "
"cell_type": "code",
"source": [
"cell_type": "code",
"source": [
"cell_type": "code",
"source": [
"cell_type": "code",
"source": [
