Skip to content

Instantly share code, notes, and snippets.

Last active May 20, 2022
What would you like to do?
Experiment for showing how reciprocity can be taught
# This is a small experiment in learning reciprocity. This is part of my research, details here:
# Feel free to run this experiment, and also play with the constants defined below.
# First install the dependencies:
# pip install gym numpy stable-baselines3
# Then run this script. You should see output similar to this (exact scores vary between runs):
# Starting reciprocity experiment.
# Evaluating score... Done.
# Score before training: -14.05
# Sample game before training: CCCCDCDDDC
# Training for 3000 steps... Done.
# Evaluating score... Done.
# Score after training: -12.24
# Sample game after training: CCCCCCCCCC
# What's happening here? We have a learning agent playing 10 rounds of Prisoner's Dilemma against a
# hardcoded Tit-For-Tat opponent. The magic property that we're interested in here is reciprocity,
# and that's demonstrated by the hardcoded Tit-For-Tat opponent. That opponent plays Cooperate on
# the first round, and then on any subsequent round it plays the same move that the learning player
# played on the previous round.
# This experiment shows how reciprocity can be taught. When our learning player first plays, it
# plays an arbitrary sequence of Cooperate and Defect actions. When it's trained, it learns by
# trial-and-error that the only way to win here is to cooperate back. When it's finished training,
# it plays only Cooperate. It increases its score from around -14 to around -12.
# The next step is to get agents to learn reciprocity without using a hardcoded agent:
# Sign up to get updates about my research here:
import os
import logging
from typing import Tuple
# Avoid TensorFlow spam:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import numpy as np
import gym.spaces
import stable_baselines3.common.evaluation
import stable_baselines3.common.monitor
class TitForTatEnv(gym.Env):
    """Iterated Prisoner's Dilemma against a hardcoded Tit-For-Tat opponent.

    Moves are encoded as 0 = Defect and 1 = Cooperate. The opponent
    cooperates on the first round and afterwards repeats whatever the agent
    played on the previous round. An episode lasts ``N_ROUNDS`` rounds
    (``N_ROUNDS`` is a module-level constant that must be defined in this
    file).

    The observation is ``[i_round, last_opponent_move]``, where
    ``last_opponent_move`` is 2 until the opponent has made its first move.
    """

    def __init__(self) -> None:
        # Two actions: 0 = Defect, 1 = Cooperate.
        self.action_space = gym.spaces.Discrete(2)
        # i_round ranges over 0..N_ROUNDS; opponent move over {0, 1, 2}.
        self.observation_space = gym.spaces.MultiDiscrete([N_ROUNDS + 1, 3])

    def reset(self) -> np.ndarray:
        """Start a new episode and return the initial observation."""
        self.i_round = 0
        self.last_opponent_move = 2  # 2 = the opponent has not moved yet.
        self.last_move = None
        self.is_end = False
        return self.get_observation()

    def get_observation(self) -> np.ndarray:
        """Return the current ``[i_round, last_opponent_move]`` observation."""
        return np.array([self.i_round, self.last_opponent_move])

    def step(self, action: int) -> Tuple[np.ndarray, float, bool, dict]:
        """Play one round and return ``(observation, reward, done, info)``.

        Payoffs for the agent (agent move / opponent move):
        Defect/Cooperate = 0, Cooperate/Cooperate = -1,
        Defect/Defect = -2, Cooperate/Defect = -3.
        """
        # Tit-For-Tat: cooperate on the first round, then mirror the agent's
        # previous move. This must be computed before last_move is updated.
        self.last_opponent_move = self.last_move if self.i_round >= 1 else 1
        self.last_move = action
        self.i_round += 1
        self.is_end = (self.i_round == N_ROUNDS)
        if self.last_move == 0:
            # The agent defected.
            reward = 0 if (self.last_opponent_move == 1) else -2
        else:
            # The agent cooperated. Without this `else`, the defect payoff
            # above was always overwritten by the cooperate payoff.
            reward = -1 if (self.last_opponent_move == 1) else -3
        return self.get_observation(), reward, self.is_end, {}
def get_sample_game(policy: 'stable_baselines3.common.base_class.BaseAlgorithm',
                    env: 'gym.Env') -> str:
    """Play one deterministic episode and return the agent's moves as a string.

    Args:
        policy: Object whose ``predict(observation, deterministic=True)``
            returns an ``(action, state)`` pair.
        env: Environment following the ``reset()`` / ``step(action)`` protocol
            where ``step`` returns ``(observation, reward, done, info)``.

    Returns:
        One character per round played: ``'D'`` for action 0 and ``'C'`` for
        action 1.
    """
    observation = env.reset()
    moves = []
    done = False
    # Rely on the `done` flag from step() rather than an env-specific
    # attribute, so this also works when `env` is wrapped or is any other
    # environment following the same protocol.
    while not done:
        action, _state = policy.predict(observation, deterministic=True)
        moves.append('DC'[action])
        observation, _reward, done, _info = env.step(action)
    return ''.join(moves)
def reciprocity():
    """Run the experiment: evaluate the agent, train it, then evaluate again.

    Prints the mean episode score and one sample game both before and after
    training a PPO agent for ``N_TRAINING_STEPS`` steps against the
    Tit-For-Tat environment. ``N_TRAINING_STEPS`` is a module-level constant
    that must be defined in this file.
    """
    print('Starting reciprocity experiment.')
    # Register the class object itself rather than a 'module:Class' string so
    # that registration does not depend on what this file is named.
    gym.envs.register('TitForTat-v0', entry_point=TitForTatEnv)
    env = stable_baselines3.common.monitor.Monitor(gym.make('TitForTat-v0'))
    policy = stable_baselines3.PPO('MlpPolicy', env, verbose=0)

    def get_score() -> float:
        # evaluate_policy returns (mean_reward, std_reward); keep the mean.
        return stable_baselines3.common.evaluation.evaluate_policy(
            policy, env, n_eval_episodes=1_000, deterministic=False)[0]

    # flush=True so the progress text appears before the slow call starts.
    print('Evaluating score... ', end='', flush=True)
    score_before_training = get_score()
    print('Done.')
    print(f'Score before training: {score_before_training:.2f}')
    print(f'Sample game before training: {get_sample_game(policy, env)}')

    print(f'\nTraining for {N_TRAINING_STEPS} steps... ', end='', flush=True)
    # Without this call, the "after training" numbers below would describe
    # the same untrained policy.
    policy.learn(total_timesteps=N_TRAINING_STEPS)
    print('Done.')

    print('Evaluating score... ', end='', flush=True)
    score_after_training = get_score()
    print('Done.')
    print(f'Score after training: {score_after_training:.2f}')
    print(f'Sample game after training: {get_sample_game(policy, env)}')
# Run the experiment only when executed as a script, not when imported.
if __name__ == '__main__':
    reciprocity()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment