@N-McA · Created January 25, 2024 02:15
import collections
import dataclasses
import itertools
import typing as tp
from functools import partial

import numpy as np
from scipy.optimize import minimize

DEFAULT_CI_SIZE = 0.95


@dataclasses.dataclass
class CI:
    lcb: tp.Any
    mean: tp.Any
    ucb: tp.Any
    size: tp.Any

    def to_dict(self):
        return dict(self.__dict__)


T = tp.TypeVar("T")


def elo_ci(
    rankings: tp.Sequence[tp.Sequence[tuple[T, float]]],
    zero_reference: T,
    size: float = DEFAULT_CI_SIZE,
    rng: np.random.RandomState | None = None,
    min_games: int = 3,
    num_bootstrap_samples: int = 1_000,
) -> dict[T, CI]:
"""Estimate elo from rankings.
This does a nonparametric bootstrap of elo.
Args:
rankings: Each element is a sequence of (player, rank) pairs. The *lowest rank* player is the winner. So one if you have one matchup in which A beat B you'd have: rankings=[[(A, 0), (B, 1)]]
zero_reference: The policy to use as the zero reference.
Returns:
A dict mapping from policy to elo CI.
"""
# ze recipe:
# 1. for each bootstrap element:
# a. compute pairwise win/losses from matchups.
# b. solve elo loss
# 2. adjust for zero_reference
# 3. compute CI from bootstrapped elo values.
    rng = rng or np.random.RandomState()

    players = collections.Counter()
    for ranking in rankings:
        players.update([p for p, _ in ranking])
        if len(ranking) < 2:
            raise ValueError(f"Ranking {ranking} contains fewer than 2 players.")

    for player in players:
        if players[player] < min_games:
            raise ValueError(
                f"Player {player} has {players[player]} games, but at least"
                f" {min_games} are required for a sensible elo estimate."
            )

    if zero_reference not in players:
        raise ValueError(f"{zero_reference=} not in {players=}")

    # Make an order:
    players = list(players.keys())
    num_players = len(players)
    def loss_and_gradient(elos, n_wins):
        # elo loss
        elo_deltas = np.reshape(elos, (num_players, 1)) - np.reshape(elos, (1, num_players))
        predicted_wins = 1.0 / (1.0 + np.exp(-elo_deltas))
        loss_penalties = -np.log(predicted_wins)
        loss = np.sum(loss_penalties * n_wins)
        # analytic gradient
        dloss_dpredicted_wins = -n_wins / predicted_wins
        dpredicted_wins_delo_deltas = predicted_wins * (1.0 - predicted_wins)
        dloss_delo_deltas = dloss_dpredicted_wins * dpredicted_wins_delo_deltas
        dloss_delos_positive = np.sum(dloss_delo_deltas, axis=1)
        dloss_delos_negative = -np.sum(dloss_delo_deltas, axis=0)
        dloss_delos = dloss_delos_positive + dloss_delos_negative
        gradient = dloss_delos
        return loss, gradient
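
    # The loss above is the negative log-likelihood of a logistic
    # (Bradley-Terry) model, P(i beats j) = 1 / (1 + exp(-(elo_i - elo_j))),
    # with elos in natural-log units; they are rescaled to chess units below.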
    # pre-compute the all-pairs matchups:
    int_rankings = []
    for ranking in rankings:
        r = []
        for (player_a, rank_a), (player_b, rank_b) in itertools.combinations(ranking, 2):
            if player_a == player_b:
                continue
            if rank_a < rank_b:
                r.append([players.index(player_a), players.index(player_b), 1])
            elif rank_b < rank_a:
                r.append([players.index(player_b), players.index(player_a), 1])
            else:
                # A draw is half a win each.
                r.append([players.index(player_a), players.index(player_b), 0.5])
                r.append([players.index(player_b), players.index(player_a), 0.5])
        if len(r) > 0:
            int_rankings.append(r)
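
    # Note: the bootstrap below resamples whole rankings (not individual
    # pairwise games), so correlations between the pairwise outcomes derived
    # from one multi-player ranking are preserved in each resample.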
    def fill_n_wins_with_random_rankings():
        for ranking_idx in rng.choice(
            len(int_rankings),
            size=len(int_rankings),
            replace=True,
        ):
            for p0, p1, d in int_rankings[ranking_idx]:
                n_wins[p0, p1] += d

    elos = np.zeros([num_bootstrap_samples, num_players])
    n_wins = np.zeros([num_players, num_players])
    for i in range(num_bootstrap_samples):
        attempts = 0
        # This is a nasty heuristic that may cause incorrect results in adversarial cases:
        while attempts < 3:
            attempts += 1
            n_wins.fill(0)
            fill_n_wins_with_random_rankings()
            result = minimize(
                partial(loss_and_gradient, n_wins=n_wins),
                jac=True,
                x0=np.zeros([num_players]),
            )
            if result.success:
                break
        if not result.success:
            raise ValueError(f"minimization failed: {result=}")
        elos[i] = result.x
    # Normalize to the reference player.
    zero = elos[:, players.index(zero_reference)]
    elos -= zero[:, np.newaxis]

    # These are "natural" elos; convert to chess elo:
    elos *= 400 / np.log(10)

    lcb = np.percentile(elos, (1 - size) / 2 * 100, axis=0)
    mean = np.mean(elos, axis=0)
    ucb = np.percentile(elos, (1 + size) / 2 * 100, axis=0)

    result = {}
    for i, player in enumerate(players):
        result[player] = CI(
            lcb=lcb[i],
            mean=mean[i],
            ucb=ucb[i],
            size=size,
        )
    return result


def implied_winrate(elos):
    """Using the 10^(d/400) formula from chess."""
    return 1.0 / (1.0 + 10 ** (-elos / 400))


if __name__ == "__main__":
    # Kick tires:
    rankings = [
        [("A", 0), ("B", 1)],
        [("A", 0), ("B", 1), ("C", 2)],
        [("A", 0), ("B", 0), ("C", 2)],
        [("A", 0), ("B", 0), ("C", 2)],
        [("A", 0), ("AA", 1)],
        [("A", 1), ("AA", 0)],
    ] * 20
    elo = elo_ci(
        rankings, zero_reference="A", rng=np.random.RandomState(42), num_bootstrap_samples=10
    )
    print(elo)
    assert elo["A"].mean == 0
    assert elo["B"].mean < 0
    assert elo["C"].mean < elo["B"].mean
    assert -5 < elo["AA"].mean < 5
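
    # implied_winrate converts chess-scale elo differences back into expected
    # scores, so implied_winrate(ci.mean) estimates each player's win rate
    # against the zero_reference "A".
    assert implied_winrate(0.0) == 0.5  # an even matchup is a coin flip
    assert abs(implied_winrate(400.0) - 10 / 11) < 1e-9  # +400 elo -> ~0.909 winrate
    for player, ci in elo.items():
        print(player, "implied winrate vs A:", round(float(implied_winrate(ci.mean)), 3))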