import collections
import dataclasses
import itertools
import typing as tp
from functools import partial

import numpy as np
from scipy.optimize import minimize

DEFAULT_CI_SIZE = 0.95

@dataclasses.dataclass
class CI:
    lcb: tp.Any
    mean: tp.Any
    ucb: tp.Any
    size: tp.Any

    def to_dict(self):
        return dict(self.__dict__)
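
# A CI is a plain value container; to_dict is handy for logging. For example
# (hypothetical numbers):
#   CI(lcb=-31.2, mean=4.5, ucb=40.1, size=0.95).to_dict()
#   -> {"lcb": -31.2, "mean": 4.5, "ucb": 40.1, "size": 0.95}
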
T = tp.TypeVar("T")


def elo_ci(
    rankings: tp.Sequence[tp.Sequence[tuple[T, float]]],
    zero_reference: T,
    size: float = DEFAULT_CI_SIZE,
    rng: np.random.RandomState | None = None,
    min_games: int = 3,
    num_bootstrap_samples: int = 1_000,
) -> dict[T, CI]:
"""Estimate elo from rankings. | |
This does a nonparametric bootstrap of elo. | |
Args: | |
rankings: Each element is a sequence of (player, rank) pairs. The *lowest rank* player is the winner. So one if you have one matchup in which A beat B you'd have: rankings=[[(A, 0), (B, 1)]] | |
zero_reference: The policy to use as the zero reference. | |
Returns: | |
A dict mapping from policy to elo CI. | |
""" | |
    # ze recipe:
    # 1. for each bootstrap element:
    #    a. compute pairwise wins/losses from matchups.
    #    b. solve the elo loss.
    # 2. adjust for zero_reference.
    # 3. compute CI from bootstrapped elo values.
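    # For example, a single ranking [("A", 0), ("B", 1), ("C", 2)] contributes
    # the pairwise results A beats B, A beats C, and B beats C to every
    # bootstrap sample in which it is drawn.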
    rng = rng or np.random.RandomState()
    players = collections.Counter()
    for ranking in rankings:
        players.update([p for p, _ in ranking])
        if len(ranking) < 2:
            raise ValueError(f"Ranking {ranking} contains fewer than 2 players.")
    for player in players:
        if players[player] < min_games:
            raise ValueError(
                f"Player {player} has {players[player]} games, but at least"
                f" {min_games} are required for a sensible elo estimate."
            )
    if zero_reference not in players:
        raise ValueError(f"{zero_reference=} not in {players=}")
    # Make an order:
    players = list(players.keys())
    num_players = len(players)
    def loss_and_gradient(elos, n_wins):
        # elo loss: negative log-likelihood of the Bradley-Terry model,
        # loss = -sum_ij n_wins[i, j] * log(sigmoid(elo_i - elo_j)).
        elo_deltas = np.reshape(elos, (num_players, 1)) - np.reshape(elos, (1, num_players))
        predicted_wins = 1.0 / (1.0 + np.exp(-elo_deltas))
        loss_penalties = -np.log(predicted_wins)
        loss = np.sum(loss_penalties * n_wins)
        # analytic gradient, via the chain rule:
        dloss_dpredicted_wins = -n_wins / predicted_wins
        dpredicted_wins_delo_deltas = predicted_wins * (1.0 - predicted_wins)
        dloss_delo_deltas = dloss_dpredicted_wins * dpredicted_wins_delo_deltas
        # elo_i appears positively in row i of elo_deltas and negatively in column i.
        dloss_delos_positive = np.sum(dloss_delo_deltas, axis=1)
        dloss_delos_negative = -np.sum(dloss_delo_deltas, axis=0)
        dloss_delos = dloss_delos_positive + dloss_delos_negative
        gradient = dloss_delos
        return loss, gradient

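    # A quick way to sanity-check the analytic gradient (a sketch, not run
    # here; `x` and `w` stand in for arbitrary test inputs of the right shapes):
    #   l0, g = loss_and_gradient(x, w)
    #   numeric = [(loss_and_gradient(x + 1e-6 * e, w)[0] - l0) / 1e-6
    #              for e in np.eye(num_players)]
    #   np.testing.assert_allclose(g, numeric, rtol=1e-3)
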
    # pre-compute the all-pairs matchups:
    int_rankings = []
    for ranking in rankings:
        r = []
        for (player_a, rank_a), (player_b, rank_b) in itertools.combinations(ranking, 2):
            if player_a == player_b:
                continue
            if rank_a < rank_b:
                r.append([players.index(player_a), players.index(player_b), 1])
            elif rank_b < rank_a:
                r.append([players.index(player_b), players.index(player_a), 1])
            else:
                # A draw is half a win each.
                r.append([players.index(player_a), players.index(player_b), 0.5])
                r.append([players.index(player_b), players.index(player_a), 0.5])
        if len(r) > 0:
            int_rankings.append(r)
    def fill_n_wins_with_random_rankings():
        # Resample rankings with replacement and tally pairwise wins into the
        # n_wins buffer defined below (captured by closure).
        for ranking_idx in rng.choice(
            len(int_rankings),
            size=len(int_rankings),
            replace=True,
        ):
            for p0, p1, d in int_rankings[ranking_idx]:
                n_wins[p0, p1] += d

    elos = np.zeros([num_bootstrap_samples, num_players])
    n_wins = np.zeros([num_players, num_players])
    for i in range(num_bootstrap_samples):
        attempts = 0
        # This is a nasty heuristic that may cause incorrect results in adversarial cases:
        while attempts < 3:
            attempts += 1
            n_wins.fill(0)
            fill_n_wins_with_random_rankings()
            result = minimize(
                partial(loss_and_gradient, n_wins=n_wins),
                jac=True,
                x0=np.zeros([num_players]),
            )
            if result.success:
                break
        if not result.success:
            raise ValueError(f"minimization failed: {result=}")
        elos[i] = result.x
    # Normalize to the reference player.
    zero = elos[:, players.index(zero_reference)]
    elos -= zero[:, np.newaxis]
    # These are "natural" (base-e) elos.
    # convert to chess elo:
    elos *= 400 / np.log(10)
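    # Why 400 / ln 10: the model above uses p = 1 / (1 + e^(-d)), while chess
    # elo uses p = 1 / (1 + 10^(-d/400)); equating the two exponents gives
    # d_chess = d_natural * 400 / ln 10.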
    lcb = np.percentile(elos, (1 - size) / 2 * 100, axis=0)
    mean = np.mean(elos, axis=0)
    ucb = np.percentile(elos, (1 + size) / 2 * 100, axis=0)
    result = {}
    for i, player in enumerate(players):
        result[player] = CI(
            lcb=lcb[i],
            mean=mean[i],
            ucb=ucb[i],
            size=size,
        )
    return result

def implied_winrate(elos):
    """Using the 10^(d/400) formula from chess."""
    return 1.0 / (1.0 + 10 ** (-elos / 400))
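
# For example, a 400-point gap gives implied_winrate(400.0)
# == 1 / (1 + 10**-1) ≈ 0.909, and implied_winrate(0.0) == 0.5.
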
if __name__ == "__main__": | |
# Kick tires: | |
rankings = [ | |
[("A", 0), ("B", 1)], | |
[("A", 0), ("B", 1), ("C", 2)], | |
[("A", 0), ("B", 0), ("C", 2)], | |
[("A", 0), ("B", 0), ("C", 2)], | |
[("A", 0), ("AA", 1)], | |
[("A", 1), ("AA", 0)], | |
] * 20 | |
elo = elo_ci( | |
rankings, zero_reference="A", rng=np.random.RandomState(42), num_bootstrap_samples=10 | |
) | |
print(elo) | |
assert elo["A"].mean == 0 | |
assert elo["B"].mean < 0 | |
assert elo["C"].mean < elo["B"].mean | |
assert -5 < elo["AA"].mean < 5 |
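    # Rough downstream usage (a sketch building on the asserts above): turn the
    # point estimates into implied head-to-head winrates against the reference.
    for name, ci in elo.items():
        print(f"{name}: implied winrate vs A = {implied_winrate(ci.mean):.3f}")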