@N-McA · Created January 25, 2024 02:15
import collections
import dataclasses
import itertools
import typing as tp
from functools import partial

import numpy as np
from scipy.optimize import minimize

DEFAULT_CI_SIZE = 0.95


@dataclasses.dataclass
class CI:
    lcb: tp.Any
    mean: tp.Any
    ucb: tp.Any
    size: tp.Any

    def to_dict(self):
        return dict(self.__dict__)


T = tp.TypeVar("T")


def elo_ci(
    rankings: tp.Sequence[tp.Sequence[tuple[T, float]]],
    zero_reference: T,
    size: float = DEFAULT_CI_SIZE,
    rng: np.random.RandomState | None = None,
    min_games: int = 3,
    num_bootstrap_samples: int = 1_000,
) -> dict[T, CI]:
"""Estimate elo from rankings.
This does a nonparametric bootstrap of elo.
Args:
rankings: Each element is a sequence of (player, rank) pairs. The *lowest rank* player is the winner. So one if you have one matchup in which A beat B you'd have: rankings=[[(A, 0), (B, 1)]]
zero_reference: The policy to use as the zero reference.
Returns:
A dict mapping from policy to elo CI.
"""
# ze recipe:
# 1. for each bootstrap element:
# a. compute pairwise win/losses from matchups.
# b. solve elo loss
# 2. adjust for zero_reference
# 3. compute CI from bootstrapped elo values.
    rng = rng or np.random.RandomState()

    players = collections.Counter()
    for ranking in rankings:
        players.update([p for p, _ in ranking])
        if len(ranking) < 2:
            raise ValueError(f"Ranking {ranking} contains fewer than 2 players.")

    for player in players:
        if players[player] < min_games:
            raise ValueError(
                f"Player {player} has {players[player]} games, but at least"
                f" {min_games} are required for a sensible elo estimate."
            )

    if zero_reference not in players:
        raise ValueError(f"{zero_reference=} not in {players=}")

    # Make an order:
    players = list(players.keys())
    num_players = len(players)
    def loss_and_gradient(elos, n_wins):
        # elo loss
        elo_deltas = np.reshape(elos, (num_players, 1)) - np.reshape(elos, (1, num_players))
        predicted_wins = 1.0 / (1.0 + np.exp(-elo_deltas))
        loss_penalties = -np.log(predicted_wins)
        loss = np.sum(loss_penalties * n_wins)
        # analytic gradient
        dloss_dpredicted_wins = -n_wins / predicted_wins
        dpredicted_wins_delo_deltas = predicted_wins * (1.0 - predicted_wins)
        dloss_delo_deltas = dloss_dpredicted_wins * dpredicted_wins_delo_deltas
        dloss_delos_positive = np.sum(dloss_delo_deltas, axis=1)
        dloss_delos_negative = -np.sum(dloss_delo_deltas, axis=0)
        dloss_delos = dloss_delos_positive + dloss_delos_negative
        gradient = dloss_delos
        return loss, gradient
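
    # The loss above is the negative log-likelihood of a logistic
    # (Bradley-Terry) model, P(i beats j) = 1 / (1 + exp(-(elo_i - elo_j))),
    # with elos in natural-log units; they are rescaled to chess units below.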
    # pre-compute the all-pairs matchups:
    int_rankings = []
    for ranking in rankings:
        r = []
        for (player_a, rank_a), (player_b, rank_b) in itertools.combinations(ranking, 2):
            if player_a == player_b:
                continue
            if rank_a < rank_b:
                r.append([players.index(player_a), players.index(player_b), 1])
            elif rank_b < rank_a:
                r.append([players.index(player_b), players.index(player_a), 1])
            else:
                # A draw is half a win each.
                r.append([players.index(player_a), players.index(player_b), 0.5])
                r.append([players.index(player_b), players.index(player_a), 0.5])
        if len(r) > 0:
            int_rankings.append(r)
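
    # Note: the bootstrap below resamples whole rankings (not individual
    # pairwise games), so correlations between the pairwise outcomes derived
    # from one multi-player ranking are preserved in each resample.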
    def fill_n_wins_with_random_rankings():
        for ranking_idx in rng.choice(
            len(int_rankings),
            size=len(int_rankings),
            replace=True,
        ):
            for p0, p1, d in int_rankings[ranking_idx]:
                n_wins[p0, p1] += d

    elos = np.zeros([num_bootstrap_samples, num_players])
    n_wins = np.zeros([num_players, num_players])
    for i in range(num_bootstrap_samples):
        attempts = 0
        # This is a nasty heuristic that may cause incorrect results in adversarial cases:
        while attempts < 3:
            attempts += 1
            n_wins.fill(0)
            fill_n_wins_with_random_rankings()
            result = minimize(
                partial(loss_and_gradient, n_wins=n_wins),
                jac=True,
                x0=np.zeros([num_players]),
            )
            if result.success:
                break
        if not result.success:
            raise ValueError(f"minimization failed: {result=}")
        elos[i] = result.x
    # Normalize to the reference player.
    zero = elos[:, players.index(zero_reference)]
    elos -= zero[:, np.newaxis]

    # These are "natural" elos; convert to chess elo:
    elos *= 400 / np.log(10)

    lcb = np.percentile(elos, (1 - size) / 2 * 100, axis=0)
    mean = np.mean(elos, axis=0)
    ucb = np.percentile(elos, (1 + size) / 2 * 100, axis=0)

    result = {}
    for i, player in enumerate(players):
        result[player] = CI(
            lcb=lcb[i],
            mean=mean[i],
            ucb=ucb[i],
            size=size,
        )
    return result


def implied_winrate(elos):
    """Using the 10^(d/400) formula from chess."""
    return 1.0 / (1.0 + 10 ** (-elos / 400))


if __name__ == "__main__":
    # Kick tires:
    rankings = [
        [("A", 0), ("B", 1)],
        [("A", 0), ("B", 1), ("C", 2)],
        [("A", 0), ("B", 0), ("C", 2)],
        [("A", 0), ("B", 0), ("C", 2)],
        [("A", 0), ("AA", 1)],
        [("A", 1), ("AA", 0)],
    ] * 20
    elo = elo_ci(
        rankings, zero_reference="A", rng=np.random.RandomState(42), num_bootstrap_samples=10
    )
    print(elo)
    assert elo["A"].mean == 0
    assert elo["B"].mean < 0
    assert elo["C"].mean < elo["B"].mean
    assert -5 < elo["AA"].mean < 5
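
    # implied_winrate converts chess-scale elo differences back into expected
    # scores, so implied_winrate(ci.mean) estimates each player's win rate
    # against the zero_reference "A".
    assert implied_winrate(0.0) == 0.5  # an even matchup is a coin flip
    assert abs(implied_winrate(400.0) - 10 / 11) < 1e-9  # +400 elo -> ~0.909 winrate
    for player, ci in elo.items():
        print(player, "implied winrate vs A:", round(float(implied_winrate(ci.mean)), 3))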