Last active
November 30, 2023 01:36
-
-
Save kirillbobyrev/c16ef00e01370e9789d1d6b91fe63442 to your computer and use it in GitHub Desktop.
Scripts for "Analyzing long win streaks in online chess"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import pandas as pd | |
import numpy as np | |
from datetime import datetime | |
import matplotlib.pyplot as plt | |
from collections import Counter | |
# Chess.com username of the player whose games are analyzed.
PLAYER = "Hikaru"

# Date window (YYYY-MM-DD) used to select games by their end time.
START_DATE = "2023-01-01"
END_DATE = "2023-11-28"

# Minimum number of consecutive wins for each reported streak category.
SHORT_STREAK = 10
MEDIUM_STREAK = 15
LONG_STREAK = 20

# HTTP headers sent with every API request made by this script.
headers = {"user-agent": "kirillbobyrev/1.0.0"}
def get_player_games(player: str, year: str, month: str):
    """Download one monthly game archive of ``player`` from the Chess.com API.

    Args:
        player: Chess.com username. It is matched case-insensitively against
            the usernames stored in each game record.
        year: Year of the archive as it appears in the URL, e.g. ``"2023"``.
        month: Month of the archive as it appears in the URL, e.g. ``"01"``.

    Returns:
        A ``pandas.DataFrame`` (passed through ``convert_dtypes()``) with one
        row per game and the columns ``end_time``, ``url``, ``result``,
        ``white``, ``black``, ``white_rating``, ``black_rating``,
        ``time_control``, ``rated``, ``rules``, ``{player}_opponent_rating``
        and ``{player}_rating``. ``result`` is 1 when white won, -1 when black
        won and 0 otherwise.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the API does not answer within the timeout.
    """
    response = requests.get(
        f"https://api.chess.com/pub/player/{player}/games/{year}/{month}",
        headers=headers,
        # Without a timeout a stalled connection would hang the script forever.
        timeout=30,
    )
    # Fail loudly on HTTP errors instead of with a KeyError on "games" below.
    response.raise_for_status()

    # Use the requested player (not a module-level constant) to decide which
    # side of the board belongs to them.
    player_key = player.casefold()
    rows = []
    for game in response.json()["games"]:
        white = game["white"]
        black = game["black"]

        result = 0
        if white["result"] == "win":
            result = 1
        if black["result"] == "win":
            result = -1

        if white["username"].casefold() == player_key:
            player_rating, opponent_rating = white["rating"], black["rating"]
        else:
            player_rating, opponent_rating = black["rating"], white["rating"]

        rows.append(
            (
                game["end_time"],
                game["url"],
                result,
                white["username"],
                black["username"],
                white["rating"],
                black["rating"],
                game["time_control"],
                game["rated"],
                game["rules"],
                opponent_rating,
                player_rating,
            )
        )

    df = pd.DataFrame(
        rows,
        columns=[
            "end_time",
            "url",
            "result",
            "white",
            "black",
            "white_rating",
            "black_rating",
            "time_control",
            "rated",
            "rules",
            f"{player}_opponent_rating",
            f"{player}_rating",
        ],
    )
    return df.convert_dtypes()
# Get player's games archives: one download per month of 2023, from November
# back to January.
monthly_archives = [
    get_player_games(PLAYER, "2023", f"{month:02d}") for month in range(11, 0, -1)
]
df = pd.concat(monthly_archives)
# Filter games that are 3+0 blitz, rated, standard chess and strictly between
# START_DATE and END_DATE.
print(f"Finding games from {START_DATE} to {END_DATE}")
# "end_time" is compared against Unix timestamps, so the window bounds are taken
# at midnight UTC. A naive datetime's timestamp() would instead be interpreted
# in the local timezone of the machine and shift the window between machines.
window_start = pd.Timestamp(START_DATE, tz="UTC").timestamp()
window_end = pd.Timestamp(END_DATE, tz="UTC").timestamp()
df = (
    df[
        (df["time_control"] == "180")
        & (df["rules"] == "chess")
        & (df["end_time"] > window_start)
        & (df["end_time"] < window_end)
        & df["rated"]
    ]
    .set_index("end_time")
    .sort_index()
)
# Add a column that indicates whether the player won the game: either as white
# in a game white won, or as black in a game black won.
won_as_white = (df["white"] == PLAYER) & (df["result"] == 1)
won_as_black = (df["black"] == PLAYER) & (df["result"] == -1)
df[f"{PLAYER}_won"] = np.where(won_as_white | won_as_black, True, False)
# Print some statistics.
total_games = len(df)
print(f"{PLAYER} has played {total_games} filtered games in selected time period.")
won = df[f"{PLAYER}_won"].sum()
print(f"{PLAYER} won {won} out of these games.")
# Count rows whose result is 0 directly; looking up label 0 in value_counts()
# raises a KeyError when the selection contains no such game.
drew = (df["result"] == 0).sum()
print(f"{PLAYER} drew {drew} out of these games.")
# A win is worth one point and a draw half a point.
print(
    f"{PLAYER} scored a total of {won + drew / 2} points out of {total_games} "
    "possible."
)
# Print some rating statistics.
opponent_ratings = df[f"{PLAYER}_opponent_rating"]
mean_opponent_rating = opponent_ratings.mean()
print(f"The average opponent rating is {mean_opponent_rating:.2f}")
mean_player_rating = df[f"{PLAYER}_rating"].mean()
print(f"{PLAYER} average rating is {mean_player_rating:.2f}")
# Calculate opponent rating percentiles.
percentile_25, percentile_50, percentile_75, percentile_90 = (
    opponent_ratings.quantile(q) for q in (0.25, 0.50, 0.75, 0.90)
)
print(
    f"25%: {percentile_25:.2f}, 50%: {percentile_50:.2f}, "
    f"75%: {percentile_75:.2f}, 90%: {percentile_90:.2f}"
)
# Plot opponent rating distribution.
plot = df[f"{PLAYER}_opponent_rating"].plot.hist(bins=75, label="density", density=True)
plt.title(f"Opponent rating distribution for {PLAYER}")
plot.set_xlabel("Chess.com rating")
# The histogram is normalized (density=True), so the y-axis shows a density and
# not a raw count of games.
plot.set_ylabel("Density")
# Vertical markers for the mean (black) and the percentiles (red), drawn in
# this order so the legend entries keep the same order.
reference_lines = [
    (mean_opponent_rating, "k", "mean"),
    (percentile_25, "r", "25%"),
    (percentile_50, "r", "50%"),
    (percentile_75, "r", "75%"),
    (percentile_90, "r", "90%"),
]
for line_value, line_color, line_name in reference_lines:
    plot.axvline(
        line_value,
        color=line_color,
        linestyle="dashed",
        linewidth=2,
        label=f"{line_name} = {int(line_value)}",
    )
plot.legend()
plt.xlim(2200, 3300)
fig = plot.get_figure()
fig.savefig(f"{PLAYER}_opponent_ratings.png")
# Find longest winning streaks.
# Every entry of `streaks` is a tuple of (number of wins, mean opponent rating,
# player's rating in the first win, URL of the first win).
streaks = []
# (opponent rating, player rating, url) for each win of the ongoing streak.
current_wins = []
games = list(
    zip(
        df[f"{PLAYER}_won"],
        df[f"{PLAYER}_opponent_rating"],
        df[f"{PLAYER}_rating"],
        df["url"],
    )
)
# A trailing non-win closes a streak that lasts until the final game.
games.append((False, None, None, None))
for won_game, opponent_rating, player_rating, url in games:
    if won_game:
        current_wins.append((opponent_rating, player_rating, url))
        continue
    if not current_wins:
        continue
    _, starting_rating, first_url = current_wins[0]
    streaks.append(
        (
            len(current_wins),
            np.mean([rating for rating, _, _ in current_wins]),
            starting_rating,
            first_url,
        )
    )
    current_wins = []
# Number of longest streaks to list.
TOP_K = 10
print(f"{PLAYER}'s top {TOP_K} winning streaks:")
# Streak tuples start with their length, so a descending sort lists the
# longest streaks first.
longest_streaks = sorted(streaks, reverse=True)[:TOP_K]
for length, mean_rating, starting_rating, first_game_url in longest_streaks:
    print(
        f"{length} games, average opponent rating "
        f"{mean_rating:.2f}, starting {PLAYER} rating "
        f"{starting_rating:.2f}, starting at {first_game_url}"
    )
# The first element of every streak tuple is the streak's length.
streak_lengths = [length for length, *_ in streaks]
print("Streaks:")
print(sorted(streak_lengths, reverse=True)[:30])
# Count how many streaks reach each threshold; a streak that reaches a higher
# threshold is also counted under the lower ones.
short_streaks = sum(length >= SHORT_STREAK for length in streak_lengths)
print(f"Short streaks of {SHORT_STREAK}+ wins: {short_streaks}")
medium_streaks = sum(length >= MEDIUM_STREAK for length in streak_lengths)
print(f"Medium streaks of {MEDIUM_STREAK}+ wins: {medium_streaks}")
long_streaks = sum(length >= LONG_STREAK for length in streak_lengths)
print(f"Long streaks of {LONG_STREAK}+ wins: {long_streaks}")
# Plot the distribution of win streaks that are at least 5 games long.
trimmed_lengths = [length for length in streak_lengths if length >= 5]
plt.clf()  # Start from an empty figure.
plt.hist(trimmed_lengths, bins=20)
plt.xlim(5, 60)
plt.savefig("win_streaks_distribution.png")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdlib> | |
#include <iostream> | |
#include <random> | |
using namespace std; | |
// Number of simulated game sequences between two progress reports.
constexpr int kNumSimulationRunsPerBatch = 100'000;

// Number of games the player played this year.
constexpr int kNumTrials = 908;

// Minimum number of consecutive wins to count as a streak.
constexpr int kStreakMinLength = 32;

// Minimum number of streaks to count as a success.
constexpr int kMinStreaks = 1;

// Probability of winning a game calculated using Elo win probability formula.
constexpr double kWinProbability = 0.83;

// Random engine shared by the whole simulation and the uniform distribution
// over [0, 1) that is sampled once per simulated game.
default_random_engine generator;
uniform_real_distribution<double> distribution{0.0, 1.0};
// Simulates a single game: draws a uniform number from `distribution` and
// reports a win when it falls below kWinProbability.
bool sample_win() {
  const double roll = distribution(generator);
  return roll < kWinProbability;
}
// Returns whether or not the sequence of kNumTrials games contains kMinStreaks | |
// or more streaks of kMinStreakLength wins. | |
bool sample_sequence() { | |
int streak = 0; | |
int num_streaks = 0; | |
for (int i = 0; i < kNumTrials; ++i) { | |
if (sample_win()) { | |
++streak; | |
continue; | |
} | |
if (streak >= kStreakMinLength) { | |
++num_streaks; | |
} | |
streak = 0; | |
} | |
if (streak >= kStreakMinLength) { | |
++num_streaks; | |
} | |
return num_streaks >= kMinStreaks; | |
} | |
int main() { | |
srand(time(NULL)); | |
cout << "Starting Monte Carlo simulation." << endl; | |
cout << "Number of simulations per batch: " << kNumSimulationRunsPerBatch | |
<< endl; | |
cout << "Number of trials: " << kNumTrials << endl; | |
cout << "Minimum streak length: " << kStreakMinLength << endl; | |
cout << "Minimum number of streaks: " << kMinStreaks << endl; | |
cout << "Win probability: " << kWinProbability << endl; | |
long long int sequences = 0; | |
long long int successes = 0; | |
while (true) { | |
for (int i = 0; i < kNumSimulationRunsPerBatch; ++i) { | |
bool had_enough_streaks = sample_sequence(); | |
++sequences; | |
if (had_enough_streaks) { | |
++successes; | |
} | |
} | |
cout << "Simulations: " << sequences << " with " << kMinStreaks | |
<< " or more " << kStreakMinLength | |
<< "-game win streaks: " << successes << " probability: " | |
<< static_cast<double>(successes) / static_cast<double>(sequences) | |
<< endl; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment