kirillbobyrev/eda.py

## eda.py
import requests
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from collections import Counter

# Username whose games are downloaded from the chess.com API and analysed.
PLAYER = "Hikaru"
# Date window (YYYY-MM-DD) that a game's end time must fall into.
START_DATE = "2023-01-01"
END_DATE = "2023-11-28"
# Minimum number of consecutive wins for the long / medium / short streak
# counters printed at the end of the script.
LONG_STREAK = 20
MEDIUM_STREAK = 15
SHORT_STREAK = 10

# HTTP headers sent with every API request.
headers = {"user-agent": "kirillbobyrev/1.0.0"}


def get_player_games(player: str, year: str, month: str) -> pd.DataFrame:
    """Download one monthly chess.com game archive of ``player``.

    Args:
        player: Username whose archive is requested. It is also the name used
            to tell the player's own rating from the opponent's in each game.
        year: Year of the archive as it appears in the URL, e.g. ``"2023"``.
        month: Zero-padded month of the archive, e.g. ``"09"``.

    Returns:
        A DataFrame with one row per game and the columns ``end_time``,
        ``url``, ``result`` (1 = white won, -1 = black won, 0 = neither side
        won), ``white``, ``black``, ``white_rating``, ``black_rating``,
        ``time_control``, ``rated``, ``rules``, ``{player}_opponent_rating``
        and ``{player}_rating``, passed through ``convert_dtypes()``.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    response = requests.get(
        f"https://api.chess.com/pub/player/{player}/games/{year}/{month}",
        headers=headers,
    )
    # Surface HTTP errors directly instead of failing later with a KeyError
    # on the missing "games" key.
    response.raise_for_status()

    rows = []
    for game in response.json()["games"]:
        white = game["white"]
        black = game["black"]

        # A black win takes precedence, as it did when the two checks were
        # independent `if` statements.
        if black["result"] == "win":
            result = -1
        elif white["result"] == "win":
            result = 1
        else:
            result = 0

        # Use the `player` argument (not a module-level constant) to decide
        # which side belongs to the requested player.
        if white["username"] == player:
            player_rating, opponent_rating = white["rating"], black["rating"]
        else:
            player_rating, opponent_rating = black["rating"], white["rating"]

        rows.append(
            (
                game["end_time"],
                game["url"],
                result,
                white["username"],
                black["username"],
                white["rating"],
                black["rating"],
                game["time_control"],
                game["rated"],
                game["rules"],
                opponent_rating,
                player_rating,
            )
        )

    columns = [
        "end_time",
        "url",
        "result",
        "white",
        "black",
        "white_rating",
        "black_rating",
        "time_control",
        "rated",
        "rules",
        f"{player}_opponent_rating",
        f"{player}_rating",
    ]
    return pd.DataFrame(rows, columns=columns).convert_dtypes()


# Download the 2023 monthly archives, November back to January, and stack
# them into a single frame.
df = pd.concat(
    [
        get_player_games(PLAYER, "2023", f"{month:02d}")
        for month in range(11, 0, -1)
    ]
)

# Filter games that are 3+0 blitz, rated, standard chess and strictly between
# START_DATE and END_DATE.
print(f"Finding games from {START_DATE} to {END_DATE}")
# The bounds are compared with end_time as epoch seconds. Pin them to UTC
# midnight: a naive datetime's .timestamp() is interpreted in the machine's
# local timezone, which would move the window from one computer to the next.
window_start = pd.Timestamp(START_DATE, tz="UTC").timestamp()
window_end = pd.Timestamp(END_DATE, tz="UTC").timestamp()
df = (
    df[
        (df["time_control"] == "180")
        & (df["rules"] == "chess")
        & (df["end_time"] > window_start)
        & (df["end_time"] < window_end)
        & df["rated"]
    ]
    .set_index("end_time")
    .sort_index()
)

# Flag the games PLAYER won. "result" is 1 for a white win and -1 for a black
# win, so the winning code depends on the side PLAYER had.
won_with_white = (df["white"] == PLAYER) & (df["result"] == 1)
won_with_black = (df["black"] == PLAYER) & (df["result"] == -1)
df[f"{PLAYER}_won"] = np.where(won_with_white | won_with_black, True, False)

# Print some statistics.
total_games = len(df)
print(f"{PLAYER} has played {total_games} filtered games in selected time period.")
won = df[f"{PLAYER}_won"].sum()
print(f"{PLAYER} won {won} out of these games.")
# Count draws (result == 0) directly: value_counts()[0] raises KeyError when
# the filtered games contain no draw at all.
drew = (df["result"] == 0).sum()
print(f"{PLAYER} drew {drew} out of these games.")
# A win is worth one point and a draw half a point.
print(
    f"{PLAYER} scored a total of {won + drew / 2} points out of {total_games} "
    "possible."
)

# Average ratings of the opponents and of PLAYER over the filtered games.
opponent_ratings = df[f"{PLAYER}_opponent_rating"]
mean_opponent_rating = opponent_ratings.mean()
print(f"The average opponent rating is {mean_opponent_rating:.2f}")
mean_player_rating = df[f"{PLAYER}_rating"].mean()
print(f"{PLAYER} average rating is {mean_player_rating:.2f}")

# Opponent rating quartiles plus the 90th percentile.
percentile_25, percentile_50, percentile_75, percentile_90 = (
    opponent_ratings.quantile(q) for q in (0.25, 0.50, 0.75, 0.90)
)

print(
    f"25%: {percentile_25:.2f}, 50%: {percentile_50:.2f}, "
    f"75%: {percentile_75:.2f}, 90%: {percentile_90:.2f}"
)

# Plot opponent rating distribution.
plot = df[f"{PLAYER}_opponent_rating"].plot.hist(bins=75, label="density", density=True)
plt.title(f"Opponent rating distribution for {PLAYER}")
plot.set_xlabel("Chess.com rating")
# density=True normalises the histogram to unit area, so the y axis is a
# probability density, not a count of games.
plot.set_ylabel("Density")
# Dashed vertical markers: the mean in black, the percentiles in red.
for marker_value, marker_color, marker_name in (
    (mean_opponent_rating, "k", "mean"),
    (percentile_25, "r", "25%"),
    (percentile_50, "r", "50%"),
    (percentile_75, "r", "75%"),
    (percentile_90, "r", "90%"),
):
    plot.axvline(
        marker_value,
        color=marker_color,
        linestyle="dashed",
        linewidth=2,
        label=f"{marker_name} = {int(marker_value)}",
    )
plot.legend()
plt.xlim(2200, 3300)
fig = plot.get_figure()
fig.savefig(f"{PLAYER}_opponent_ratings.png")

# Find longest winning streaks.

# One entry per winning run: (length, mean opponent rating, PLAYER's rating in
# the first game of the run, URL of that first game).
streaks = []
# Wins of the run currently in progress, oldest first, stored as
# (opponent rating, PLAYER rating, game URL).
current_run = []

for won_game, opponent_rating, player_rating, url in zip(
    df[f"{PLAYER}_won"],
    df[f"{PLAYER}_opponent_rating"],
    df[f"{PLAYER}_rating"],
    df["url"],
):
    if won_game:
        current_run.append((opponent_rating, player_rating, url))
    elif current_run:
        # A non-win closes the run in progress.
        streaks.append(
            (
                len(current_run),
                np.mean([opp for opp, _, _ in current_run]),
                current_run[0][1],
                current_run[0][2],
            )
        )
        current_run = []

# A run still open after the last game has not been recorded yet.
if current_run:
    streaks.append(
        (
            len(current_run),
            np.mean([opp for opp, _, _ in current_run]),
            current_run[0][1],
            current_run[0][2],
        )
    )

TOP_K = 10
print(f"{PLAYER}'s top {TOP_K} winning streaks:")
# Streak tuples start with their length, so a descending sort of the tuples
# themselves lists the longest streaks first.
ranked_streaks = list(streaks)
ranked_streaks.sort(reverse=True)
for streak, average_opponent_rating, player_rating, url in ranked_streaks[:TOP_K]:
    summary = (
        f"{streak} games, average opponent rating "
        f"{average_opponent_rating:.2f}, starting {PLAYER} rating "
        f"{player_rating:.2f}, starting at {url}"
    )
    print(summary)

# Lengths of all winning streaks, in the order the streaks occurred.
streak_lengths = [item[0] for item in streaks]
print("Streaks:")
print(sorted(streak_lengths, reverse=True)[:30])
# Count the streaks that reach each threshold.
short_streaks = sum(length >= SHORT_STREAK for length in streak_lengths)
print(f"Short streaks of {SHORT_STREAK}+ wins: {short_streaks}")
medium_streaks = sum(length >= MEDIUM_STREAK for length in streak_lengths)
print(f"Medium streaks of {MEDIUM_STREAK}+ wins: {medium_streaks}")
long_streaks = sum(length >= LONG_STREAK for length in streak_lengths)
# This line reports the LONG_STREAK threshold, so it is labelled "Long".
print(f"Long streaks of {LONG_STREAK}+ wins: {long_streaks}")
# Only streaks of at least 5 wins go into the histogram.
trimmed_lengths = [length for length in streak_lengths if length >= 5]
plt.clf()
plt.hist(trimmed_lengths, bins=20)
plt.xlim(5, 60)
plt.savefig("win_streaks_distribution.png")

## monte_carlo.cc
#include <cstdlib>
#include <iostream>
#include <random>

using namespace std;

// Number of simulated game sequences between two progress reports.
constexpr int kNumSimulationRunsPerBatch = 100'000;
// Number of games the player played this year.
constexpr int kNumTrials = 908;
// Minimum number of consecutive wins to count as a streak.
constexpr int kStreakMinLength = 32;
// Minimum number of streaks to count as a success.
constexpr int kMinStreaks = 1;
// Probability of winning a game calculated using Elo win probability formula.
constexpr double kWinProbability = .83;

// Random engine shared by every sampled game; default-constructed here.
default_random_engine generator;
// Uniform distribution over [0, 1) whose draws are compared with
// kWinProbability.
uniform_real_distribution<double> distribution(0.0, 1.0);

// Simulates one game: returns true when a uniform draw from `distribution`
// falls below kWinProbability.
bool sample_win() {
  const double roll = distribution(generator);
  return roll < kWinProbability;
}

// Simulates kNumTrials consecutive games and reports whether they contain at
// least kMinStreaks winning runs of kStreakMinLength or more games.
bool sample_sequence() {
  int qualifying_runs = 0;
  int current_run = 0;
  for (int game = 0; game < kNumTrials; ++game) {
    if (sample_win()) {
      ++current_run;
    } else {
      // A non-win ends the run in progress; count it if it was long enough.
      if (current_run >= kStreakMinLength) {
        ++qualifying_runs;
      }
      current_run = 0;
    }
  }
  // A run still in progress after the last game has not been counted yet.
  if (current_run >= kStreakMinLength) {
    ++qualifying_runs;
  }
  return qualifying_runs >= kMinStreaks;
}

int main() {
  srand(time(NULL));
  cout << "Starting Monte Carlo simulation." << endl;
  cout << "Number of simulations per batch: " << kNumSimulationRunsPerBatch
       << endl;
  cout << "Number of trials: " << kNumTrials << endl;
  cout << "Minimum streak length: " << kStreakMinLength << endl;
  cout << "Minimum number of streaks: " << kMinStreaks << endl;
  cout << "Win probability: " << kWinProbability << endl;
  long long int sequences = 0;
  long long int successes = 0;
  while (true) {
    for (int i = 0; i < kNumSimulationRunsPerBatch; ++i) {
      bool had_enough_streaks = sample_sequence();
      ++sequences;
      if (had_enough_streaks) {
        ++successes;
      }
    }
    cout << "Simulations: " << sequences << " with " << kMinStreaks
         << " or more " << kStreakMinLength
         << "-game win streaks: " << successes << " probability: "
         << static_cast<double>(successes) / static_cast<double>(sequences)
         << endl;
  }
}
	import requests
	import pandas as pd
	import numpy as np
	from datetime import datetime
	import matplotlib.pyplot as plt
	from collections import Counter

	# Username whose games are downloaded from the chess.com API and analysed.
	PLAYER = "Hikaru"
	# Date window (YYYY-MM-DD) that a game's end time must fall into.
	START_DATE = "2023-01-01"
	END_DATE = "2023-11-28"
	# Minimum number of consecutive wins for the long / medium / short streak
	# counters printed at the end of the script.
	LONG_STREAK = 20
	MEDIUM_STREAK = 15
	SHORT_STREAK = 10

	# HTTP headers sent with every API request.
	headers = {"user-agent": "kirillbobyrev/1.0.0"}


	def get_player_games(player: str, year: str, month: str) -> pd.DataFrame:
	    """Download one monthly chess.com game archive of ``player``.

	    Args:
	        player: Username whose archive is requested. It is also the name
	            used to tell the player's own rating from the opponent's.
	        year: Year of the archive as it appears in the URL, e.g. ``"2023"``.
	        month: Zero-padded month of the archive, e.g. ``"09"``.

	    Returns:
	        A DataFrame with one row per game and the columns ``end_time``,
	        ``url``, ``result`` (1 = white won, -1 = black won, 0 = neither
	        side won), ``white``, ``black``, ``white_rating``,
	        ``black_rating``, ``time_control``, ``rated``, ``rules``,
	        ``{player}_opponent_rating`` and ``{player}_rating``, passed
	        through ``convert_dtypes()``.

	    Raises:
	        requests.HTTPError: If the API responds with an error status.
	    """
	    response = requests.get(
	        f"https://api.chess.com/pub/player/{player}/games/{year}/{month}",
	        headers=headers,
	    )
	    # Surface HTTP errors directly instead of failing later with a
	    # KeyError on the missing "games" key.
	    response.raise_for_status()

	    rows = []
	    for game in response.json()["games"]:
	        white = game["white"]
	        black = game["black"]

	        # A black win takes precedence, as it did when the two checks
	        # were independent `if` statements.
	        if black["result"] == "win":
	            result = -1
	        elif white["result"] == "win":
	            result = 1
	        else:
	            result = 0

	        # Use the `player` argument (not a module-level constant) to
	        # decide which side belongs to the requested player.
	        if white["username"] == player:
	            player_rating, opponent_rating = white["rating"], black["rating"]
	        else:
	            player_rating, opponent_rating = black["rating"], white["rating"]

	        rows.append(
	            (
	                game["end_time"],
	                game["url"],
	                result,
	                white["username"],
	                black["username"],
	                white["rating"],
	                black["rating"],
	                game["time_control"],
	                game["rated"],
	                game["rules"],
	                opponent_rating,
	                player_rating,
	            )
	        )

	    columns = [
	        "end_time",
	        "url",
	        "result",
	        "white",
	        "black",
	        "white_rating",
	        "black_rating",
	        "time_control",
	        "rated",
	        "rules",
	        f"{player}_opponent_rating",
	        f"{player}_rating",
	    ]
	    return pd.DataFrame(rows, columns=columns).convert_dtypes()


	# Download the 2023 monthly archives, November back to January, and stack
	# them into a single frame.
	df = pd.concat(
	    [
	        get_player_games(PLAYER, "2023", f"{month:02d}")
	        for month in range(11, 0, -1)
	    ]
	)

	# Filter games that are 3+0 blitz, rated, standard chess and strictly
	# between START_DATE and END_DATE.
	print(f"Finding games from {START_DATE} to {END_DATE}")
	# The bounds are compared with end_time as epoch seconds. Pin them to UTC
	# midnight: a naive datetime's .timestamp() is interpreted in the machine's
	# local timezone, which would move the window from one computer to the next.
	window_start = pd.Timestamp(START_DATE, tz="UTC").timestamp()
	window_end = pd.Timestamp(END_DATE, tz="UTC").timestamp()
	df = (
	    df[
	        (df["time_control"] == "180")
	        & (df["rules"] == "chess")
	        & (df["end_time"] > window_start)
	        & (df["end_time"] < window_end)
	        & df["rated"]
	    ]
	    .set_index("end_time")
	    .sort_index()
	)

	# Flag the games PLAYER won. "result" is 1 for a white win and -1 for a
	# black win, so the winning code depends on the side PLAYER had.
	won_with_white = (df["white"] == PLAYER) & (df["result"] == 1)
	won_with_black = (df["black"] == PLAYER) & (df["result"] == -1)
	# The two cases are joined with a plain `|`; a backslash-escaped pipe is
	# not valid Python.
	df[f"{PLAYER}_won"] = np.where(won_with_white | won_with_black, True, False)

	# Print some statistics.
	total_games = len(df)
	print(f"{PLAYER} has played {total_games} filtered games in selected time period.")
	won = df[f"{PLAYER}_won"].sum()
	print(f"{PLAYER} won {won} out of these games.")
	# Count draws (result == 0) directly: value_counts()[0] raises KeyError
	# when the filtered games contain no draw at all.
	drew = (df["result"] == 0).sum()
	print(f"{PLAYER} drew {drew} out of these games.")
	# A win is worth one point and a draw half a point.
	print(
	    f"{PLAYER} scored a total of {won + drew / 2} points out of {total_games} "
	    "possible."
	)

	# Average ratings of the opponents and of PLAYER over the filtered games.
	opponent_ratings = df[f"{PLAYER}_opponent_rating"]
	mean_opponent_rating = opponent_ratings.mean()
	print(f"The average opponent rating is {mean_opponent_rating:.2f}")
	mean_player_rating = df[f"{PLAYER}_rating"].mean()
	print(f"{PLAYER} average rating is {mean_player_rating:.2f}")

	# Opponent rating quartiles plus the 90th percentile.
	percentile_25, percentile_50, percentile_75, percentile_90 = (
	    opponent_ratings.quantile(q) for q in (0.25, 0.50, 0.75, 0.90)
	)

	print(
	    f"25%: {percentile_25:.2f}, 50%: {percentile_50:.2f}, "
	    f"75%: {percentile_75:.2f}, 90%: {percentile_90:.2f}"
	)

	# Plot opponent rating distribution.
	plot = df[f"{PLAYER}_opponent_rating"].plot.hist(bins=75, label="density", density=True)
	plt.title(f"Opponent rating distribution for {PLAYER}")
	plot.set_xlabel("Chess.com rating")
	# density=True normalises the histogram to unit area, so the y axis is a
	# probability density, not a count of games.
	plot.set_ylabel("Density")
	# Dashed vertical markers: the mean in black, the percentiles in red.
	for marker_value, marker_color, marker_name in (
	    (mean_opponent_rating, "k", "mean"),
	    (percentile_25, "r", "25%"),
	    (percentile_50, "r", "50%"),
	    (percentile_75, "r", "75%"),
	    (percentile_90, "r", "90%"),
	):
	    plot.axvline(
	        marker_value,
	        color=marker_color,
	        linestyle="dashed",
	        linewidth=2,
	        label=f"{marker_name} = {int(marker_value)}",
	    )
	plot.legend()
	plt.xlim(2200, 3300)
	fig = plot.get_figure()
	fig.savefig(f"{PLAYER}_opponent_ratings.png")

	# Find longest winning streaks.

	# One entry per winning run: (length, mean opponent rating, PLAYER's rating
	# in the first game of the run, URL of that first game).
	streaks = []
	# Wins of the run currently in progress, oldest first, stored as
	# (opponent rating, PLAYER rating, game URL).
	current_run = []

	for won_game, opponent_rating, player_rating, url in zip(
	    df[f"{PLAYER}_won"],
	    df[f"{PLAYER}_opponent_rating"],
	    df[f"{PLAYER}_rating"],
	    df["url"],
	):
	    if won_game:
	        current_run.append((opponent_rating, player_rating, url))
	    elif current_run:
	        # A non-win closes the run in progress.
	        streaks.append(
	            (
	                len(current_run),
	                np.mean([opp for opp, _, _ in current_run]),
	                current_run[0][1],
	                current_run[0][2],
	            )
	        )
	        current_run = []

	# A run still open after the last game has not been recorded yet.
	if current_run:
	    streaks.append(
	        (
	            len(current_run),
	            np.mean([opp for opp, _, _ in current_run]),
	            current_run[0][1],
	            current_run[0][2],
	        )
	    )

	TOP_K = 10
	print(f"{PLAYER}'s top {TOP_K} winning streaks:")
	# Streak tuples start with their length, so a descending sort of the tuples
	# themselves lists the longest streaks first.
	for streak, average_opponent_rating, player_rating, url in sorted(
	    streaks, reverse=True
	)[:TOP_K]:
	    # The loop body must be indented under the `for` to be valid Python.
	    print(
	        f"{streak} games, average opponent rating "
	        f"{average_opponent_rating:.2f}, starting {PLAYER} rating "
	        f"{player_rating:.2f}, starting at {url}"
	    )

	# Lengths of all winning streaks, in the order the streaks occurred.
	streak_lengths = [item[0] for item in streaks]
	print("Streaks:")
	print(sorted(streak_lengths, reverse=True)[:30])
	# Count the streaks that reach each threshold.
	short_streaks = sum(length >= SHORT_STREAK for length in streak_lengths)
	print(f"Short streaks of {SHORT_STREAK}+ wins: {short_streaks}")
	medium_streaks = sum(length >= MEDIUM_STREAK for length in streak_lengths)
	print(f"Medium streaks of {MEDIUM_STREAK}+ wins: {medium_streaks}")
	long_streaks = sum(length >= LONG_STREAK for length in streak_lengths)
	# This line reports the LONG_STREAK threshold, so it is labelled "Long".
	print(f"Long streaks of {LONG_STREAK}+ wins: {long_streaks}")
	# Only streaks of at least 5 wins go into the histogram.
	trimmed_lengths = [length for length in streak_lengths if length >= 5]
	plt.clf()
	plt.hist(trimmed_lengths, bins=20)
	plt.xlim(5, 60)
	plt.savefig("win_streaks_distribution.png")
	#include <cstdlib>
	#include <iostream>
	#include <random>

	using namespace std;

	// Number of simulated game sequences between two progress reports.
	constexpr int kNumSimulationRunsPerBatch = 100'000;
	// Number of games the player played this year.
	constexpr int kNumTrials = 908;
	// Minimum number of consecutive wins to count as a streak.
	constexpr int kStreakMinLength = 32;
	// Minimum number of streaks to count as a success.
	constexpr int kMinStreaks = 1;
	// Probability of winning a game calculated using Elo win probability formula.
	constexpr double kWinProbability = .83;

	// Random engine shared by every sampled game; default-constructed here.
	default_random_engine generator;
	// Uniform distribution over [0, 1) whose draws are compared with
	// kWinProbability.
	uniform_real_distribution<double> distribution(0.0, 1.0);

	// Simulates one game: returns true when a uniform draw from `distribution`
	// falls below kWinProbability.
	bool sample_win() {
	  const double roll = distribution(generator);
	  return roll < kWinProbability;
	}

	// Simulates kNumTrials consecutive games and reports whether they contain
	// at least kMinStreaks winning runs of kStreakMinLength or more games.
	bool sample_sequence() {
	  int qualifying_runs = 0;
	  int current_run = 0;
	  for (int game = 0; game < kNumTrials; ++game) {
	    if (sample_win()) {
	      ++current_run;
	    } else {
	      // A non-win ends the run in progress; count it if it was long enough.
	      if (current_run >= kStreakMinLength) {
	        ++qualifying_runs;
	      }
	      current_run = 0;
	    }
	  }
	  // A run still in progress after the last game has not been counted yet.
	  if (current_run >= kStreakMinLength) {
	    ++qualifying_runs;
	  }
	  return qualifying_runs >= kMinStreaks;
	}

	int main() {
	srand(time(NULL));
	cout << "Starting Monte Carlo simulation." << endl;
	cout << "Number of simulations per batch: " << kNumSimulationRunsPerBatch
	<< endl;
	cout << "Number of trials: " << kNumTrials << endl;
	cout << "Minimum streak length: " << kStreakMinLength << endl;
	cout << "Minimum number of streaks: " << kMinStreaks << endl;
	cout << "Win probability: " << kWinProbability << endl;
	long long int sequences = 0;
	long long int successes = 0;
	while (true) {
	for (int i = 0; i < kNumSimulationRunsPerBatch; ++i) {
	bool had_enough_streaks = sample_sequence();
	++sequences;
	if (had_enough_streaks) {
	++successes;
	}
	}
	cout << "Simulations: " << sequences << " with " << kMinStreaks
	<< " or more " << kStreakMinLength
	<< "-game win streaks: " << successes << " probability: "
	<< static_cast<double>(successes) / static_cast<double>(sequences)
	<< endl;
	}
	}