# Source: GitHub Gist zzstoatzz/5d012f141a3d15e5f080d80c363af829 by @zzstoatzz
# (last active May 25, 2022 15:27).
from chessdotcom import get_player_game_archives
from io import StringIO
from typing import List, Tuple
import boto3, chess.pgn as pgn, pandas as pd, requests
# https://python-chess.readthedocs.io/en/latest/pgn.html
class Game(pgn.Game):
    """A single game parsed from PGN text and flattened into a one-row DataFrame.

    Attributes:
        game_obj: A copy of the parsed game's attribute dict, with the
            'variations' entry replaced by the string form of its first
            variation (or '' when the game has no moves).
        df: A DataFrame built from the PGN headers, plus a 'pgn' column
            holding that same string.
    """

    def __init__(self, pgn_str: str):
        """Parse *pgn_str* and build ``game_obj`` and ``df``.

        Args:
            pgn_str: The PGN text of one game.

        Raises:
            ValueError: If no game can be read from *pgn_str*.
        """
        # Initialise the pgn.Game base so inherited methods have their state.
        super().__init__()

        parsed = pgn.read_game(StringIO(pgn_str))
        if parsed is None:
            # read_game signals "nothing to read" with None rather than raising.
            raise ValueError("pgn_str does not contain a PGN game")

        # Work on a copy so the parsed game's own attributes stay untouched.
        self.game_obj = dict(vars(parsed))
        variations = self.game_obj['variations']
        # A game without any moves has no variations to stringify.
        self.game_obj['variations'] = str(variations[0]) if variations else ''

        self.df = pd.json_normalize(dict(self.game_obj['headers']))
        self.df['pgn'] = self.game_obj['variations']
def get_games(url: str, timeout: float = 30.0) -> List[Game]:
    """Download one archive URL and parse every game it lists.

    Args:
        url: URL whose JSON response has a 'games' list; each entry must
            carry a 'pgn' string.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        One ``Game`` per entry in the response's 'games' list.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    print(f"GET {url}")
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of on a confusing JSON/KeyError later.
    response.raise_for_status()
    return [Game(game['pgn']) for game in response.json()['games']]
def load_games(games: List["Game"], base_path: str) -> None:
    """Write a batch of games to one gzip-compressed parquet file.

    The file is named after the year and month taken from the 'Date'
    header of the first game: ``{base_path}/games/{year}_{month}.parquet.gzip``.

    Args:
        games: Games to store; each must expose a ``df`` DataFrame that
            has a 'Date' column formatted as 'YYYY.MM.DD'.
        base_path: Directory or URL prefix under which 'games/' lives.
    """
    frames = [game.df for game in games]
    if not frames:
        # pd.concat raises ValueError on an empty list; nothing to write anyway.
        print('no games to store, skipping...')
        return

    # Every per-game frame is indexed from 0, so renumber to keep rows unique.
    df = pd.concat(frames, ignore_index=True)
    year, month, _ = df['Date'].iloc[0].split('.')
    print(f'storing games from month {month} of year {year}...')
    filepath = f'{base_path}/games/{year}_{month}.parquet.gzip'
    df.to_parquet(filepath, compression='gzip')
def orca(filepath: str) -> None:
    """Fetch every monthly archive for each tracked user and store it.

    Months are downloaded and stored one at a time, so only a single
    month of games is held in memory and months already written survive
    a failure on a later one.

    Args:
        filepath: Base path passed to ``load_games`` as ``base_path``.
    """
    for username in ['n80n8']:
        print(f"Checking for games on chess.com from: {username}")
        # list of URLs to GET months of games from
        archive_urls = list(get_player_game_archives(username=username).archives)
        if not archive_urls:
            print(f'No new months of games to load for {username}!')
            continue
        # Announce the work before doing it, not after the downloads finish.
        print(f'Fetching {len(archive_urls)} new months of games from {username}..')
        for url in archive_urls:
            load_games(games=get_games(url), base_path=filepath)
if __name__ == "__main__":
    # Script entry point: store all games under the S3 bucket root below.
    bucket_root = 's3://nate-all-purpose-bucket'
    orca(filepath=bucket_root)
# (GitHub page footer: sign up or sign in on GitHub to comment on this gist.)