Skip to content

Instantly share code, notes, and snippets.

@zzstoatzz
Last active June 8, 2023 22:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zzstoatzz/01fad7fe4db4e649527f7f98c6984b4a to your computer and use it in GitHub Desktop.
Save zzstoatzz/01fad7fe4db4e649527f7f98c6984b4a to your computer and use it in GitHub Desktop.
from chessdotcom import get_player_game_archives
from io import StringIO
from prefect import flow, task, get_run_logger
from typing import List, Tuple
import boto3, chess.pgn as pgn, pandas as pd, requests
# https://python-chess.readthedocs.io/en/latest/pgn.html
class Game(pgn.Game):
    """A chess game parsed from PGN text and flattened into a one-row DataFrame.

    Attributes set by ``__init__``:
        game_obj: Copy of the parsed game's attribute dict, with the
            ``'variations'`` entry replaced by a string (the text of the
            first variation, or ``''`` when the game has no moves).
        df: One-row ``pandas.DataFrame`` with one column per PGN header plus
            a ``'pgn'`` column holding the ``'variations'`` string.
    """

    def __init__(self, pgn_str: str):
        """Parse ``pgn_str`` and build ``game_obj`` and ``df``.

        Args:
            pgn_str: PGN text containing a game.

        Raises:
            ValueError: If no game can be read from ``pgn_str``.
        """
        parsed = pgn.read_game(StringIO(pgn_str))
        if parsed is None:
            # read_game signals "nothing to parse" with None; fail with a
            # clear message instead of an AttributeError on ``.__dict__``.
            raise ValueError('no game could be read from the given PGN text')

        # Initialise the inherited pgn.Game state (headers, variations, ...)
        # so methods inherited from the base class work on this instance.
        super().__init__(parsed.headers)

        # Work on a copy so the parsed game's own attributes stay untouched.
        self.game_obj = dict(vars(parsed))
        variations = self.game_obj['variations']
        # A game without moves has no variations; store an empty string
        # rather than raising IndexError.
        self.game_obj['variations'] = str(variations[0]) if variations else ''

        self.df = pd.json_normalize(dict(self.game_obj['headers']))
        self.df['pgn'] = self.game_obj['variations']
@task
def get_games(url: str) -> List[Game]:
    """Download one archive URL and parse every game it lists.

    Args:
        url: URL whose JSON response has a ``'games'`` list; each entry must
            carry its PGN text under the ``'pgn'`` key.

    Returns:
        One ``Game`` per entry in the response's ``'games'`` list.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond in time.
    """
    logger = get_run_logger()
    logger.info(f"GET {url}")
    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    # Surface HTTP errors directly instead of failing later while decoding
    # an error page as JSON.
    response.raise_for_status()
    return [Game(game['pgn']) for game in response.json()['games']]
@task
def load_games(games: List[Game], base_path: str) -> None:
    """Write a batch of games to a single gzip-compressed parquet file.

    The file is written to ``{base_path}/games/{year}_{month}.parquet.gzip``,
    where year and month are taken from the ``Date`` header of the first game
    in the batch.

    Args:
        games: Games to store; each must expose a ``df`` DataFrame with a
            ``'Date'`` column formatted as ``year.month.day``.
        base_path: Path or URL prefix under which the file is written.
    """
    logger = get_run_logger()
    if not games:
        # pd.concat raises ValueError on an empty sequence, and there would
        # be no Date to name the file after, so there is nothing to write.
        logger.info('no games to store, skipping')
        return

    df = pd.concat([game.df for game in games])
    # Only the first row is needed to name the file.
    year, month, _ = df['Date'].iloc[0].split('.')
    logger.info(f'storing games from month {month} of year {year}...')
    filepath = f'{base_path}/games/{year}_{month}.parquet.gzip'
    df.to_parquet(filepath, compression='gzip')
@flow
def orca(filepath: str, usernames: Tuple[str, ...] = ('n80n8',)) -> None:
    """Fetch every monthly game archive for each user and store it.

    For each username, the list of archive URLs is requested, each URL is
    fetched with ``get_games``, and each resulting month of games is passed
    to ``load_games``.

    Args:
        filepath: Base path or URL prefix handed to ``load_games`` as
            ``base_path``.
        usernames: chess.com usernames to process. Defaults to the single
            account that was previously hard-coded.
    """
    logger = get_run_logger()
    for username in usernames:
        logger.info(f"Checking for games on chess.com from: {username}")
        # list of URLs to GET months of games from
        archive_urls = get_player_game_archives(username=username).archives
        new_user_games = [get_games(url) for url in archive_urls]
        if not new_user_games:
            logger.info(f'No new months of games to load for {username}!')
            continue
        logger.info(f'Fetching {len(new_user_games)} new months of games from {username}..')
        for month in new_user_games:
            load_games(games=month, base_path=filepath)
if __name__ == '__main__':
    # Script entry point: run the flow against the target S3 bucket.
    orca(filepath='s3://nate-all-purpose-bucket')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment