# zzstoatzz/pgn_etl_pythonic.py

## pgn_etl_pythonic.py
from chessdotcom import get_player_game_archives
from io import StringIO
from typing import List, Tuple
import boto3, chess.pgn as pgn, pandas as pd, requests

# https://python-chess.readthedocs.io/en/latest/pgn.html
class Game(pgn.Game):
    """One chess game parsed from PGN text, flattened to a single-row frame.

    Attributes:
        game_obj: shallow copy of the parsed game's attribute dict, with the
            'variations' entry replaced by the string form of the first
            variation ('' when the game has no moves).
        df: DataFrame with one column per PGN header plus a 'pgn' column
            holding that same string.
    """

    def __init__(self, pgn_str: str):
        """Parse ``pgn_str`` and build ``game_obj`` and ``df``.

        Raises:
            ValueError: if ``pgn_str`` does not contain a game.
        """
        parsed = pgn.read_game(StringIO(pgn_str))
        if parsed is None:
            # read_game returns None when the input holds no game
            raise ValueError('no game found in PGN text')

        # initialise the pgn.Game base so inherited behaviour has its state
        super().__init__(headers=parsed.headers)

        # work on a copy so the parsed game's own attributes stay untouched
        self.game_obj = dict(vars(parsed))
        variations = self.game_obj['variations']
        # a game without moves has no first variation to render
        self.game_obj['variations'] = str(variations[0]) if variations else ''
        self.df = pd.json_normalize(dict(self.game_obj['headers']))
        self.df['pgn'] = self.game_obj['variations']

def get_games(url: str, timeout: float = 30.0) -> List[Game]:
    """Download one archive URL and parse every game listed in it.

    Args:
        url: endpoint whose JSON body carries a 'games' list; each entry
            must provide its PGN text under the 'pgn' key.
        timeout: seconds to wait for the server before giving up.

    Returns:
        One ``Game`` per entry, in response order.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    print(f"GET {url}")
    # without a timeout requests may block forever on a stalled connection
    response = requests.get(url, timeout=timeout)
    # fail here on HTTP errors instead of on a confusing JSON/KeyError later
    response.raise_for_status()
    return [Game(entry['pgn']) for entry in response.json()['games']]

def load_games(games: List["Game"], base_path: str) -> None:
    """Write a batch of games to one gzip-compressed parquet file.

    The file is ``{base_path}/games/{year}_{month}.parquet.gzip``, where year
    and month come from the first game's 'Date' value split on '.'.

    Args:
        games: objects exposing a ``df`` DataFrame, one per game.
        base_path: directory or URI prefix the file is written under.
    """
    frames = [game.df for game in games]
    if not frames:
        # pd.concat raises ValueError on an empty list; nothing to store anyway
        print('no games to store, skipping...')
        return

    df = pd.concat(frames)
    # only the first row is needed to name the file
    year, month, _ = df['Date'].iloc[0].split('.')
    print(f'storing games from month {month} of year {year}...')
    filepath = f'{base_path}/games/{year}_{month}.parquet.gzip'
    df.to_parquet(filepath, compression='gzip')

def orca(filepath: str, usernames: Tuple[str, ...] = ('n80n8',)) -> None:
    """Fetch every monthly archive of each player and store it as parquet.

    Args:
        filepath: base path or URI passed to ``load_games`` as ``base_path``.
        usernames: chess.com usernames to process; defaults to the single
            account this script was originally written for.
    """
    for username in usernames:
        print(f"Checking for games on chess.com from: {username}")

        # list of URLs to GET months of games from
        archive_urls = get_player_game_archives(username=username).archives

        if not archive_urls:
            print(f'No new months of games to load for {username}!')
            continue

        print(f'Fetching {len(archive_urls)} new months of games from {username}..')

        # fetch and store month by month so only one month is held at a time
        for url in archive_urls:
            month = get_games(url)
            if not month:
                # a month without games has nothing to write
                continue
            load_games(games=month, base_path=filepath)


if __name__ == "__main__":
    # script entry point: run the pipeline against the S3 bucket
    orca(filepath='s3://nate-all-purpose-bucket')
	from chessdotcom import get_player_game_archives
	from io import StringIO
	from typing import List, Tuple
	import boto3, chess.pgn as pgn, pandas as pd, requests

	# https://python-chess.readthedocs.io/en/latest/pgn.html
	class Game(pgn.Game):
	    """One chess game parsed from PGN text, flattened to a single-row frame.

	    Attributes:
	        game_obj: shallow copy of the parsed game's attribute dict, with the
	            'variations' entry replaced by the string form of the first
	            variation ('' when the game has no moves).
	        df: DataFrame with one column per PGN header plus a 'pgn' column
	            holding that same string.
	    """

	    def __init__(self, pgn_str: str):
	        """Parse ``pgn_str`` and build ``game_obj`` and ``df``.

	        Raises:
	            ValueError: if ``pgn_str`` does not contain a game.
	        """
	        parsed = pgn.read_game(StringIO(pgn_str))
	        if parsed is None:
	            # read_game returns None when the input holds no game
	            raise ValueError('no game found in PGN text')

	        # initialise the pgn.Game base so inherited behaviour has its state
	        super().__init__(headers=parsed.headers)

	        # work on a copy so the parsed game's own attributes stay untouched
	        self.game_obj = dict(vars(parsed))
	        variations = self.game_obj['variations']
	        # a game without moves has no first variation to render
	        self.game_obj['variations'] = str(variations[0]) if variations else ''
	        self.df = pd.json_normalize(dict(self.game_obj['headers']))
	        self.df['pgn'] = self.game_obj['variations']

	def get_games(url: str, timeout: float = 30.0) -> List[Game]:
	    """Download one archive URL and parse every game listed in it.

	    Args:
	        url: endpoint whose JSON body carries a 'games' list; each entry
	            must provide its PGN text under the 'pgn' key.
	        timeout: seconds to wait for the server before giving up.

	    Returns:
	        One ``Game`` per entry, in response order.

	    Raises:
	        requests.HTTPError: if the server answers with an error status.
	    """
	    print(f"GET {url}")
	    # without a timeout requests may block forever on a stalled connection
	    response = requests.get(url, timeout=timeout)
	    # fail here on HTTP errors instead of on a confusing JSON/KeyError later
	    response.raise_for_status()
	    return [Game(entry['pgn']) for entry in response.json()['games']]

	def load_games(games: List["Game"], base_path: str) -> None:
	    """Write a batch of games to one gzip-compressed parquet file.

	    The file is ``{base_path}/games/{year}_{month}.parquet.gzip``, where year
	    and month come from the first game's 'Date' value split on '.'.

	    Args:
	        games: objects exposing a ``df`` DataFrame, one per game.
	        base_path: directory or URI prefix the file is written under.
	    """
	    frames = [game.df for game in games]
	    if not frames:
	        # pd.concat raises ValueError on an empty list; nothing to store anyway
	        print('no games to store, skipping...')
	        return

	    df = pd.concat(frames)
	    # only the first row is needed to name the file
	    year, month, _ = df['Date'].iloc[0].split('.')
	    print(f'storing games from month {month} of year {year}...')
	    filepath = f'{base_path}/games/{year}_{month}.parquet.gzip'
	    df.to_parquet(filepath, compression='gzip')

	def orca(filepath: str, usernames: Tuple[str, ...] = ('n80n8',)) -> None:
	    """Fetch every monthly archive of each player and store it as parquet.

	    Args:
	        filepath: base path or URI passed to ``load_games`` as ``base_path``.
	        usernames: chess.com usernames to process; defaults to the single
	            account this script was originally written for.
	    """
	    for username in usernames:
	        print(f"Checking for games on chess.com from: {username}")

	        # list of URLs to GET months of games from
	        archive_urls = get_player_game_archives(username=username).archives

	        if not archive_urls:
	            print(f'No new months of games to load for {username}!')
	            continue

	        print(f'Fetching {len(archive_urls)} new months of games from {username}..')

	        # fetch and store month by month so only one month is held at a time
	        for url in archive_urls:
	            month = get_games(url)
	            if not month:
	                # a month without games has nothing to write
	                continue
	            load_games(games=month, base_path=filepath)


	if __name__ == "__main__":
	    # script entry point: run the pipeline against the S3 bucket
	    orca(filepath='s3://nate-all-purpose-bucket')