Skip to content

Instantly share code, notes, and snippets.

@beefy
Last active December 29, 2020 03:56
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save beefy/d6cdb809115ecf46d5fea5a458c4dada to your computer and use it in GitHub Desktop.
Save beefy/d6cdb809115ecf46d5fea5a458c4dada to your computer and use it in GitHub Desktop.
Scrapes all live chess game PGNs from chess.com and concatenates the resulting files
#!/usr/bin/python
import spynner
import pyquery
import urllib
import os
# author: Nate Schultz
# contact: github.com/beefy
# created: 10/22/16
# This script downloads all live game PGNs from chess.com and concatenates the resulting files
# Please note you need your chess.com login credentials in the file path below
# Please note this may be broken in the future due to chess.com UI changes
# first line username
# second line password
LOGIN_CRED_FILE_PATH = './chesslogin.txt'

# Path to the output directory; it is created at import time if missing.
OUTPUT_PATH = './my_chess_games'
try:
    os.makedirs(OUTPUT_PATH)
except OSError:
    # makedirs raises when the path already exists. Attempting the creation
    # and inspecting the failure avoids the check-then-create race; any other
    # failure (permissions, a regular file in the way) is still reported.
    if not os.path.isdir(OUTPUT_PATH):
        raise
url_old_game_archive = "https://www.chess.com/home/my_archive"
url_old_game_archive_all_live = "https://www.chess.com/home/my_archive?sortby=&show=live&color=all&result=all"
url_new_game_archive = "https://www.chess.com/games/archive"


def next_page(page_num):
    """Return the live-game archive URL with ``&page=<page_num>`` appended.

    *page_num* is converted with ``str()``, so an int or a numeric string
    both work.
    """
    return url_old_game_archive_all_live + "&page=" + str(page_num)
# initialize browser
def init_browser():
    """Create a spynner browser pointed at the chess.com login page.

    The browser is created with ``spynner.INFO`` debug output, loads
    https://www.chess.com/login and then calls ``load_jquery(True)``.

    Returns the ready-to-use browser object.
    """
    browser = spynner.Browser(debug_level=spynner.INFO)
    browser.load("https://www.chess.com/login")
    browser.load_jquery(True)
    return browser
# shared by both login flows
def _read_credentials(path):
    """Return ``(username, password)`` from the first two lines of *path*.

    Surrounding whitespace (including the line ending) is stripped from each
    line. The file is closed before returning.

    Raises ValueError when the file holds fewer than two lines, instead of
    failing later with a bare IndexError.
    """
    with open(path, 'r') as cred_file:
        lines = [line.strip() for line in cred_file]
    if len(lines) < 2:
        raise ValueError(
            'credentials file %r must contain the username on the first line '
            'and the password on the second line' % (path,))
    return lines[0], lines[1]


# login from new chess.com
def login_new(b):
    """Log in using the form fields of the new chess.com layout.

    Clicks the ``switch?request_uri=%2Flogin`` link first, then fills the
    ``username`` / ``password`` inputs from LOGIN_CRED_FILE_PATH and clicks
    the ``login`` button.

    b -- the browser object to drive.
    """
    b.click_link('a[href="//www.chess.com/switch?request_uri=%2Flogin"]')
    username, password = _read_credentials(LOGIN_CRED_FILE_PATH)
    b.wk_fill('input[id="username"]', username)
    b.wk_fill('input[id="password"]', password)
    b.click_link('button[id="login"]')


# login from old chess.com
def login_old(b):
    """Log in using the form fields of the old chess.com layout.

    Fills the ``c1`` / ``loginpassword`` inputs from LOGIN_CRED_FILE_PATH and
    clicks the ``btnLogin`` button.

    b -- the browser object to drive.
    """
    username, password = _read_credentials(LOGIN_CRED_FILE_PATH)
    b.wk_fill('input[name="c1"]', username)
    b.wk_fill('input[name="loginpassword"]', password)
    b.click_link('button[name="btnLogin"]')
# inject jQuery into the current page
def load_ajax(b):
    """Inject jQuery 1.9.1 into the page currently loaded in *b*.

    A <script> tag pointing at the Google-hosted jQuery build is appended to
    the document head, then the browser waits five seconds so the script can
    finish loading. scrape() relies on the ``jQuery`` global this provides.

    The script is fetched over https: the pages being scraped are https, so
    a plain-http script would be mixed content and could be tampered with in
    transit while the session is logged in.

    b -- the browser object to drive.
    """
    js_str = (
        'var script = document.createElement("script");'
        'script.src = "https://ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js";'
        'document.getElementsByTagName("head")[0].appendChild(script);'
    )
    b.runjs(js_str)
    b.wait(5)  # give the injected script time to load
# "deprecated"
def check_checkboxes(b):
    """Deprecated experiment: tick the first ten game checkboxes, then download.

    This was the first attempt, using the usual checkboxes + download button
    on the new chess.com. The download button proved too difficult, so the
    script uses the old chess.com instead. Only the first 10 checkboxes are
    checked, as a test.

    b -- the browser object to drive.
    """
    # Give every game checkbox a sequential id so each can be addressed alone.
    tag_checkboxes_js = "i = 0; $('input[game-checkbox]').each( function() { $(this).attr('id','game-checkbox'+i); i++; });"
    b.runjs(tag_checkboxes_js)

    # Tick the checkboxes one at a time, then hit the download button once.
    checkbox_selectors = ['input[id="game-checkbox'+str(n)+'"]' for n in range(10)]
    for selector in checkbox_selectors:
        b.click(selector)
    b.click('.pull-right.download-all')
def _index_game_links(b):
    """Give each game 'view' link on the loaded archive page a sequential id."""
    b.runjs("i = 0; jQuery('.games.right-4').each( function() { jQuery(this).attr('id','game-view'+i); i++; });")


def _download_game(b, link_index, target_path):
    """Open game number *link_index* on the current page and save its PGN.

    Raises ValueError when the opened page has no PGN download link; the
    check happens before *target_path* is created so no empty file is left.
    """
    b.click_link('#game-view'+str(link_index))  # click 'view' link
    raw_href = pyquery.PyQuery(b.html)('a[class="bpgn"]').attr("href")  # get download link
    if not raw_href:
        raise ValueError('no PGN download link found for game-view' + str(link_index))
    href = urllib.unquote(raw_href)
    # Binary mode matches the 'rb' reads done when merging; the handle is
    # closed (and therefore flushed) before the file is read back.
    with open(target_path, 'wb') as pgn_file:
        b.download(href, pgn_file)


def _merge_files(source_paths, target_path, delimiter):
    """Write the sources joined by *delimiter* to *target_path*, then delete them."""
    chunks = []
    for path in source_paths:
        with open(path, 'rb') as source:
            chunks.append(source.read())
    with open(target_path, 'wb') as target:
        target.write(delimiter.join(chunks))
    for path in source_paths:
        os.remove(path)  # delete merged files


def scrape(b):
    """Download every live-game PGN from the archive and merge them.

    Expects *b* to be logged in, showing the first archive page, with jQuery
    available. Each game of a page is saved as ``PGN_<i>.chessgame`` in
    OUTPUT_PATH; a finished page is merged into ``page<N>.chesspage``; at the
    end all pages are merged into ``PGNs.txt``. Merged inputs are deleted.
    Files are merged in download order.

    Scraping stops when a page yields fewer than ``links_per_page`` games or
    when the next page cannot be loaded; whatever was downloaded up to that
    point is still merged.

    b -- the browser object to drive.
    """
    links_per_page = 50
    chess_game_extension = '.chessgame'
    chess_page_extension = '.chesspage'
    game_delimiter = '\nwww.thenateschultz.com\ngithub.com/beefy\n'

    page_paths = []
    page_i = 1
    while True:
        # After each game the browser must return to the page being scraped,
        # not always to the first one. Page 1 is the bare archive URL.
        if page_i == 1:
            page_url = url_old_game_archive_all_live
        else:
            page_url = next_page(page_i)

        _index_game_links(b)

        # game iteration
        game_paths = []
        for i in range(links_per_page):
            game_path = os.path.join(OUTPUT_PATH, 'PGN_'+str(i)+chess_game_extension)
            try:
                _download_game(b, i, game_path)
            except Exception as err:
                # A missing 'view' link or PGN link is how a short (last)
                # page shows up; stop this page but keep what was fetched.
                print('page %d: stopped at game %d (%s)' % (page_i, i, err))
                break
            game_paths.append(game_path)
            b.load(page_url)  # back to the archive page being scraped
            _index_game_links(b)  # ids are lost on reload

        # merge this page's PGNs
        if game_paths:
            page_path = os.path.join(OUTPUT_PATH, 'page'+str(page_i)+chess_page_extension)
            _merge_files(game_paths, page_path, game_delimiter)
            page_paths.append(page_path)

        if len(game_paths) < links_per_page:
            break  # short page: nothing further to scrape

        page_i += 1
        try:
            b.load(next_page(page_i))
        except Exception as err:
            print('could not load page %d (%s); finishing' % (page_i, err))
            break

    # merge pages
    _merge_files(page_paths, os.path.join(OUTPUT_PATH, 'PGNs.txt'), game_delimiter)
def main():
    """Log in through the old chess.com form and scrape the live-game archive."""
    browser = init_browser()
    login_old(browser)
    browser.load(url_old_game_archive_all_live)  # redirect to game archive
    load_ajax(browser)
    scrape(browser)
    # browser.browse()  # activate GUI


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment