"""
Created August 12, 2016 01:31

Scrape basketball data and process the play by play data
"""
import os
import sys
import time
import redis
import requests
import argparse
import datetime
import numpy as np
from bs4 import BeautifulSoup
# Years passed to get_gamecodes by the database/sync jobs: 2010 through 2016
# inclusive (the range end is exclusive).
years = list(range(2010, 2017))
# Module-wide Redis client; created at import time against a server on
# localhost.
rclient = redis.Redis(host="localhost")
# The number of seconds of game time at the start of each period.
# Periods 1-4 start 720 s apart (12-minute quarters); periods 5 and later
# start 300 s apart (5-minute overtimes), matching get_gameclock.
period_starts = {
    1: 0, 2: 720, 3: 1440, 4: 2160,
    5: 2880, 6: 3180, 7: 3480, 8: 3780, 9: 4080, 10: 4380,
}
# Root directory for everything that gets downloaded: <home>/data.
# expanduser("~") returns $HOME when it is set and falls back to the
# platform's user database otherwise, so this no longer raises KeyError when
# HOME is missing from the environment.
ddir = os.path.join(os.path.expanduser("~"), "data")

# Per-sport settings. "dir" is the local directory pages are mirrored into and
# "url" is the base url that page paths are appended to (see get_page).
sportdict = {
    "baseball": {
        "dir": os.path.abspath(ddir + "/baseball"),
        "url": ""
    },
    "basketball": {
        "dir": os.path.abspath(ddir + "/basketball"),
        "url": ""
    },
    "football": {
        "dir": os.path.abspath(ddir + "/football"),
        "url": ""
    },
}
def get_page(sport, ext, filename=None, force=False):
    """
    Mirror a directory structure found at a base url in a local directory.

    Returns the text of the page at ``<sport url>/<ext>``. A copy already on
    disk is returned without touching the network unless ``force`` is true;
    otherwise the page is downloaded, written to disk and returned.

    sport    -- key into ``sportdict`` ("baseball", "basketball", "football")
    ext      -- page path relative to the sport's base url / local directory
    filename -- explicit local path; defaults to ``<sport dir>/<ext>``
    force    -- when true, ignore any cached copy and download again
    """
    if filename is None:
        filename = "{}/{}".format(sportdict[sport]["dir"], ext)

    if not force and os.path.exists(filename):
        with open(filename, encoding="utf-8") as cached:
            return cached.read()

    # Make sure the local directory exists before writing the page into it.
    # NOTE(review): the body of this check was missing from the source;
    # creating the directory is the reconstruction.
    filedir = os.path.dirname(filename)
    if filedir and not os.path.exists(filedir):
        os.makedirs(filedir, exist_ok=True)

    url = "{}/{}".format(sportdict[sport]["url"], ext)
    currdelay = 1
    r = requests.get(url)
    # Keep retrying until the server answers 200, waiting one second longer
    # before each new attempt so a struggling server is not hammered.
    # NOTE(review): the sleep is reconstructed (the source only incremented
    # currdelay); the loop is still unbounded for a permanently failing url.
    while r.status_code != 200:
        time.sleep(currdelay)
        r = requests.get(url)
        currdelay += 1

    html = r.text
    with open(filename, "w", encoding="utf-8") as out:
        out.write(html)
    return html
def remove_page(sport, ext, filename=None):
    """
    Remove a page that may have already been downloaded.

    sport    -- key into ``sportdict``
    ext      -- page path relative to the sport's local directory
    filename -- explicit local path; defaults to ``<sport dir>/<ext>``

    Does nothing when the file is not present.
    """
    if filename is None:
        filename = "{}/{}".format(sportdict[sport]["dir"], ext)
    if os.path.exists(filename):
        os.remove(filename)
def day_from_gamecode(gamecode):
    """
    Get a day index so that games are added to redis sorted.

    The first eight characters of ``gamecode`` are read as YYYYMMDD. The
    result is the whole number of 86400-second days between the epoch and
    local midnight of that date, returned as a float.
    """
    year = int(gamecode[:4])
    month = int(gamecode[4:6])
    day = int(gamecode[6:8])
    # A naive datetime's timestamp() is interpreted in local time, so the
    # absolute value depends on the machine's timezone; the ordering of games
    # does not.
    midnight = datetime.datetime(year, month, day)
    return float(int(midnight.timestamp() / 86400))
def get_players(row):
    """
    Return all the players referenced in a table row in tuples: (id, name).

    The id is the link's href with its first 11 and last 5 characters removed
    (presumably "/players/x/" and ".html" -- confirm against the page markup);
    the name is the link text. Anchors without an href are skipped instead of
    raising on ``None``.
    """
    players = []
    for a in row.findAll("a"):
        href = a.get("href")
        if href is None:
            continue
        players.append((href[11:-5], a.text))
    return players
def get_gameclock(period, minute_str):
    """
    Return the seconds elapsed so far given a period and clock state.

    period     -- 1-based period number; anything above 4 is overtime
    minute_str -- time remaining in the period as "M:SS" (seconds may carry a
                  fractional part)

    Returns the game time in seconds as a float.
    """
    minutes, seconds = [float(s) for s in minute_str.split(":")]
    remaining = 60 * minutes + seconds
    # Overtime periods last 5 minutes, regulation periods 12. The source had
    # lost the ``else`` here, so the overtime value was always overwritten.
    if period > 4:
        period_clock = 5 * 60 - remaining
    else:
        period_clock = 12 * 60 - remaining
    return period_starts[period] + period_clock
def get_right_digits(s):
    """
    Pick off characters from the right while they're integers. Used to grab
    the scores.

    Returns the trailing run of digits of ``s`` as an int. Raises ValueError
    when ``s`` does not end in a digit (including the empty string).
    """
    i = len(s) - 1
    # Stop at the left edge: without the bound an all-digit string walked past
    # index 0 and raised IndexError.
    while i >= 0 and s[i].isdigit():
        i -= 1
    return int(s[i + 1:])
# Two-digit month number (as it appears in a gamecode) -> abbreviated name.
month_map = {
    "01": "Jan", "02": "Feb", "03": "Mar", "04": "Apr",
    "05": "May", "06": "Jun", "07": "Jul", "08": "Aug",
    "09": "Sep", "10": "Oct", "11": "Nov", "12": "Dec",
}
def short_date(gamecode):
    """
    Convert a gamecode into a descriptive date.

    Characters 4-5 of ``gamecode`` are the month and 6-7 the day, so a code
    starting "20160112" gives "Jan 12". Raises KeyError for a month that is
    not in ``month_map``.
    """
    month = month_map[gamecode[4:6]]
    day = gamecode[6:8]
    return month + " " + day
def get_season(gamecode):
    """
    Get the year of the season from the gamecode.

    Games played in August or later count toward the season named after the
    following calendar year; games from January through July keep their own
    year.
    """
    year = int(gamecode[:4])
    month = int(gamecode[4:6])
    if month > 7:
        year += 1
    return year
def process_boxscore(gamecode):
# Parses the box score page for `gamecode`, queues game/team/player facts on a
# Redis pipeline, and returns (team_dict, home_dict): team_dict maps player id
# to team code, home_dict maps each team code to "home" or "away".
# NOTE(review): as it appears here this block has lost its indentation and the
# quotes around its description line (next line), and at least one statement
# is cut off (flagged below). It cannot run until those are restored.
Return a dict of the players that played in a game and their team
pipe = rclient.pipeline()
# Record the game in the "gamecodes" sorted set together with its day index.
# NOTE(review): the (name, member, score) positional form matches the legacy
# redis-py 2.x `Redis.zadd`; confirm against the installed client version.
pipe.zadd("gamecodes", gamecode, day_from_gamecode(gamecode))
url = "boxscores/{}.html".format(gamecode)
html = get_page("basketball", url)
soup = BeautifulSoup(html, "lxml")
# Locate the score header: start from the first span with this class and take
# its sixth ancestor. This depends entirely on the page's markup.
span = soup.find("span", {"class": "bold_text large_text"})
header_table = list(span.parents)[5]
links = header_table.findAll("a")
# Keep only elements whose markup contains "/teams/" (at an index above 0).
team_links = [link for link in links if str(link).find("/teams/") > 0]
spans = header_table.findAll("span")
team_spans = [span for span in spans if str(span).find("/teams/") > 0]
# The first team link/span is treated as the away team, the second as home.
# The team code is the href minus its first 7 and last 5 characters with any
# "/" removed; the score is the trailing digits of the span text.
away_team = team_links[0].get("href")[7:-5].replace("/", "")
away_score = get_right_digits(team_spans[0].text)
away_team_name = team_links[0].text
home_team = team_links[1].get("href")[7:-5].replace("/", "")
home_score = get_right_digits(team_spans[1].text)
home_team_name = team_links[1].text
# Per-team info hashes, keyed "<teamcode>:info".
pipe.hset(":".join([away_team, "info"]), "name", away_team_name)
pipe.hset(":".join([away_team, "info"]), "teamcode", away_team)
pipe.hset(":".join([home_team, "info"]), "name", home_team_name)
pipe.hset(":".join([home_team, "info"]), "teamcode", home_team)
# Per-game info hash, keyed "<gamecode>:info".
pipe.hset(":".join([gamecode, "info"]), "gamecode", gamecode)
pipe.hset(":".join([gamecode, "info"]), "shortdate", short_date(gamecode))
pipe.hset(":".join([gamecode, "info"]), "hometeam", home_team)
pipe.hset(":".join([gamecode, "info"]), "hometeamname", home_team_name)
pipe.hset(":".join([gamecode, "info"]), "homescore", home_score)
pipe.hset(":".join([gamecode, "info"]), "awayteam", away_team)
pipe.hset(":".join([gamecode, "info"]), "awayteamname", away_team_name)
pipe.hset(":".join([gamecode, "info"]), "awayscore", away_score)
team_dict = {}
year = get_season(gamecode)
tables = soup.findAll("table", {"class": "sortable stats_table"})
# tables[0] is used for the away team's players.
for i, row in enumerate(tables[0].findAll("tr")):
for ptuple in get_players(row):
# Only rows with more than 30 child nodes are recorded -- presumably this
# filters out players who did not play; confirm against the page markup.
if len(row.contents) > 30:
pid = ptuple[0]
pname = ptuple[1]
team_dict[pid] = away_team
pipe.sadd(":".join([gamecode, away_team]), pid)
pipe.sadd("players", pid)
pipe.sadd(":".join([pid, str(year), "teams"]), away_team)
pipe.zadd(":".join([pid, "gamecodes"]),
gamecode, day_from_gamecode(gamecode))
pipe.sadd(":".join([away_team, "players"]), pid)
pipe.sadd(":".join([gamecode, away_team, "players"]), pid)
pipe.hset(":".join([pid, "info"]), "name", pname)
pipe.hset(":".join((pid, gamecode)), "name", pname)
# NOTE(review): the call below is cut off -- the value stored under "opp" for
# away players is missing from the visible source and must be restored.
pipe.hset(":".join((pid, gamecode)), "opp",
# tables[2] is used for the home team's players; same bookkeeping as above.
for i, row in enumerate(tables[2].findAll("tr")):
for ptuple in get_players(row):
if len(row.contents) > 30:
pid = ptuple[0]
pname = ptuple[1]
team_dict[pid] = home_team
pipe.sadd(":".join([gamecode, home_team]), pid)
pipe.sadd("players", pid)
pipe.sadd(":".join([pid, str(year), "teams"]), home_team)
pipe.zadd(":".join([pid, "gamecodes"]),
gamecode, day_from_gamecode(gamecode))
pipe.sadd(":".join([home_team, "players"]), pid)
pipe.sadd(":".join([gamecode, home_team, "players"]), pid)
pipe.hset(":".join([pid, "info"]), "name", pname)
pipe.hset(":".join((pid, gamecode)), "name", pname)
# Home players get the first three characters of the away team code plus a
# trailing space as their opponent label.
pipe.hset(":".join((pid, gamecode)),
"opp", away_team[:3] + " ")
# NOTE(review): no pipe.execute() is visible in this block, so as shown the
# queued commands are never sent -- presumably a line was lost here; confirm.
home_dict = {home_team: "home", away_team: "away"}
return team_dict, home_dict
def clear_the_floor(onthefloor, substitutions, period):
    """
    Remove everyone that's on the floor and add substitutions out at the end
    of every period.

    onthefloor    -- dict of team code -> set of player ids currently playing;
                     every set is emptied in place
    substitutions -- dict of player id -> list of (kind, second) events; an
                     ("out", period_starts[period]) event is appended for each
                     player that was on the floor
    period        -- period whose start time stamps the "out" events
    """
    for players in onthefloor.values():
        for player in players:
            substitutions[player].append(("out", period_starts[period]))
        # The source only recorded the "out" events and never emptied the
        # floor, contradicting the description above; clear it here.
        players.clear()
def substitutions_to_rows(substitutions, gamecode, team_dict):
# Turns each player's list of ("in" | "out", second) events into a 0/1 array
# with one entry per second of game time, and returns one dict per player with
# keys "histogram", "player", "team" and "gamecode".
# NOTE(review): this block has lost its indentation, the quotes around its
# description line (next line), and some statements (flagged below).
Convert list of substitutions to a second by second histogram
rows = []
# Second indices 1..4680; 4680 is the largest start in period_starts (4380)
# plus one 300-second overtime.
all_seconds = np.arange(4680) + 1
for player, subs in substitutions.items():
# An odd number of events means an "in"/"out" pair is broken: look for "in"
# events sitting at odd positions and insert an "out" before the first one,
# stamped with the time of the event that precedes it.
# NOTE(review): bad_in_inds[0] raises IndexError when no such "in" exists.
if not len(subs) % 2 == 0:
bad_in_inds = [i for i, s in enumerate(subs)
if s[0] == "in" and i % 2 == 1]
subs.insert(bad_in_inds[0], ("out", subs[bad_in_inds[0] - 1][1]))
histogram = np.zeros(4680, dtype=np.uint8)
# Walk the events two at a time, expecting ("in", t0), ("out", t1) pairs.
for i in range(0, len(subs) - 1, 2):
# NOTE(review): the body of this check is missing, and so is the start of the
# statement on the following line. That line is the tail of an expression that
# appears to set the seconds between subs[i][1] and subs[i + 1][1] (inclusive)
# to 1 -- confirm against the original file.
if subs[i][0] != "in":
all_seconds >= subs[i][1], all_seconds <= subs[i + 1][1])] = 1
# TODO: add teamcode
# Only the first 2880 seconds (the start of period 5 in period_starts) are
# kept, so overtime is dropped from the stored histogram.
rows.append({"histogram": histogram[:2880],
"player": player,
"team": team_dict[player],
"gamecode": gamecode})
return rows
def pack_game_histogram(histogram):
    """
    Pack a 0/1 per-second histogram into bytes, eight entries per byte.

    Uses np.packbits, so the first entry lands in the most significant bit of
    the first byte and a trailing partial byte is zero-padded.
    """
    return np.packbits(histogram).tobytes()
def update_player_times(rows):
    """
    Store each player's packed per-game histogram in redis.

    rows -- iterable of dicts with "player", "gamecode" and "histogram" keys
            (the shape produced by substitutions_to_rows)

    Each histogram is written to the "hist" field of the hash keyed
    "<player>:<gamecode>".
    """
    pipe = rclient.pipeline()
    for row in rows:
        key = ":".join((row["player"], row["gamecode"]))
        pipe.hset(key, "hist", pack_game_histogram(row["histogram"]))
    # Commands queued on a pipeline are only sent on execute(); the source
    # never called it, so nothing was written.
    pipe.execute()
def subsample(hist):
    """
    Average a histogram over consecutive groups of four entries.

    Returns an array a quarter of the input's length. Raises ValueError when
    the length of ``hist`` is not a multiple of four.
    """
    return np.mean(hist.reshape(-1, 4), 1)
def pack_totals_histogram(total, histogram):
    """
    Serialize a game count and an averaged histogram into one float32 buffer.

    The first float32 value is ``total``; the rest are the histogram entries.
    unpack_totals_histogram reverses the layout.
    """
    to_pack = np.hstack((np.array(total, dtype=np.float32), histogram))
    return to_pack.astype(np.float32).tobytes()
def unpack_totals_histogram(buf):
    """
    Split a buffer written by pack_totals_histogram back into its parts.

    Returns (total, hist): the first float32 value and an array view of the
    remaining values. The view is backed by ``buf`` and is read-only.
    """
    redis_array = np.frombuffer(buf, dtype=np.float32)
    total = redis_array[0]
    hist = redis_array[1:]
    return total, hist
def update_player_total(pid, year, teamcode, hist):
    """
    Fold one game's histogram into a player's running season average.

    The average lives in redis under "<pid>:<year>:<teamcode>" as a buffer
    holding the number of games folded in so far followed by the mean
    subsampled histogram.

    pid      -- player id
    year     -- season year
    teamcode -- team the player appeared for
    hist     -- per-second histogram for a single game
    """
    rkey = ":".join([pid, str(year), teamcode])
    if rclient.exists(rkey):
        ctotal, chist = unpack_totals_histogram(rclient.get(rkey))
        # Incremental mean: weight the stored average by its game count, add
        # the new game and divide by the new count.
        chist = (chist * ctotal + subsample(hist)) / (ctotal + 1)
        rclient.set(rkey, pack_totals_histogram(ctotal + 1, chist))
    else:
        # First game for this key. The source had lost this ``else``, so the
        # running average was overwritten on every call.
        rclient.set(rkey, pack_totals_histogram(1, subsample(hist)))
def update_player_totals(rows, year, team_dict):
    """
    Update the running season average of every player in ``rows``.

    rows      -- iterable of dicts with "player" and "histogram" keys
    year      -- season year the game belongs to
    team_dict -- dict of player id -> team code
    """
    for row in rows:
        pid = row["player"]
        teamcode = team_dict[pid]
        # Widen to int16 before handing the histogram on.
        hist = np.array(row["histogram"], dtype=np.int16)
        update_player_total(pid, year, teamcode, hist)
def process_pbp(gamecode, debug=False):
# Parses the play-by-play page for `gamecode`, derives each player's
# ("in" | "out", second) substitution events, converts them to per-second
# histograms, folds them into the per-player season totals and returns the
# histogram rows.
# NOTE(review): this block has lost its indentation, the quotes around its
# description text, and the bodies of several branches (flagged below); it
# cannot run as shown and the missing statements must be restored from the
# original file.
Monster function to process the entire play by play
url = "boxscores/pbp/{}.html".format(gamecode)
html = get_page("basketball", url)
soup = BeautifulSoup(html, "lxml")
table = soup.find("table", {"class": "no_highlight stats_table"})
# The two unquoted lines after the condition are the original explanation of
# this branch.
if table is None or len(table) < 2:
This means the play by play is still not available for this game, so
remove the downloaded html and recheck on the next update.
remove_page("basketball", url)
# NOTE(review): no return is visible after remove_page although `table` may be
# None at this point -- presumably an early return was lost; confirm.
period = 0
cachedrows = []
currgameclock = 0
year = get_season(gamecode)
rows = table.findAll("tr")
team_dict, home_dict = process_boxscore(gamecode)
# One empty set of on-floor players per team code.
onthefloor = dict(zip(home_dict.keys(), [set(), set()]))
# Per-player list of substitution events, filled in below.
substitutions = dict([(player, []) for player in team_dict.keys()])
for row in rows:
# If this is a quarter end, go right to clearing the cached rows
gameclock = None
# NOTE(review): the bodies of the conditionals in the next few lines are not
# visible; how rows get added to `cachedrows` cannot be told from here.
if not row.has_attr("id"):
ptuples = get_players(row)
if len(ptuples) == 0:
gameclock = get_gameclock(period, row.contents[1].text)
if gameclock == currgameclock:
# Processing game events before substitutions that happened at the
# same time will lead to fewer errors
exited = set()
entered = set()
appeared = set()
ignore_appearance = set()
for crow in cachedrows:
players = [t[0] for t in get_players(crow)]
# If a player didn't make the box score but is in the play by
# play, he probably got zero minutes. The game is almost over
# and this substitution doesn't matter
# NOTE(review): the bodies of the checks below (through the "ejected from
# game" condition) are missing, so how exited/entered/appeared and
# ignore_appearance are populated is not visible here.
if not all([player in team_dict.keys() for player in players]):
if debug:
if crow.text.find("enters the game") > 0:
if players[0] in exited:
if len(players) > 1:
if players[1] in entered:
# Build up the active players from the beginning of each period
# from in game stats as they happen
# Bench technical players may not be on the floor
# Ejected means he's not on the floor anymore
if ((sum(map(len, onthefloor.values())) < 10) and
(crow.text.find("echnical foul by") < 0) and
(crow.text.find("ejected from game") < 0)):
if debug:
print("appeared: " + str(appeared))
print("entered: " + str(entered))
print("exited: " + str(exited))
# If a player appeared during the same second that he was subbed, we
# can't trust the order of those things. Ignore the appearance and
# just trust the substitution
appeared = appeared.difference(ignore_appearance)
# Players seen in an event but not recorded on the floor are given an "in"
# event at the start of the current period.
# NOTE(review): updates to `onthefloor` itself are not visible in this
# section; statements appear to be missing around these branches.
for player in appeared:
if player not in onthefloor[team_dict[player]]:
substitutions[player].append(("in", period_starts[period]))
for player in exited:
if player in onthefloor[team_dict[player]]:
substitutions[player].append(("in", period_starts[period]))
substitutions[player].append(("out", int(currgameclock)))
for player in entered:
substitutions[player].append(("in", int(currgameclock)))
if debug:
# Check the status of the players on the floor only after all the rows
# at the same clock time have been processed
# NOTE(review): the body of this more-than-five-players check is missing.
if any(map(lambda s: len(s) > 5, onthefloor.values())):
# If this is a quarter end, increment the period and clear the floor
if row.has_attr("id"):
period += 1
clear_the_floor(onthefloor, substitutions, period)
cachedrows = []
currgameclock = gameclock
# After the last row: close out the final period the same way.
period += 1
clear_the_floor(onthefloor, substitutions, period)
rows = substitutions_to_rows(substitutions, gamecode, team_dict)
# NOTE(review): update_player_totals below would fail on None, so a return
# presumably followed this print originally; confirm.
if rows is None:
print("Missing histograms")
update_player_totals(rows, year, team_dict)
return rows
def get_gamecodes(year):
    """
    Get all the gamecodes that have occurred in a year.

    Always re-downloads the season schedule page (``force=True``) so newly
    played games show up, then collects one gamecode per schedule row that
    carries a link in its sixth child node.

    Returns a list of gamecode strings in page order.
    """
    url = "leagues/NBA_{}_games.html".format(year)
    html = get_page("basketball", url, force=True)
    soup = BeautifulSoup(html, "lxml")
    table = soup.find("table", {"id": "games"})
    gamecodes = []
    for row in table.findAll("tr"):
        a = row.contents[5].find("a")
        if a is not None:
            # NOTE(review): the statement that collected the gamecode was
            # missing from the source. This reconstruction assumes the link
            # points at ".../<gamecode>.html" (the layout process_boxscore
            # requests) and takes the file name without its extension.
            href = a.get("href")
            gamecodes.append(os.path.splitext(os.path.basename(href))[0])
    return gamecodes
def update_database():
# Walks every year in `years` and handles the gamecodes that are not yet in
# the redis "gamecodes" sorted set.
# NOTE(review): this block has lost its indentation and the quotes around its
# description line (next line).
Find and process new games
# process_boxscore adds every game it handles to "gamecodes", so the members
# of that set are the games already processed.
zrange = rclient.zrange("gamecodes", 0, -1)
# The members are decoded from bytes so they compare equal to the str
# gamecodes below.
processed = set([g.decode("utf-8") for g in zrange])
for year in years:
for gamecode in list(get_gamecodes(year)):
if gamecode not in processed:
# NOTE(review): the body of this `if` is missing from the visible source;
# presumably it processes the game (e.g. via process_pbp) -- confirm against
# the original file.
def refresh_database():
# Same check as update_database, but only for the last entry of `years`.
# NOTE(review): this block has lost its indentation and the quotes around its
# description line (next line).
Find and process recent games
zrange = rclient.zrange("gamecodes", 0, -1)
# The members are decoded from bytes so they compare equal to the str
# gamecodes below.
processed = set([g.decode("utf-8") for g in zrange])
for gamecode in list(get_gamecodes(years[-1])):
if gamecode not in processed:
# NOTE(review): the body of this `if` is missing from the visible source;
# presumably it processes the game (e.g. via process_pbp) -- confirm against
# the original file.
def rebuild_database():
# Walks every gamecode of every year in `years`, without consulting what is
# already stored in redis.
# NOTE(review): this block has lost its indentation and the quotes around its
# description line (next line).
Hard rebuild of the entire database
for year in years:
for gamecode in list(get_gamecodes(year)):
# NOTE(review): the loop body is missing from the visible source; presumably
# it reprocesses each game, and a step that clears existing data may also
# have been lost -- confirm against the original file.
def sync_files():
    """
    Download html files from sports reference.

    For every game of every year in ``years``, fetch the play-by-play page and
    the box score page. get_page returns early for pages that are already on
    disk, so only missing files are downloaded.
    """
    for year in years:
        for gamecode in get_gamecodes(year):
            get_page("basketball", "boxscores/pbp/{}.html".format(gamecode))
            get_page("basketball", "boxscores/{}.html".format(gamecode))
def main():
    """
    Command line entry point.

    Reads a single TASK argument. "update", "refresh", "rebuild" and "sync"
    (case-insensitive) run the matching job; any other value is treated as a
    gamecode and its play by play is processed with debug output enabled.
    """
    parser = argparse.ArgumentParser(prog="scrape")
    parser.add_argument("task", metavar="TASK", type=str,
                        help="The task to complete. [update or rebuild]")
    args = parser.parse_args()
    task = args.task.lower()
    # A single if/elif chain: the source started a second ``if`` at "refresh",
    # which would let "update" fall through to the gamecode branch as well.
    if task == "update":
        update_database()
    elif task == "refresh":
        refresh_database()
    elif task == "rebuild":
        rebuild_database()
    elif task == "sync":
        sync_files()
    else:
        # Pass the argument through unchanged (not lower-cased): it is used as
        # a gamecode here.
        process_pbp(args.task, debug=True)


if __name__ == "__main__":
    main()
