Skip to content

Instantly share code, notes, and snippets.

@mrphilroth
Created August 12, 2016 01:31
Show Gist options
  • Save mrphilroth/aaf4bc875652979e592ccbaf0f3e954c to your computer and use it in GitHub Desktop.
Save mrphilroth/aaf4bc875652979e592ccbaf0f3e954c to your computer and use it in GitHub Desktop.
Scrape basketball data and process the play by play data
#!/usr/bin/python
import os
import sys
import time
import redis
import requests
import argparse
import datetime
import numpy as np
from bs4 import BeautifulSoup
# Seasons to scrape: 2010 through 2016 (range end is exclusive).
years = list(range(2010, 2017))

# Shared redis connection; every scraped fact is stored here.
rclient = redis.Redis(host="localhost")

# The number of seconds of game time at the start of each period
# (regulation quarters run 720 s / 12 min, overtime periods 300 s / 5 min;
# periods beyond 10 are not covered by this table).
period_starts = {
    1: 0, 2: 720, 3: 1440, 4: 2160,
    5: 2880, 6: 3180, 7: 3480, 8: 3780, 9: 4080, 10: 4380
}

# Root of the local html mirror: ~/data/<sport>/...
ddir = os.path.join(os.environ["HOME"], "data")

# Per-sport local cache directory plus the sports-reference site it mirrors.
sportdict = {
    "baseball": {
        "dir": os.path.abspath(ddir + "/baseball"),
        "url": "http://www.baseball-reference.com"
    },
    "basketball": {
        "dir": os.path.abspath(ddir + "/basketball"),
        "url": "http://www.basketball-reference.com"
    },
    "football": {
        "dir": os.path.abspath(ddir + "/football"),
        "url": "http://www.pro-football-reference.com"
    }
}
def get_page(sport, ext, filename=None, force=False):
    """
    Mirror a directory structure found at a base url in a local directory.

    Returns the page html, serving it from the local cache unless ``force``
    is set or the cached file doesn't exist yet.

    :param sport: key into sportdict ("baseball"/"basketball"/"football")
    :param ext: path of the page relative to the site root
    :param filename: override for the local cache path
    :param force: when True, re-download even if a cached copy exists
    """
    if filename is None:
        filename = "{}/{}".format(sportdict[sport]["dir"], ext)
    if not force and os.path.exists(filename):
        # Close the handle deterministically instead of leaking it.
        with open(filename, encoding="utf-8") as cached:
            return cached.read()
    # Fix: was os.path.os.path.dirname, which only worked because os.path
    # happens to re-export os.
    filedir = os.path.dirname(filename)
    os.makedirs(filedir, exist_ok=True)
    url = "{}/{}".format(sportdict[sport]["url"], ext)
    print(url)
    # Retry with a linearly growing delay until the site returns 200.
    # NOTE(review): a permanently missing page loops forever — consider a
    # retry cap.
    currdelay = 1
    r = requests.get(url)
    while r.status_code != 200:
        time.sleep(currdelay)
        r = requests.get(url)
        currdelay += 1
    html = r.text
    # requests decodes to unicode; pin utf-8 so the cache round-trips
    # regardless of locale.
    with open(filename, "w", encoding="utf-8") as out:
        out.write(html)
    return html
def remove_page(sport, ext, filename=None):
    """
    Delete a previously downloaded page from the local mirror, if present.
    """
    target = filename
    if target is None:
        target = "{}/{}".format(sportdict[sport]["dir"], ext)
    if os.path.exists(target):
        os.remove(target)
def day_from_gamecode(gamecode):
    """
    Get a day index so that games are added to redis sorted.

    Gamecodes start with the date as YYYYMMDD; the returned value is the
    whole number of days since the Unix epoch (as a float, for use as a
    redis sorted-set score).
    """
    year = int(gamecode[:4])
    month = int(gamecode[4:6])
    day = int(gamecode[6:8])
    # Anchor to UTC: a naive datetime's .timestamp() depends on the local
    # timezone and could shift the day index by one across machines.
    game_day = datetime.datetime(year, month, day,
                                 tzinfo=datetime.timezone.utc)
    return float(int(game_day.timestamp() / 86400))
def get_players(row):
    """
    Extract every player referenced in a table row as (id, name) tuples.

    Player hrefs look like "/players/x/<id>.html"; the [11:-5] slice keeps
    just the id portion.
    """
    players = []
    for anchor in row.findAll("a"):
        players.append((anchor.get("href")[11:-5], anchor.text))
    return players
def get_gameclock(period, minute_str):
    """
    Return the seconds of game time elapsed so far, given the current
    period and the displayed countdown clock ("MM:SS" or "MM:SS.t").

    Regulation quarters last 12 minutes; every overtime period lasts 5.
    The period start is computed directly rather than read from the
    module-level period_starts table (which stops at period 10), so any
    number of overtimes works. Values match the table for periods 1-10.
    """
    minutes, seconds = [float(part) for part in minute_str.split(":")]
    remaining = 60 * minutes + seconds
    if period > 4:
        # Four regulation quarters plus any completed 5-minute overtimes.
        period_start = 4 * 720 + (period - 5) * 300
        period_clock = 5 * 60 - remaining
    else:
        period_start = (period - 1) * 720
        period_clock = 12 * 60 - remaining
    return period_start + period_clock
def get_right_digits(s):
    """
    Return the integer formed by the trailing digit characters of ``s``.

    Used to pull the final score off strings like "Boston Celtics 98".
    """
    idx = len(s)
    while s[idx - 1].isdigit():
        idx -= 1
    return int(s[idx:])
# Zero-padded month number -> three-letter English abbreviation.
month_map = {
    "01": "Jan", "02": "Feb", "03": "Mar",
    "04": "Apr", "05": "May", "06": "Jun",
    "07": "Jul", "08": "Aug", "09": "Sep",
    "10": "Oct", "11": "Nov", "12": "Dec"
}


def short_date(gamecode):
    """
    Render the date embedded in a gamecode (YYYYMMDD...) as e.g. "Aug 12".
    """
    month = gamecode[4:6]
    day = gamecode[6:8]
    return "{} {}".format(month_map[month], day)
def get_season(gamecode):
    """
    Map a gamecode to its season year.

    NBA seasons span the new year: games played after July belong to the
    season that finishes in the following calendar year.
    """
    year, month = int(gamecode[:4]), int(gamecode[4:6])
    return year + 1 if month > 7 else year
def process_boxscore(gamecode):
    """
    Return a dict of the players that played in a game and their team

    Downloads (or reads the cached) box-score page, records the game, both
    teams, and every player into redis through a single pipeline, then
    returns two dicts:
      team_dict: player id -> teamcode
      home_dict: teamcode -> "home" / "away"
    """
    pipe = rclient.pipeline()
    # NOTE(review): this argument order (name, member, score) matches the
    # redis-py 2.x zadd signature; redis-py 3.x expects a mapping instead —
    # confirm the installed client version.
    pipe.zadd("gamecodes", gamecode, day_from_gamecode(gamecode))
    url = "boxscores/{}.html".format(gamecode)
    html = get_page("basketball", url)
    soup = BeautifulSoup(html, "lxml")
    # The page header (five levels above this span) holds the team links
    # and the score spans.
    span = soup.find("span", {"class": "bold_text large_text"})
    header_table = list(span.parents)[5]
    links = header_table.findAll("a")
    team_links = [link for link in links if str(link).find("/teams/") > 0]
    spans = header_table.findAll("span")
    # NOTE: `span` in this comprehension shadows the lookup span above.
    team_spans = [span for span in spans if str(span).find("/teams/") > 0]
    # The away team is listed first; hrefs look like
    # "/teams/<CODE>/<year>.html", so [7:-5] minus "/" leaves the teamcode.
    away_team = team_links[0].get("href")[7:-5].replace("/", "")
    away_score = get_right_digits(team_spans[0].text)
    away_team_name = team_links[0].text
    home_team = team_links[1].get("href")[7:-5].replace("/", "")
    home_score = get_right_digits(team_spans[1].text)
    home_team_name = team_links[1].text
    # Team metadata hashes: "<teamcode>:info".
    pipe.hset(":".join([away_team, "info"]), "name", away_team_name)
    pipe.hset(":".join([away_team, "info"]), "teamcode", away_team)
    pipe.hset(":".join([home_team, "info"]), "name", home_team_name)
    pipe.hset(":".join([home_team, "info"]), "teamcode", home_team)
    # Game metadata hash: "<gamecode>:info".
    pipe.hset(":".join([gamecode, "info"]), "gamecode", gamecode)
    pipe.hset(":".join([gamecode, "info"]), "shortdate", short_date(gamecode))
    pipe.hset(":".join([gamecode, "info"]), "hometeam", home_team)
    pipe.hset(":".join([gamecode, "info"]), "hometeamname", home_team_name)
    pipe.hset(":".join([gamecode, "info"]), "homescore", home_score)
    pipe.hset(":".join([gamecode, "info"]), "awayteam", away_team)
    pipe.hset(":".join([gamecode, "info"]), "awayteamname", away_team_name)
    pipe.hset(":".join([gamecode, "info"]), "awayscore", away_score)
    team_dict = {}
    year = get_season(gamecode)
    # tables[0] holds the away-team box score, tables[2] the home team's.
    # Rows with more than 30 child nodes are treated as player stat lines.
    tables = soup.findAll("table", {"class": "sortable stats_table"})
    for i, row in enumerate(tables[0].findAll("tr")):
        for ptuple in get_players(row):
            if len(row.contents) > 30:
                pid = ptuple[0]
                pname = ptuple[1]
                team_dict[pid] = away_team
                pipe.sadd(":".join([gamecode, away_team]), pid)
                pipe.sadd("players", pid)
                pipe.sadd(":".join([pid, str(year), "teams"]), away_team)
                pipe.zadd(":".join([pid, "gamecodes"]),
                          gamecode, day_from_gamecode(gamecode))
                pipe.sadd(":".join([away_team, "players"]), pid)
                pipe.sadd(":".join([gamecode, away_team, "players"]), pid)
                pipe.hset(":".join([pid, "info"]), "name", pname)
                pipe.hset(":".join((pid, gamecode)), "name", pname)
                # "@" prefix marks a road game for the away side.
                pipe.hset(":".join((pid, gamecode)), "opp",
                          "@{}".format(home_team[:3]))
    for i, row in enumerate(tables[2].findAll("tr")):
        for ptuple in get_players(row):
            if len(row.contents) > 30:
                pid = ptuple[0]
                pname = ptuple[1]
                team_dict[pid] = home_team
                pipe.sadd(":".join([gamecode, home_team]), pid)
                pipe.sadd("players", pid)
                pipe.sadd(":".join([pid, str(year), "teams"]), home_team)
                pipe.zadd(":".join([pid, "gamecodes"]),
                          gamecode, day_from_gamecode(gamecode))
                pipe.sadd(":".join([home_team, "players"]), pid)
                pipe.sadd(":".join([gamecode, home_team, "players"]), pid)
                pipe.hset(":".join([pid, "info"]), "name", pname)
                pipe.hset(":".join((pid, gamecode)), "name", pname)
                pipe.hset(":".join((pid, gamecode)),
                          "opp", away_team[:3] + " ")
    pipe.execute()
    home_dict = {home_team: "home", away_team: "away"}
    return team_dict, home_dict
def clear_the_floor(onthefloor, substitutions, period):
    """
    Remove everyone that's on the floor and add substitutions out at the
    end of every period.

    ``period`` is the period about to start, so its start time is exactly
    the end of the period that just finished.
    """
    # Compute the period start directly instead of reading the module-level
    # period_starts table, which only covers periods 1-10 (identical values
    # in that range: quarters are 720 s, overtimes 300 s).
    if period > 4:
        period_start = 4 * 720 + (period - 5) * 300
    else:
        period_start = (period - 1) * 720
    for team, players in onthefloor.items():
        for player in players:
            substitutions[player].append(("out", period_start))
        onthefloor[team].clear()
def substitutions_to_rows(substitutions, gamecode, team_dict):
"""
Convert list of substitutions to a second by second histogram
"""
rows = []
all_seconds = np.arange(4680) + 1
for player, subs in substitutions.items():
if not len(subs) % 2 == 0:
bad_in_inds = [i for i, s in enumerate(subs)
if s[0] == "in" and i % 2 == 1]
subs.insert(bad_in_inds[0], ("out", subs[bad_in_inds[0] - 1][1]))
histogram = np.zeros(4680, dtype=np.uint8)
for i in range(0, len(subs) - 1, 2):
if subs[i][0] != "in":
print("MALFORMED SUBSTITUTION LIST!")
print(subs)
histogram[np.logical_and(
all_seconds >= subs[i][1], all_seconds <= subs[i + 1][1])] = 1
# TODO: add teamcode
rows.append({"histogram": histogram[:2880],
"player": player,
"team": team_dict[player],
"gamecode": gamecode})
return rows
def pack_game_histogram(histogram):
    """Bit-pack a 0/1 per-second histogram into a compact bytes blob."""
    packed = np.packbits(histogram)
    return packed.tobytes()
def update_player_times(rows):
    """
    Store each player's bit-packed minutes histogram for a game under the
    redis hash "<player>:<gamecode>".
    """
    pipe = rclient.pipeline()
    for row in rows:
        rkey = "{}:{}".format(row["player"], row["gamecode"])
        pipe.hset(rkey, "hist", pack_game_histogram(row["histogram"]))
    pipe.execute()
def subsample(hist):
    """Average the histogram over consecutive 4-second bins."""
    binned = hist.reshape(-1, 4)
    return binned.mean(axis=1)
def pack_totals_histogram(total, histogram):
    """
    Serialize a (game count, averaged histogram) pair as float32 bytes,
    with the count in the leading slot.
    """
    header = np.array([total], dtype=np.float32)
    body = np.asarray(histogram, dtype=np.float32)
    return np.concatenate((header, body)).tobytes()
def unpack_totals_histogram(buf):
    """
    Inverse of pack_totals_histogram: split the leading game count off the
    float32 buffer and return (total, histogram).
    """
    values = np.frombuffer(buf, dtype=np.float32)
    return values[0], values[1:]
def update_player_total(pid, year, teamcode, hist):
    """
    Fold one game's histogram into a player's running per-season average,
    stored in redis at "<pid>:<year>:<teamcode>".
    """
    rkey = ":".join([pid, str(year), teamcode])
    sampled = subsample(hist)
    if rclient.exists(rkey):
        count, running = unpack_totals_histogram(rclient.get(rkey))
        # Incremental mean: new_avg = (old_avg * n + sample) / (n + 1)
        running = (running * count + sampled) / (count + 1)
        rclient.set(rkey, pack_totals_histogram(count + 1, running))
    else:
        rclient.set(rkey, pack_totals_histogram(1, sampled))
def update_player_totals(rows, year, team_dict):
    """Push every row's histogram into its player's season running average."""
    for row in rows:
        pid = row["player"]
        hist = np.array(row["histogram"], dtype=np.int16)
        update_player_total(pid, year, team_dict[pid], hist)
def process_pbp(gamecode, debug=False):
    """
    Monster function to process the entire play by play

    Walks every row of the play-by-play table, tracking which players are
    on the floor, converts the inferred substitution events into
    per-second histograms, and writes them to redis. Returns the row
    dicts, or None when the play-by-play page isn't available yet.
    """
    url = "boxscores/pbp/{}.html".format(gamecode)
    html = get_page("basketball", url)
    soup = BeautifulSoup(html, "lxml")
    table = soup.find("table", {"class": "no_highlight stats_table"})
    if table is None or len(table) < 2:
        """
        This means the play by play is still not available for this game, so
        remove the downloaded html and recheck on the next update.
        """
        remove_page("basketball", url)
        return
    period = 0
    cachedrows = []         # rows sharing the current game-clock second
    currgameclock = 0
    year = get_season(gamecode)
    rows = table.findAll("tr")
    team_dict, home_dict = process_boxscore(gamecode)
    # teamcode -> set of player ids currently on the floor
    onthefloor = dict(zip(home_dict.keys(), [set(), set()]))
    # player id -> list of ("in"/"out", gameclock-second) events
    substitutions = dict([(player, []) for player in team_dict.keys()])
    for row in rows:
        # If this is a quarter end, go right to clearing the cached rows
        gameclock = None
        if not row.has_attr("id"):
            ptuples = get_players(row)
            if len(ptuples) == 0:
                continue
            gameclock = get_gameclock(period, row.contents[1].text)
            # Batch rows at the same second; they're processed together
            # once the clock advances.
            if gameclock == currgameclock:
                cachedrows.append(row)
                continue
        # Processing game events before substitutions that happened at the
        # same time will lead to less errors
        exited = set()
        entered = set()
        appeared = set()
        ignore_appearance = set()
        for crow in cachedrows:
            players = [t[0] for t in get_players(crow)]
            # If a player didn't make the box score but is in the play by
            # play, he probably go zero minutes. The game is almost over
            # and this substitution doesn't matter
            if not all([player in team_dict.keys() for player in players]):
                continue
            if debug:
                print(crow)
            # Substitution rows read "<A> enters the game for <B>":
            # players[0] comes in, players[1] (when present) goes out.
            if crow.text.find("enters the game") > 0:
                ignore_appearance.add(players[0])
                if players[0] in exited:
                    exited.remove(players[0])
                else:
                    entered.add(players[0])
                if len(players) > 1:
                    ignore_appearance.add(players[1])
                    if players[1] in entered:
                        entered.remove(players[1])
                    else:
                        exited.add(players[1])
            # Build up the active players from the beginning of each period
            # from in game stats as they happen
            # Bench technical players may not be on the floor
            # Ejected means he's not on the floor anymore
            if ((sum(map(len, onthefloor.values())) < 10) and
                    (crow.text.find("echnical foul by") < 0) and
                    (crow.text.find("ejected from game") < 0)):
                appeared.update(players)
        if debug:
            print(currgameclock)
            print(onthefloor)
            print("appeared: " + str(appeared))
            print("entered: " + str(entered))
            print("exited: " + str(exited))
        # If a player appeared during the same second that he was subbed, we
        # can't trust the order of those things. Ignore the appearance and
        # just trust the substitution
        appeared = appeared.difference(ignore_appearance)
        for player in appeared:
            if player not in onthefloor[team_dict[player]]:
                onthefloor[team_dict[player]].add(player)
                # He's been playing since the period started; backdate the
                # "in" event to the period start.
                substitutions[player].append(("in", period_starts[period]))
        for player in exited:
            if player in onthefloor[team_dict[player]]:
                onthefloor[team_dict[player]].remove(player)
            else:
                # Never seen entering this period: backdate an "in" so the
                # in/out pairs stay balanced.
                substitutions[player].append(("in", period_starts[period]))
            substitutions[player].append(("out", int(currgameclock)))
        for player in entered:
            onthefloor[team_dict[player]].add(player)
            substitutions[player].append(("in", int(currgameclock)))
        if debug:
            print(onthefloor)
        # Check the status of the players on the floor only after all the rows
        # at the same clock time have been processed
        if any(map(lambda s: len(s) > 5, onthefloor.values())):
            print("TOO MANY PLAYERS ON THE FLOOR!")
            print(gamecode)
            sys.exit()
        # If this is a quarter end, increment the period and clear the floor
        if row.has_attr("id"):
            period += 1
            clear_the_floor(onthefloor, substitutions, period)
        cachedrows = []
        cachedrows.append(row)
        currgameclock = gameclock
    # Close out the final period, then convert the substitution lists into
    # histogram rows and persist them.
    period += 1
    clear_the_floor(onthefloor, substitutions, period)
    rows = substitutions_to_rows(substitutions, gamecode, team_dict)
    # NOTE(review): substitutions_to_rows always returns a list, so this
    # branch looks unreachable — confirm before relying on it.
    if rows is None:
        print("Missing histograms")
        print(gamecode)
        sys.exit()
    update_player_times(rows)
    update_player_totals(rows, year, team_dict)
    return rows
def get_gamecodes(year):
    """
    Return the gamecodes of every game listed on a season's schedule page.
    """
    schedule_url = "leagues/NBA_{}_games.html".format(year)
    html = get_page("basketball", schedule_url, force=True)
    soup = BeautifulSoup(html, "lxml")
    games_table = soup.find("table", {"id": "games"})
    gamecodes = []
    for row in games_table.findAll("tr"):
        # Column 5 holds the box-score link; its href embeds the gamecode.
        anchor = row.contents[5].find("a")
        if anchor is not None:
            gamecodes.append(anchor.get("href")[11:-5])
    return gamecodes
def update_database():
    """
    Process every game, across all configured seasons, that redis has not
    recorded yet.
    """
    seen = set(g.decode("utf-8")
               for g in rclient.zrange("gamecodes", 0, -1))
    for year in years:
        for gamecode in get_gamecodes(year):
            if gamecode not in seen:
                print(gamecode)
                process_pbp(gamecode)
def refresh_database():
    """
    Process unseen games from the most recent season only.
    """
    seen = set(g.decode("utf-8")
               for g in rclient.zrange("gamecodes", 0, -1))
    for gamecode in get_gamecodes(years[-1]):
        if gamecode not in seen:
            print(gamecode)
            process_pbp(gamecode)
def rebuild_database():
    """
    Wipe redis entirely and reprocess every game from scratch.
    """
    rclient.flushdb()
    for year in years:
        for gamecode in get_gamecodes(year):
            print(gamecode)
            process_pbp(gamecode)
def sync_files():
    """
    Pre-download the play-by-play and box-score html for every game in
    every configured season.
    """
    for year in years:
        for gamecode in get_gamecodes(year):
            get_page("basketball", "boxscores/pbp/{}.html".format(gamecode))
            get_page("basketball", "boxscores/{}.html".format(gamecode))
def main():
    """
    Command line entry point: dispatch on the TASK argument, falling back
    to treating the argument as a single gamecode to (re)process in debug
    mode.
    """
    parser = argparse.ArgumentParser(prog="scrape")
    parser.add_argument("task", metavar="TASK", type=str,
                        help="The task to complete. "
                             "[update, refresh, rebuild, or sync]")
    args = parser.parse_args()
    task = args.task.lower()
    # Bug fix: "update" used to be a separate `if`, so after updating,
    # control fell through to the second chain's `else` and tried to
    # process the literal string "update" as a gamecode. A single if/elif
    # chain dispatches exactly once.
    if task == "update":
        update_database()
    elif task == "refresh":
        refresh_database()
    elif task == "rebuild":
        rebuild_database()
    elif task == "sync":
        sync_files()
    else:
        process_pbp(args.task, debug=True)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment