Skip to content

Instantly share code, notes, and snippets.

@mrphilroth
Created August 12, 2016 01:31
Show Gist options
  • Save mrphilroth/aaf4bc875652979e592ccbaf0f3e954c to your computer and use it in GitHub Desktop.
Save mrphilroth/aaf4bc875652979e592ccbaf0f3e954c to your computer and use it in GitHub Desktop.
Scrape basketball data and process the play by play data
#!/usr/bin/python
import os
import sys
import time
import redis
import requests
import argparse
import datetime
import numpy as np
from bs4 import BeautifulSoup
# Seasons to scrape: 2010 through 2016 (range end is exclusive).
years = list(range(2010, 2017))

# Shared redis connection; every scraped fact is stored here.
rclient = redis.Redis(host="localhost")

# The number of seconds of game time at the start of each period
# (regulation quarters run 720 s / 12 min, overtime periods 300 s / 5 min;
# periods beyond 10 are not covered by this table).
period_starts = {
    1: 0, 2: 720, 3: 1440, 4: 2160,
    5: 2880, 6: 3180, 7: 3480, 8: 3780, 9: 4080, 10: 4380
}

# Root of the local html mirror: ~/data/<sport>/...
ddir = os.path.join(os.environ["HOME"], "data")

# Per-sport local cache directory plus the sports-reference site it mirrors.
sportdict = {
    "baseball": {
        "dir": os.path.abspath(ddir + "/baseball"),
        "url": "http://www.baseball-reference.com"
    },
    "basketball": {
        "dir": os.path.abspath(ddir + "/basketball"),
        "url": "http://www.basketball-reference.com"
    },
    "football": {
        "dir": os.path.abspath(ddir + "/football"),
        "url": "http://www.pro-football-reference.com"
    }
}
def get_page(sport, ext, filename=None, force=False):
    """
    Mirror a directory structure found at a base url in a local directory.

    Returns the page html, serving it from the local cache unless ``force``
    is set or the cached file doesn't exist yet.

    :param sport: key into sportdict ("baseball"/"basketball"/"football")
    :param ext: path of the page relative to the site root
    :param filename: override for the local cache path
    :param force: when True, re-download even if a cached copy exists
    """
    if filename is None:
        filename = "{}/{}".format(sportdict[sport]["dir"], ext)
    if not force and os.path.exists(filename):
        # Close the handle deterministically instead of leaking it.
        with open(filename, encoding="utf-8") as cached:
            return cached.read()
    # Fix: was os.path.os.path.dirname, which only worked because os.path
    # happens to re-export os.
    filedir = os.path.dirname(filename)
    os.makedirs(filedir, exist_ok=True)
    url = "{}/{}".format(sportdict[sport]["url"], ext)
    print(url)
    # Retry with a linearly growing delay until the site returns 200.
    # NOTE(review): a permanently missing page loops forever — consider a
    # retry cap.
    currdelay = 1
    r = requests.get(url)
    while r.status_code != 200:
        time.sleep(currdelay)
        r = requests.get(url)
        currdelay += 1
    html = r.text
    # requests decodes to unicode; pin utf-8 so the cache round-trips
    # regardless of locale.
    with open(filename, "w", encoding="utf-8") as out:
        out.write(html)
    return html
def remove_page(sport, ext, filename=None):
    """
    Delete a previously downloaded page from the local mirror, if present.
    """
    target = filename
    if target is None:
        target = "{}/{}".format(sportdict[sport]["dir"], ext)
    if os.path.exists(target):
        os.remove(target)
def day_from_gamecode(gamecode):
    """
    Get a day index so that games are added to redis sorted.

    Gamecodes start with the date as YYYYMMDD; the returned value is the
    whole number of days since the Unix epoch (as a float, for use as a
    redis sorted-set score).
    """
    year = int(gamecode[:4])
    month = int(gamecode[4:6])
    day = int(gamecode[6:8])
    # Anchor to UTC: a naive datetime's .timestamp() depends on the local
    # timezone and could shift the day index by one across machines.
    game_day = datetime.datetime(year, month, day,
                                 tzinfo=datetime.timezone.utc)
    return float(int(game_day.timestamp() / 86400))
def get_players(row):
    """
    Extract every player referenced in a table row as (id, name) tuples.

    Player hrefs look like "/players/x/<id>.html"; the [11:-5] slice keeps
    just the id portion.
    """
    players = []
    for anchor in row.findAll("a"):
        players.append((anchor.get("href")[11:-5], anchor.text))
    return players
def get_gameclock(period, minute_str):
    """
    Return the seconds of game time elapsed so far, given the current
    period and the displayed countdown clock ("MM:SS" or "MM:SS.t").

    Regulation quarters last 12 minutes; every overtime period lasts 5.
    The period start is computed directly rather than read from the
    module-level period_starts table (which stops at period 10), so any
    number of overtimes works. Values match the table for periods 1-10.
    """
    minutes, seconds = [float(part) for part in minute_str.split(":")]
    remaining = 60 * minutes + seconds
    if period > 4:
        # Four regulation quarters plus any completed 5-minute overtimes.
        period_start = 4 * 720 + (period - 5) * 300
        period_clock = 5 * 60 - remaining
    else:
        period_start = (period - 1) * 720
        period_clock = 12 * 60 - remaining
    return period_start + period_clock
def get_right_digits(s):
    """
    Return the integer formed by the trailing digit characters of ``s``.

    Used to pull the final score off strings like "Boston Celtics 98".
    """
    idx = len(s)
    while s[idx - 1].isdigit():
        idx -= 1
    return int(s[idx:])
# Zero-padded month number -> three-letter English abbreviation.
month_map = {
    "01": "Jan", "02": "Feb", "03": "Mar",
    "04": "Apr", "05": "May", "06": "Jun",
    "07": "Jul", "08": "Aug", "09": "Sep",
    "10": "Oct", "11": "Nov", "12": "Dec"
}


def short_date(gamecode):
    """
    Render the date embedded in a gamecode (YYYYMMDD...) as e.g. "Aug 12".
    """
    month = gamecode[4:6]
    day = gamecode[6:8]
    return "{} {}".format(month_map[month], day)
def get_season(gamecode):
    """
    Map a gamecode to its season year.

    NBA seasons span the new year: games played after July belong to the
    season that finishes in the following calendar year.
    """
    year, month = int(gamecode[:4]), int(gamecode[4:6])
    return year + 1 if month > 7 else year
def process_boxscore(gamecode):
    """
    Return a dict of the players that played in a game and their team

    Downloads (or reads the cached) box-score page, records the game, both
    teams, and every player into redis through a single pipeline, then
    returns two dicts:
      team_dict: player id -> teamcode
      home_dict: teamcode -> "home" / "away"
    """
    pipe = rclient.pipeline()
    # NOTE(review): this argument order (name, member, score) matches the
    # redis-py 2.x zadd signature; redis-py 3.x expects a mapping instead —
    # confirm the installed client version.
    pipe.zadd("gamecodes", gamecode, day_from_gamecode(gamecode))
    url = "boxscores/{}.html".format(gamecode)
    html = get_page("basketball", url)
    soup = BeautifulSoup(html, "lxml")
    # The page header (five levels above this span) holds the team links
    # and the score spans.
    span = soup.find("span", {"class": "bold_text large_text"})
    header_table = list(span.parents)[5]
    links = header_table.findAll("a")
    team_links = [link for link in links if str(link).find("/teams/") > 0]
    spans = header_table.findAll("span")
    # NOTE: `span` in this comprehension shadows the lookup span above.
    team_spans = [span for span in spans if str(span).find("/teams/") > 0]
    # The away team is listed first; hrefs look like
    # "/teams/<CODE>/<year>.html", so [7:-5] minus "/" leaves the teamcode.
    away_team = team_links[0].get("href")[7:-5].replace("/", "")
    away_score = get_right_digits(team_spans[0].text)
    away_team_name = team_links[0].text
    home_team = team_links[1].get("href")[7:-5].replace("/", "")
    home_score = get_right_digits(team_spans[1].text)
    home_team_name = team_links[1].text
    # Team metadata hashes: "<teamcode>:info".
    pipe.hset(":".join([away_team, "info"]), "name", away_team_name)
    pipe.hset(":".join([away_team, "info"]), "teamcode", away_team)
    pipe.hset(":".join([home_team, "info"]), "name", home_team_name)
    pipe.hset(":".join([home_team, "info"]), "teamcode", home_team)
    # Game metadata hash: "<gamecode>:info".
    pipe.hset(":".join([gamecode, "info"]), "gamecode", gamecode)
    pipe.hset(":".join([gamecode, "info"]), "shortdate", short_date(gamecode))
    pipe.hset(":".join([gamecode, "info"]), "hometeam", home_team)
    pipe.hset(":".join([gamecode, "info"]), "hometeamname", home_team_name)
    pipe.hset(":".join([gamecode, "info"]), "homescore", home_score)
    pipe.hset(":".join([gamecode, "info"]), "awayteam", away_team)
    pipe.hset(":".join([gamecode, "info"]), "awayteamname", away_team_name)
    pipe.hset(":".join([gamecode, "info"]), "awayscore", away_score)
    team_dict = {}
    year = get_season(gamecode)
    # tables[0] holds the away-team box score, tables[2] the home team's.
    # Rows with more than 30 child nodes are treated as player stat lines.
    tables = soup.findAll("table", {"class": "sortable stats_table"})
    for i, row in enumerate(tables[0].findAll("tr")):
        for ptuple in get_players(row):
            if len(row.contents) > 30:
                pid = ptuple[0]
                pname = ptuple[1]
                team_dict[pid] = away_team
                pipe.sadd(":".join([gamecode, away_team]), pid)
                pipe.sadd("players", pid)
                pipe.sadd(":".join([pid, str(year), "teams"]), away_team)
                pipe.zadd(":".join([pid, "gamecodes"]),
                          gamecode, day_from_gamecode(gamecode))
                pipe.sadd(":".join([away_team, "players"]), pid)
                pipe.sadd(":".join([gamecode, away_team, "players"]), pid)
                pipe.hset(":".join([pid, "info"]), "name", pname)
                pipe.hset(":".join((pid, gamecode)), "name", pname)
                # "@" prefix marks a road game for the away side.
                pipe.hset(":".join((pid, gamecode)), "opp",
                          "@{}".format(home_team[:3]))
    for i, row in enumerate(tables[2].findAll("tr")):
        for ptuple in get_players(row):
            if len(row.contents) > 30:
                pid = ptuple[0]
                pname = ptuple[1]
                team_dict[pid] = home_team
                pipe.sadd(":".join([gamecode, home_team]), pid)
                pipe.sadd("players", pid)
                pipe.sadd(":".join([pid, str(year), "teams"]), home_team)
                pipe.zadd(":".join([pid, "gamecodes"]),
                          gamecode, day_from_gamecode(gamecode))
                pipe.sadd(":".join([home_team, "players"]), pid)
                pipe.sadd(":".join([gamecode, home_team, "players"]), pid)
                pipe.hset(":".join([pid, "info"]), "name", pname)
                pipe.hset(":".join((pid, gamecode)), "name", pname)
                pipe.hset(":".join((pid, gamecode)),
                          "opp", away_team[:3] + " ")
    pipe.execute()
    home_dict = {home_team: "home", away_team: "away"}
    return team_dict, home_dict
def clear_the_floor(onthefloor, substitutions, period):
    """
    Remove everyone that's on the floor and add substitutions out at the
    end of every period.

    ``period`` is the period about to start, so its start time is exactly
    the end of the period that just finished.
    """
    # Compute the period start directly instead of reading the module-level
    # period_starts table, which only covers periods 1-10 (identical values
    # in that range: quarters are 720 s, overtimes 300 s).
    if period > 4:
        period_start = 4 * 720 + (period - 5) * 300
    else:
        period_start = (period - 1) * 720
    for team, players in onthefloor.items():
        for player in players:
            substitutions[player].append(("out", period_start))
        onthefloor[team].clear()
def substitutions_to_rows(substitutions, gamecode, team_dict):
"""
Convert list of substitutions to a second by second histogram
"""
rows = []
all_seconds = np.arange(4680) + 1
for player, subs in substitutions.items():
if not len(subs) % 2 == 0:
bad_in_inds = [i for i, s in enumerate(subs)
if s[0] == "in" and i % 2 == 1]
subs.insert(bad_in_inds[0], ("out", subs[bad_in_inds[0] - 1][1]))
histogram = np.zeros(4680, dtype=np.uint8)
for i in range(0, len(subs) - 1, 2):
if subs[i][0] != "in":
print("MALFORMED SUBSTITUTION LIST!")
print(subs)
histogram[np.logical_and(
all_seconds >= subs[i][1], all_seconds <= subs[i + 1][1])] = 1
# TODO: add teamcode
rows.append({"histogram": histogram[:2880],
"player": player,
"team": team_dict[player],
"gamecode": gamecode})
return rows
def pack_game_histogram(histogram):
    """Bit-pack a 0/1 per-second histogram into a compact bytes blob."""
    packed = np.packbits(histogram)
    return packed.tobytes()
def update_player_times(rows):
    """
    Store each player's bit-packed minutes histogram for a game under the
    redis hash "<player>:<gamecode>".
    """
    pipe = rclient.pipeline()
    for row in rows:
        rkey = "{}:{}".format(row["player"], row["gamecode"])
        pipe.hset(rkey, "hist", pack_game_histogram(row["histogram"]))
    pipe.execute()
def subsample(hist):
    """Average the histogram over consecutive 4-second bins."""
    binned = hist.reshape(-1, 4)
    return binned.mean(axis=1)
def pack_totals_histogram(total, histogram):
    """
    Serialize a (game count, averaged histogram) pair as float32 bytes,
    with the count in the leading slot.
    """
    header = np.array([total], dtype=np.float32)
    body = np.asarray(histogram, dtype=np.float32)
    return np.concatenate((header, body)).tobytes()
def unpack_totals_histogram(buf):
    """
    Inverse of pack_totals_histogram: split the leading game count off the
    float32 buffer and return (total, histogram).
    """
    values = np.frombuffer(buf, dtype=np.float32)
    return values[0], values[1:]
def update_player_total(pid, year, teamcode, hist):
    """
    Fold one game's histogram into a player's running per-season average,
    stored in redis at "<pid>:<year>:<teamcode>".
    """
    rkey = ":".join([pid, str(year), teamcode])
    sampled = subsample(hist)
    if rclient.exists(rkey):
        count, running = unpack_totals_histogram(rclient.get(rkey))
        # Incremental mean: new_avg = (old_avg * n + sample) / (n + 1)
        running = (running * count + sampled) / (count + 1)
        rclient.set(rkey, pack_totals_histogram(count + 1, running))
    else:
        rclient.set(rkey, pack_totals_histogram(1, sampled))
def update_player_totals(rows, year, team_dict):
    """Push every row's histogram into its player's season running average."""
    for row in rows:
        pid = row["player"]
        hist = np.array(row["histogram"], dtype=np.int16)
        update_player_total(pid, year, team_dict[pid], hist)
def process_pbp(gamecode, debug=False):
    """
    Monster function to process the entire play by play

    Walks every row of the play-by-play table, tracking which players are
    on the floor, converts the inferred substitution events into
    per-second histograms, and writes them to redis. Returns the row
    dicts, or None when the play-by-play page isn't available yet.
    """
    url = "boxscores/pbp/{}.html".format(gamecode)
    html = get_page("basketball", url)
    soup = BeautifulSoup(html, "lxml")
    table = soup.find("table", {"class": "no_highlight stats_table"})
    if table is None or len(table) < 2:
        """
        This means the play by play is still not available for this game, so
        remove the downloaded html and recheck on the next update.
        """
        remove_page("basketball", url)
        return
    period = 0
    cachedrows = []         # rows sharing the current game-clock second
    currgameclock = 0
    year = get_season(gamecode)
    rows = table.findAll("tr")
    team_dict, home_dict = process_boxscore(gamecode)
    # teamcode -> set of player ids currently on the floor
    onthefloor = dict(zip(home_dict.keys(), [set(), set()]))
    # player id -> list of ("in"/"out", gameclock-second) events
    substitutions = dict([(player, []) for player in team_dict.keys()])
    for row in rows:
        # If this is a quarter end, go right to clearing the cached rows
        gameclock = None
        if not row.has_attr("id"):
            ptuples = get_players(row)
            if len(ptuples) == 0:
                continue
            gameclock = get_gameclock(period, row.contents[1].text)
            # Batch rows at the same second; they're processed together
            # once the clock advances.
            if gameclock == currgameclock:
                cachedrows.append(row)
                continue
        # Processing game events before substitutions that happened at the
        # same time will lead to less errors
        exited = set()
        entered = set()
        appeared = set()
        ignore_appearance = set()
        for crow in cachedrows:
            players = [t[0] for t in get_players(crow)]
            # If a player didn't make the box score but is in the play by
            # play, he probably go zero minutes. The game is almost over
            # and this substitution doesn't matter
            if not all([player in team_dict.keys() for player in players]):
                continue
            if debug:
                print(crow)
            # Substitution rows read "<A> enters the game for <B>":
            # players[0] comes in, players[1] (when present) goes out.
            if crow.text.find("enters the game") > 0:
                ignore_appearance.add(players[0])
                if players[0] in exited:
                    exited.remove(players[0])
                else:
                    entered.add(players[0])
                if len(players) > 1:
                    ignore_appearance.add(players[1])
                    if players[1] in entered:
                        entered.remove(players[1])
                    else:
                        exited.add(players[1])
            # Build up the active players from the beginning of each period
            # from in game stats as they happen
            # Bench technical players may not be on the floor
            # Ejected means he's not on the floor anymore
            if ((sum(map(len, onthefloor.values())) < 10) and
                    (crow.text.find("echnical foul by") < 0) and
                    (crow.text.find("ejected from game") < 0)):
                appeared.update(players)
        if debug:
            print(currgameclock)
            print(onthefloor)
            print("appeared: " + str(appeared))
            print("entered: " + str(entered))
            print("exited: " + str(exited))
        # If a player appeared during the same second that he was subbed, we
        # can't trust the order of those things. Ignore the appearance and
        # just trust the substitution
        appeared = appeared.difference(ignore_appearance)
        for player in appeared:
            if player not in onthefloor[team_dict[player]]:
                onthefloor[team_dict[player]].add(player)
                # He's been playing since the period started; backdate the
                # "in" event to the period start.
                substitutions[player].append(("in", period_starts[period]))
        for player in exited:
            if player in onthefloor[team_dict[player]]:
                onthefloor[team_dict[player]].remove(player)
            else:
                # Never seen entering this period: backdate an "in" so the
                # in/out pairs stay balanced.
                substitutions[player].append(("in", period_starts[period]))
            substitutions[player].append(("out", int(currgameclock)))
        for player in entered:
            onthefloor[team_dict[player]].add(player)
            substitutions[player].append(("in", int(currgameclock)))
        if debug:
            print(onthefloor)
        # Check the status of the players on the floor only after all the rows
        # at the same clock time have been processed
        if any(map(lambda s: len(s) > 5, onthefloor.values())):
            print("TOO MANY PLAYERS ON THE FLOOR!")
            print(gamecode)
            sys.exit()
        # If this is a quarter end, increment the period and clear the floor
        if row.has_attr("id"):
            period += 1
            clear_the_floor(onthefloor, substitutions, period)
        cachedrows = []
        cachedrows.append(row)
        currgameclock = gameclock
    # Close out the final period, then convert the substitution lists into
    # histogram rows and persist them.
    period += 1
    clear_the_floor(onthefloor, substitutions, period)
    rows = substitutions_to_rows(substitutions, gamecode, team_dict)
    # NOTE(review): substitutions_to_rows always returns a list, so this
    # branch looks unreachable — confirm before relying on it.
    if rows is None:
        print("Missing histograms")
        print(gamecode)
        sys.exit()
    update_player_times(rows)
    update_player_totals(rows, year, team_dict)
    return rows
def get_gamecodes(year):
    """
    Return the gamecodes of every game listed on a season's schedule page.
    """
    schedule_url = "leagues/NBA_{}_games.html".format(year)
    html = get_page("basketball", schedule_url, force=True)
    soup = BeautifulSoup(html, "lxml")
    games_table = soup.find("table", {"id": "games"})
    gamecodes = []
    for row in games_table.findAll("tr"):
        # Column 5 holds the box-score link; its href embeds the gamecode.
        anchor = row.contents[5].find("a")
        if anchor is not None:
            gamecodes.append(anchor.get("href")[11:-5])
    return gamecodes
def update_database():
    """
    Process every game, across all configured seasons, that redis has not
    recorded yet.
    """
    seen = set(g.decode("utf-8")
               for g in rclient.zrange("gamecodes", 0, -1))
    for year in years:
        for gamecode in get_gamecodes(year):
            if gamecode not in seen:
                print(gamecode)
                process_pbp(gamecode)
def refresh_database():
    """
    Process unseen games from the most recent season only.
    """
    seen = set(g.decode("utf-8")
               for g in rclient.zrange("gamecodes", 0, -1))
    for gamecode in get_gamecodes(years[-1]):
        if gamecode not in seen:
            print(gamecode)
            process_pbp(gamecode)
def rebuild_database():
    """
    Wipe redis entirely and reprocess every game from scratch.
    """
    rclient.flushdb()
    for year in years:
        for gamecode in get_gamecodes(year):
            print(gamecode)
            process_pbp(gamecode)
def sync_files():
    """
    Pre-download the play-by-play and box-score html for every game in
    every configured season.
    """
    for year in years:
        for gamecode in get_gamecodes(year):
            get_page("basketball", "boxscores/pbp/{}.html".format(gamecode))
            get_page("basketball", "boxscores/{}.html".format(gamecode))
def main():
    """
    Command line entry point: dispatch on the TASK argument, falling back
    to treating the argument as a single gamecode to (re)process in debug
    mode.
    """
    parser = argparse.ArgumentParser(prog="scrape")
    parser.add_argument("task", metavar="TASK", type=str,
                        help="The task to complete. "
                             "[update, refresh, rebuild, or sync]")
    args = parser.parse_args()
    task = args.task.lower()
    # Bug fix: "update" used to be a separate `if`, so after updating,
    # control fell through to the second chain's `else` and tried to
    # process the literal string "update" as a gamecode. A single if/elif
    # chain dispatches exactly once.
    if task == "update":
        update_database()
    elif task == "refresh":
        refresh_database()
    elif task == "rebuild":
        rebuild_database()
    elif task == "sync":
        sync_files()
    else:
        process_pbp(args.task, debug=True)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment