# Source: GitHub Gist nrjones8/9821474 (created March 27, 2014 23:27).
import bs4 as bs
import requests as r
import re
import pandas as pd
# A score cell reads "<away>-<home>", e.g. "12-10".  Both sides must contain at
# least one digit so that a bare "-" is not treated as a score (int('') fails).
SCORE_RE = r'[0-9]+-[0-9]+$'
# A game-clock cell reads "<minutes>:<seconds>", e.g. "19:45".  Seconds are
# required so that a bare ":" is not treated as a timestamp.
TIMESTAMP_RE = r'[0-9]*:[0-9]+$'
# Could be either a half (in NCAA) or quarter (in NBA)
END_OF_PERIOD_RE = r'End of'
def parse_time(time_str):
    """Convert a game-clock string into fractional minutes.

    input: a string like <MINUTE>:<SECONDS>, e.g. '10:30' -> 10.5.
    Surrounding whitespace is ignored and an empty minutes field
    (':45') is read as zero minutes.

    Raises ValueError if the string does not contain exactly one ':' or
    if either part is not numeric.
    """
    minutes, seconds = time_str.strip().split(':')
    return float(minutes or 0) + float(seconds) / 60
def convert_global_time(relative_time, half):
    """Return the minutes elapsed since the start of the game.

    relative_time: minutes remaining on the clock in the current period.
    half: 1-based period number.  Periods 1 and 2 are the 20-minute
        halves; period 3 and up are 5-minute overtime periods.

    Regulation results fall in the range 0 - 40; overtime results
    continue past 40 in 5-minute steps.
    """
    half_length = 20.0
    overtime_length = 5.0

    if half < 3:
        period_start = half_length * (half - 1)
        period_length = half_length
    else:
        # Overtime periods follow the two halves back to back, so each
        # earlier overtime contributes 5 minutes (not 20) to the offset.
        period_start = 2 * half_length + overtime_length * (half - 3)
        period_length = overtime_length

    since_start_of_period = period_length - relative_time
    return period_start + since_start_of_period
def make_uniform_time_intervals(events, times):
    """Resample a game's events onto a fixed grid of times.

    Based on the game stored in <events>, make a new list of events
    (dictionaries) with scores for each time in <times>.
    Useful for comparing games.

    events: list of event dictionaries, each with a 'time' key; the scan
        below only moves forward, so it expects them in ascending time order.
    times: the sample times, also expected in ascending order.

    Returns one new dictionary per entry of <times>; the input dictionaries
    are copied, never modified.  Returns [] when <events> is empty.
    """
    if not events:
        # Nothing to sample from (e.g. a page with no scoring rows).
        return []

    new_events = []
    event_index = 0
    last_index = len(events) - 1
    cur_event = events[event_index]

    for t in times:
        # Advance to the first event whose time is not before <t>, stopping
        # at the final event once every event has been consumed.
        # NOTE(review): this samples the first event at or after <t>, not the
        # latest one before it - confirm that look-ahead is intended.
        while t > cur_event['time'] and event_index < last_index:
            event_index += 1
            cur_event = events[event_index]

        # Copy cur_event, but set its time to be <t>
        event = dict(cur_event)
        event['time'] = t
        new_events.append(event)

    return new_events
def parse_team_names_and_rankings(soup):
    """Grab the team names and rankings from the summary/linescore table.

    soup: parsed play-by-play page (anything offering BeautifulSoup's
        find / find_all interface).

    Returns (away_name, away_rank, home_name, home_rank).  Both ranks are
    -1 unless the page lists exactly two of them.

    Raises ValueError if the linescore table does not contain exactly two
    team links.
    """
    linescore_table = soup.find('table', {'class': 'linescore'})
    away_name, home_name = [t.text for t in linescore_table.find_all('a')]

    # find_all() yields an empty list (never None) when nothing matches, so
    # the "no rankings" case has to be detected by counting the results.
    # First rank cell has no text, hence the emptiness filter.
    ranks = []
    for cell in soup.find_all('td', {'class': 'teamRank'}):
        rank_text = cell.text.replace('#', '').strip()
        if rank_text:
            ranks.append(int(rank_text))

    if len(ranks) == 2:
        away_rank, home_rank = ranks
    else:
        away_rank, home_rank = -1, -1

    return away_name, away_rank, home_name, home_rank
def parse_game_urls(scoreboard_url=None):
    """Scrape <scoreboard_url> and extract any URLs that lead to Play-by-Plays.

    The page's links carry relative URLs (such as
    '/nba/playbyplay?gameId=400489876'); each one is joined with the base
    ESPN url before being returned.

    Returns a list of absolute play-by-play URLs, in page order.
    """
    if scoreboard_url is None:
        # NOTE(review): the fallback is an empty string, which requests
        # rejects as an invalid URL - presumably a default scoreboard URL
        # belongs here; confirm.
        scoreboard_url = ''

    # A timeout keeps a stalled connection from hanging the whole scrape.
    response = r.get(scoreboard_url, timeout=30)
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = bs.BeautifulSoup(response.text, 'html.parser')

    game_urls = []
    for link in soup.find_all('a'):
        # NOTE: the hyphens in this literal are not ASCII '-' (they appear to
        # be non-breaking hyphens, U+2011); keep them exactly as they are.
        if 'Play‑By‑Play' in link.text:
            href = link.get('href')
            # Skip anchors that carry the label but no target.
            if href:
                game_urls.append(href)

    return [ESPN_BASE_URL + u for u in game_urls]
def process_one_game(url, round_num, time_intervals=None):
    """Returns list of dictionaries storing events for game with play-by-play
    URL given by <url>.

    url: absolute URL of the play-by-play page.
    round_num: value stored under 'round_num' in every event.
    time_intervals: optional list of times; when given, the events are
        resampled onto those times with make_uniform_time_intervals.

    Each event dictionary has the keys 'game_id', 'round_num', 'away',
    'away_rank', 'home', 'home_rank', 'time', 'away_score', 'home_score',
    'diff_score' and 'rank_diff'.  One event is recorded per table row in
    which the score changed.
    """
    print('Working on', url)
    # A timeout keeps a stalled connection from hanging the whole scrape.
    response = r.get(url, timeout=30)
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = bs.BeautifulSoup(response.text, 'html.parser')

    away_name, away_rank, home_name, home_rank = parse_team_names_and_rankings(soup)
    rank_diff = abs(away_rank - home_rank)
    game_id = '%s-%s' % (away_name, home_name)
    # Store which team has "better" rank, i.e. a lower number
    home_higher_rank = home_rank < away_rank

    period = 1
    # Most recent clock reading; carried over from row to row.
    cur_time = None
    previous_away_score = 0
    previous_home_score = 0
    # A list of dictionaries
    all_events = []

    for row in soup.find_all('tr'):
        # Only create an event on rows with scoring events
        event_row = False
        for col in row.find_all('td'):
            text = col.text
            if re.match(SCORE_RE, text):
                away_score, home_score = [int(s) for s in text.split('-')]
                # Only save an event if one of the scores changed
                event_row = (away_score != previous_away_score
                             or home_score != previous_home_score)
                previous_away_score = away_score
                previous_home_score = home_score
            if re.match(TIMESTAMP_RE, text):
                cur_time = parse_time(text)
            if re.match(END_OF_PERIOD_RE, text):
                period += 1

        # A scoring row seen before any clock reading cannot be placed in time.
        if not event_row or cur_time is None:
            continue

        # Margin from the point of view of the better-ranked team.
        if home_higher_rank:
            diff_score = home_score - away_score
        else:
            diff_score = away_score - home_score

        all_events.append({
            'game_id': game_id,
            'round_num': round_num,
            'away': away_name,
            'away_rank': away_rank,
            'home': home_name,
            'home_rank': home_rank,
            'time': convert_global_time(cur_time, period),
            'away_score': away_score,
            'home_score': home_score,
            'diff_score': diff_score,
            'rank_diff': rank_diff,
        })

    # With no events there is nothing to resample; return the empty list.
    if time_intervals is None or not all_events:
        return all_events
    return make_uniform_time_intervals(all_events, time_intervals)
def process_one_day(scoreboard_url, round_num, time_intervals=None):
    """Returns list of dictionaries containing events from all games linked to
    by <scoreboard_url> i.e. all the games played on given day.

    round_num and time_intervals are passed unchanged to process_one_game
    for every game found on the scoreboard.
    """
    # A list of dictionaries, accumulated across every game of the day
    all_games = []
    for url in parse_game_urls(scoreboard_url):
        all_games.extend(process_one_game(url, round_num, time_intervals))
    return all_games
def process_tournament(outfile='data/tournament_pbp.csv', time_intervals=None):
    """Scrape every game of the covered tournament days and write them to CSV.

    outfile: path of the CSV file to write; its parent directory is created
        if it does not exist yet.
    time_intervals: passed unchanged to process_one_day.

    Looking at March 20 - 23 (2014) for the moment.
    """
    import os

    # Make sure the destination exists before the scrape starts, so the
    # collected data is not lost to a missing directory at write time.
    out_dir = os.path.dirname(outfile)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # Adding a day of the month to this gives a YYYYMMDD date.
    march = 20140300
    # Day of the month -> tournament round played on that day.
    day_to_round = {
        20: 2,
        21: 2,
        22: 3,
        23: 3,
    }

    # List of dictionaries
    all_games = []
    for day in sorted(day_to_round):
        day_url = NCAA_BASE_URL + '?date=' + str(march + day)
        all_games += process_one_day(day_url, day_to_round[day], time_intervals)

    df = pd.DataFrame(all_games)
    df.to_csv(outfile, index=False)
if __name__ == '__main__':
    # Quarter-minute sample times, from 0 --> 40.75
    times = [.25 * t for t in range(4 * 41)]
    # Run the scrape on that grid; without this call <times> is never used.
    process_tournament(time_intervals=times)
# End of gist. (Discussion for this gist lives on GitHub; sign-in is required to comment.)