Skip to content

Instantly share code, notes, and snippets.

@linwoodc3
Last active February 25, 2017 21:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save linwoodc3/55d1cf413089560f157030b94746844f to your computer and use it in GitHub Desktop.
Save linwoodc3/55d1cf413089560f157030b94746844f to your computer and use it in GitHub Desktop.
Utility functions to clean ESPN.com's NBA data.
# By: Linwood Creekmore
# https://www.linkedin.com/in/linwood-creekmore-iii-a2174538/
from dateutil.parser import parse
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import datetime
import re
import numpy as np
import pandas as pd
import pysal as ps
def datecheck(row,year):
"""Function to convert different datetime columns into datetime object
Uses basic rule to create 2016 and 2017 datetime objects based on knowledge
Parameters
------------
row : pandas row
row of pandas dataframe to perform the operations
year: int
Integer representing the desired year
Returns
------------
Series ; pandas series
Series with datetime objects
"""
if datetime.datetime.now().year == parse(str(year)).year:
year1=year-1
year2=year
else:
year1 = year-1
year2=year
try:
date = str(row['Date']+' {0} '.format(year1)+ row['Time'])
try:
converted = parse(row['Date']+' {0} '.format(year1)+ row['Time'])
except:
try:
converted = parse(row['Date']+' {0} '.format(year2)+ row['Time'])
except:
pass
if converted < parse('Oct 01 {0}'.format(year1)):
try:
return parse(row['Date'] + " {0} ".format(year2)+ row['Time'])
except:
print(row)
else:
return converted
except:
print(date)
pass
def tierclean(tier):
""" Reverses the tier scoring; makes 5 equal to 1
"""
if tier == 5:
return 1
if tier == 4:
return 2
if tier==3:
return 3
if tier == 2:
return 4
if tier == 1:
return 5
def singlename(p):
"""Extracts single team city name from full team
"""
test = p.split()
if "Portland" in test or len(test)==2:
return test[0]
else:
return " ".join(test[:2])
def stringreplace(string):
"""Get rid of the 'at' in the schedule"""
return re.sub('at ',"",string).strip()
def schedulecleaner(year,team):
"""Functions to clean the ESPN schedule of NBA teams
and output as pandas dataframe
Parameters
-----------
year : int
Integer representing the year of the desired schedule
team : str
Trigram representing the three letter abbreviation of the NBA team
Returns
-----------
dataframe : pandas dataframe
Dataframe with data for NBA schedule of desired team
and desired year
"""
base = "http://www.espn.com/nba/teams/printSchedule?team={0}&season={1}".\
format(team,str(year))
schedule = pd.read_html(base)
schedule = schedule[0].iloc[2:,:3].reset_index(drop=True)
schedule.columns = ['Date','Team','Time']
schedule = schedule[schedule['Date']!="Date"]
dater = partial(datecheck,year=year)
schedule = schedule[['Team']].assign(Date=schedule.apply(dater,axis=1),\
home_status=np.where(schedule.Team.str.contains('at '),"A","H"),\
Team=schedule.Team.apply(stringreplace))
return schedule
def checker(team,frame):
"""Used to normalize the chance of home games."""
normalizer=frame.sum().max()
# return (np.all(teams[team].values==2))
return ((np.sum(frame[team].values)/normalizer))
def standingscleaner(year):
"""Function to pull and clean NBA standings data from ESPN."""
standings = pd.read_html('http://www.espn.com/nba/standings/_/season/{0}'.format(year))
# create columns for eastern/western conference teams
eastern = standings[0].assign(conference='Eastern')
western = standings[1].assign(conference='Western')
# concatenate the different conference tables
nba_stand_df = (pd.concat([eastern,western]).reset_index(drop=True)).assign(seasonYear=year)
# clean text column with team name and abbreviation
# regex to clean text in standings columns
r = re.compile('[A-Z]{2,}')
s = re.compile('[A-Z]{1,3}.*[ ]?([A-Z]{1})?([0-9]+)?[a-z]+')
"""
big cleaning job here. We use the regex above to create new columns
with the Team's name spelled out, the 3 letter all-caps abbreviation,
the city name, and finally the team's tier based on it's win/loss pct.
column.
"""
clean_nba_stand = nba_stand_df.iloc[:,1:].assign(\
teamName=nba_stand_df.iloc[:,0]\
.apply(lambda x: s.search(x).group() if\
s.search(x) else np.nan ),\
team_abbrev = nba_stand_df.iloc[:,0]\
.apply(lambda x: r.search(x).group() if \
r.search(x) else np.nan),\
cityName=nba_stand_df.iloc[:,0]\
.apply(lambda x: s.search(x).group() if\
s.search(x) else np.nan ).apply(singlename),\
Tier=pd.Series((ps.Fisher_Jenks(nba_stand_df.PCT,k=5)).yb+1)\
.apply(tierclean))
# Now we reorder the columns
clean_nba_stand = clean_nba_stand[clean_nba_stand.columns[-2:].tolist()+\
clean_nba_stand.columns[-6:-2].tolist()+\
clean_nba_stand.columns[:-6].tolist()]
replacements = {
'teamName': {
r'(Seattle SuperSonics)': 'Oklahoma City Thunder',
r'New Jersey Nets': 'Brooklyn Nets',
r'Los Angeles Clippers':'LA Clippers',
r'Charlotte Bobcats':'Charlotte Hornets',
r'New Orleans Hornets':'New Orleans Pelicans',
r'NO/Oklahoma City Hornets':'New Orleans Pelicans'},
'team_abbrev':{
r'SEA':'OKC',
r'NJ':'BKN',
r'LAC':'LA'
},
'cityName':{
r'Seattle':'Oklahoma City',
r'New Jersey':'Brooklyn',
r'NO/Oklahoma City':'New Orleans',
r'Oklahoma$':'Oklahoma City',
}
}
clean_nba_stand.replace(replacements, regex=True,inplace=True)
clean_nba_stand.ix[clean_nba_stand.team_abbrev == 'LA', 'cityName'] = 'LA'
return clean_nba_stand
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment