Skip to content

Instantly share code, notes, and snippets.

@MickeyPvX
Last active September 12, 2021 19:35
Show Gist options
  • Save MickeyPvX/1504cc675b9339ade937cc578b8c8d7c to your computer and use it in GitHub Desktop.
Python class to pull ESPN Fantasy Football data into a pandas DataFrame
import pandas as pd
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
from datetime import datetime as dt
class ESPN_FFL(object):
    """Helper class for pulling ESPN Fantasy Football stats for a given season.

    ***WILL ONLY WORK ON PUBLIC LEAGUES***

    Attributes:
        league_id (int): ID of the league for which to pull data
        url_base (str): Base URL template for scraping the ESPN Fantasy Football site
        pos_map (dict): Mapping of slotCategoryId -> position label
        ty_file (pandas.DataFrame): Data so far this season, loaded from file (empty if none given)
        ly_file (pandas.DataFrame): Data from last year's season, loaded from file (empty if none given)
        web_data (pandas.DataFrame): Current data pulled from the ESPN site
        text_pattern: Compiled regex of punctuation runs to strip from player names

    Methods:
        get_web_data: Scrapes the ESPN Fantasy Football site for player points; stores in self.web_data
        get_file: Loads data from an Excel file (must be in same format as web_data)
    """

    def __init__(self, league_id, ty_file=None, ly_file=None, reverse=False):
        """Set up the scraper for one league.

        Args:
            league_id (int): ID of the (public) league to pull data for
            ty_file (str/None): Path to an Excel file with this year's data so far
            ly_file (str/None): Path to an Excel file with last year's data
            reverse (bool): If True, week columns are ordered most-recent first
        """
        self.league_id = league_id
        self.url_base = 'http://games.espn.com/ffl/leaders?leagueId={}&scoringPeriodId={}&seasonId={}&slotCategoryId={}&startIndex={}'
        self.pos_map = {0: 'QB', 2: 'RB', 4: 'WR', 6: 'TE', 16: 'D_ST', 17: 'K'}
        self.ty_file = self.get_file(ty_file, reverse=reverse)
        self.ly_file = self.get_file(ly_file, reverse=reverse)
        self.web_data = pd.DataFrame()
        # Matches any run of punctuation (anything that is neither whitespace nor a
        # word char) plus underscores; used to clean scraped player names.
        self.text_pattern = re.compile(r'([^\s\w]|_)+', re.UNICODE)

    def get_web_data(self, weeks=None, positions=None, season=None, owner=None, clear=True, reverse=False):
        """Gets data from the ESPN site for the weeks and positions requested.

        Args:
            weeks (int/list): int pulls that many weeks starting from week 1 (or from the
                latest week present in ty_file); a list pulls exactly those week numbers
            positions (list[str]/None): Position labels to pull (e.g. ['QB', 'RB']); None pulls all
            season (int/None): Year of the season to pull; defaults to the current year
            owner (str/None): None for points only; 'all' keeps the owner column every week;
                'last' keeps it only for the final week pulled; 'owners-only' drops points
            clear (bool): When True, clears current web_data before loading from the site
            reverse (bool): If True, week columns are ordered most-recent first

        Raises:
            ValueError: requested weeks run past week 17, or `owner` is unrecognized
            TypeError: `weeks`, `positions`, or `season` has the wrong type
        """
        if clear:
            self.web_data = pd.DataFrame()
        ty_week = 0
        ty_weeks = []
        if not self.ty_file.empty:
            # Weeks already on file: resume scraping at the week after the latest one.
            ty_weeks = [int(col[col.find('_') + 1:]) for col in self.ty_file.columns if 'WEEK_' in col]
            ty_week = max(ty_weeks) + 1
            self.web_data = self.ty_file.copy()
        if isinstance(weeks, int):
            # 17 weeks = NFL regular season length (pre-2021 format).
            if sum([ty_week, weeks]) > 17:
                if ty_week > 0:
                    raise ValueError('Last week is week {}; weeks value must be {} or less.'.format(ty_week, str(17 - ty_week)))
                else:
                    raise ValueError('Number of weeks must be less than 18')
            else:
                rngloop = range(max([ty_week, 1]), sum([ty_week, weeks]) + 1)
        elif isinstance(weeks, list):
            # Keep only valid week numbers we don't already have on file.
            rngloop = [int(x) for x in weeks if 1 <= x <= 17 and x not in ty_weeks]
        else:
            raise TypeError('Weeks must be provided as int or list')
        if not isinstance(positions, (list, type(None))):
            raise TypeError('Positions must be a list of strings or None')
        elif isinstance(positions, list):
            if all(isinstance(x, str) for x in positions):
                # Translate position labels back to ESPN slotCategoryIds.
                positions = [pos for pos in self.pos_map.keys() if self.pos_map[pos].upper() in positions]
            else:
                raise TypeError('All elements of positions list must be strings')
        else:
            # FIX: materialize as a list rather than leaving a dict_keys view.
            positions = list(self.pos_map.keys())
        if season is None:
            season = dt.now().year
        elif not isinstance(season, int):
            raise TypeError('Season must be an integer year e.g. 2018')
        for week in rngloop:
            dfWork = pd.DataFrame()
            for slot in positions:
                print("Getting {} Week {} {}'s".format(str(season), str(week), self.pos_map[slot]))
                # ESPN pages results 50 at a time; 500 caps the players fetched per slot.
                for page in range(0, 500, 50):
                    url = self.url_base.format(str(self.league_id), str(week), str(season), str(slot), str(page))
                    # Scrape the HTML at this URL
                    req = urlopen(url)
                    hot_soup = soup(req, 'html.parser')
                    data_rows = hot_soup.findAll('tr')[2:]  # Skip header row and spacers
                    # Pull Data
                    pulldata = [[td.getText() for td in data_rows[i].findAll(['td'])]
                                for i in range(len(data_rows))]
                    try:
                        dfTemp = pd.DataFrame(pulldata[1:], columns=pulldata[0])[['PLAYER, TEAM POS', 'PTS', 'TYPE']]
                    except AssertionError:
                        # Bye-week rows are one cell short; pad them so columns line up.
                        for row in range(1, len(pulldata)):
                            pulldata[row].insert(5, 'BYE_WEEK')
                        dfTemp = pd.DataFrame(pulldata[1:], columns=pulldata[0])[['PLAYER, TEAM POS', 'PTS', 'TYPE']]
                    except IndexError:
                        # No rows on this page: past the end of the listing.
                        continue
                    # 'Name, TEAM POS' -> 'Name', then strip punctuation (e.g. apostrophes).
                    dfTemp['PLAYER'] = dfTemp['PLAYER, TEAM POS'].str.split(',').str[:1].apply(lambda x: ' '.join(x)).apply(lambda x: self.text_pattern.sub('', x))
                    dfTemp['POS'] = self.pos_map[slot]
                    if owner is None:
                        cols = ['PLAYER', 'POS', 'PTS']
                    elif owner == 'all':
                        cols = ['PLAYER', 'POS', 'TYPE', 'PTS']
                    elif owner == 'last':
                        # Only keep the owner column for the final week being pulled.
                        final_week = max(rngloop)
                        if week == final_week:
                            cols = ['PLAYER', 'POS', 'TYPE', 'PTS']
                        else:
                            cols = ['PLAYER', 'POS', 'PTS']
                    elif owner == 'owners-only':
                        cols = ['PLAYER', 'POS', 'TYPE']
                    else:
                        raise ValueError("'owner' must be one of: None, 'all', 'last', 'owners-only'")
                    dfTemp = dfTemp[cols].set_index(['PLAYER', 'POS'])
                    dfTemp.rename(mapper={'PTS': 'WEEK_{}'.format(week), 'TYPE': 'OWNER'}, axis=1, inplace=True)
                    # FIX: DataFrame.append was removed in pandas 2.0; concat is equivalent.
                    dfWork = pd.concat([dfWork, dfTemp])
            # A player can appear in multiple slots (e.g. FLEX-eligible); keep first.
            dfWork = dfWork[~dfWork.index.duplicated(keep='first')]
            if not self.web_data.empty:
                # FIX: suffixes must be strings; the original passed the int week.
                self.web_data = self.web_data.merge(dfWork, how='outer', on=['PLAYER', 'POS'],
                                                    suffixes=('', str(week)))
            else:
                self.web_data = dfWork.copy()
        numcols = [col for col in self.web_data.columns if 'WEEK' in col]
        self.web_data[numcols] = self.web_data[numcols].apply(lambda x: pd.to_numeric(x, errors='coerce'))
        # FIX: dropna returns a new frame; the original discarded the result (no-op).
        self.web_data = self.web_data.dropna(how='all')
        if owner == 'last':
            self.web_data.set_index(['OWNER'], append=True, inplace=True)
        self.web_data = self.web_data[self._natural_sort(self.web_data.columns)]
        if reverse:
            self.web_data = self.web_data[self.web_data.columns[::-1]]

    def get_file(self, filepath, reverse=False):
        """Pulls in data from an Excel file at the specified file path.

        Args:
            filepath (str/None): File path of the data file; None returns an empty DataFrame
            reverse (bool): If True, column order is reversed after loading

        Returns:
            pandas.DataFrame: File contents indexed by (PLAYER, POS), or empty if no path.
        """
        if filepath is None:
            return pd.DataFrame()
        else:
            dfImport = pd.read_excel(filepath)
            dfImport.set_index(['PLAYER', 'POS'], inplace=True)
            if reverse:
                dfImport = dfImport[dfImport.columns[::-1]]
            return dfImport

    def _natural_sort(self, sort_list):
        """Overrides ASCII sorting in favor of a natural sorting order
        (so WEEK_2 sorts before WEEK_10).
        Credit: https://blog.codinghorror.com/sorting-for-humans-natural-sort-order/

        Args:
            sort_list (list): List to be sorted naturally

        Returns:
            list: The elements sorted with embedded numbers compared numerically.
        """
        sort_list = list(sort_list)
        convert = lambda text: int(text) if text.isdigit() else text
        alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
        sort_list.sort(key=alphanum_key)
        return sort_list
def _demo():
    """Interactive smoke test: pull two weeks of QB points for a league and show them."""
    league = int(input('Enter league ID: '))
    client = ESPN_FFL(league_id=league)
    client.get_web_data(weeks=2, positions=['QB'], owner='last')
    print(client.web_data.head())


if __name__ == '__main__':
    _demo()
""" Under construction! We still need to scrape for free-agency player data,
but this will make getting other data much faster. """
import requests
espn_api_base = 'http://games.espn.com/ffl/api/v2/{}'
slots = {0: 'QB', 2: 'RB', 4: 'WR', 6: 'TE', 16: 'D/ST', 17: 'K', 20: 'BE', 23: 'FLEX'}
league_id = int(input('Enter LeagueID: '))
# Get league team info
req = requests.get(espn_api_base.format('leagueSettings'),
params={'leagueId': league_id, 'seasonId': 2018})
team_info = req.json()['leaguesettings']['teams']
team_info = {team_id: {'name': team_info[team_id]['teamAbbrev'],
'wins': team_info[team_id]['record']['overallWins'],
'losses': team_info[team_id]['record']['overallLosses'],
'pts_for': team_info[team_id]['record']['pointsFor'],
'pts_against': team_info[team_id]['record']['pointsAgainst']} for team_id in team_info}
# Get player info
plyr_req = requests.get(espn_api_base.format('playerInfo'),
params={'leagueId': league_id, 'seasonId': 2018})
plyr_info = plyr_req.json()['playerInfo']['players']
# Get Points from players on rosters
player_pts = {}
for week in range(1, 10):
player_pts['Week_{}'.format(week)] = {}
for team_id in team_info.keys():
pts_req = requests.get(espn_api_base.format('boxscore'),
params={'leagueId': league_id, 'seasonId': 2018,
'teamId': team_id, 'matchupPeriodId': week})
player_pts['Week_{}'.format(week)][team_info[team_id]['name']] = pts_req.json()['boxscore']['teams'][0]['slots']
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment