Last active
September 12, 2021 19:35
-
-
Save MickeyPvX/1504cc675b9339ade937cc578b8c8d7c to your computer and use it in GitHub Desktop.
Python class to pull ESPN Fantasy Football data into a pandas DataFrame
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import re | |
from urllib.request import urlopen | |
from bs4 import BeautifulSoup as soup | |
from datetime import datetime as dt | |
class ESPN_FFL(object):
    """Helper class for pulling ESPN Fantasy Football stats for a given season.

    ***WILL ONLY WORK ON PUBLIC LEAGUES***

    Attributes:
        league_id (int): ID of the league for which to pull data
        url_base (str): Base URL for scraping the ESPN Fantasy Football site
        pos_map (dict): Mapping dictionary of player positions vs their slotCategoryId
        ty_file (pandas.DataFrame): Data so far this season, loaded from file (empty if no file given)
        ly_file (pandas.DataFrame): Data from last year's season, loaded from file (empty if no file given)
        web_data (pandas.DataFrame): Current data stored from the ESPN site
        text_pattern: Compiled regex of characters to clean from player names

    Methods:
        get_web_data: Scrapes the ESPN Fantasy Football site for player points; stores in self.web_data
        get_file: Loads data from an Excel file (must be in the same format as web_data)
    """

    def __init__(self, league_id, ty_file=None, ly_file=None, reverse=False):
        self.league_id = league_id
        # Placeholders: leagueId, scoringPeriodId (week), seasonId, slotCategoryId, startIndex (paging).
        self.url_base = 'http://games.espn.com/ffl/leaders?leagueId={}&scoringPeriodId={}&seasonId={}&slotCategoryId={}&startIndex={}'
        self.pos_map = {0: 'QB', 2: 'RB', 4: 'WR', 6: 'TE', 16: 'D_ST', 17: 'K'}
        self.ty_file = self.get_file(ty_file, reverse=reverse)
        self.ly_file = self.get_file(ly_file, reverse=reverse)
        self.web_data = pd.DataFrame()
        # Strips any character that is not a word character or whitespace (plus underscores),
        # e.g. the apostrophe in "D'Onta" or the periods in "T.J.".
        self.text_pattern = re.compile(r'([^\s\w]|_)+', re.UNICODE)

    def get_web_data(self, weeks=None, positions=None, season=None, owner=None, clear=True, reverse=False):
        """Gets data from the ESPN site for the weeks and positions requested.

        Args:
            weeks (int/list): int pulls that many weeks starting from week 1 (or from the
                week after the latest week in ty_file); a list pulls exactly those weeks
            positions (list[str]/None): Player position labels to pull (e.g. ['QB', 'RB']);
                None pulls every position in pos_map
            season (int/None): Year of the season to pull; defaults to the current year
            owner (str/None): None drops ownership info; 'all' keeps the owner column for
                every week; 'last' keeps it only for the final week pulled; 'owners-only'
                keeps ownership but no points
            clear (bool): When True, clears current web_data before loading from the site
            reverse (bool): If True, orders WEEK columns from most recent to least recent

        Raises:
            TypeError: weeks/positions/season of the wrong type
            ValueError: weeks out of range, or an unrecognized owner value
        """
        if clear:
            self.web_data = pd.DataFrame()

        # ty_week is the NEXT week to fetch when resuming from a file (0 = fresh pull).
        ty_week = 0
        ty_weeks = []
        if not self.ty_file.empty:
            ty_weeks = [int(col[col.find('_') + 1:]) for col in self.ty_file.columns if 'WEEK_' in col]
            ty_week = max(ty_weeks) + 1
            self.web_data = self.ty_file.copy()

        if isinstance(weeks, int):
            start_week = max(ty_week, 1)
            # NFL fantasy regular season runs through week 17.
            if start_week + weeks - 1 > 17:
                if ty_week > 0:
                    raise ValueError('Last week is week {}; weeks value must be {} or less.'.format(ty_week - 1, str(18 - ty_week)))
                raise ValueError('Number of weeks must be less than 18')
            # BUGFIX: the original used range(start, ty_week + weeks + 1), which fetched
            # one extra week whenever data had been resumed from ty_file.
            rngloop = range(start_week, start_week + weeks)
        elif isinstance(weeks, list):
            # Keep only valid week numbers that are not already present in ty_file.
            rngloop = [int(x) for x in weeks if 1 <= x <= 17 and x not in ty_weeks]
        else:
            raise TypeError('Weeks must be provided as int or list')

        if not isinstance(positions, (list, type(None))):
            raise TypeError('Positions must be a list of strings or None')
        if isinstance(positions, list):
            if not all(isinstance(x, str) for x in positions):
                raise TypeError('All elements of positions list must be strings')
            # Translate requested labels (e.g. 'QB') into slotCategoryIds for the URL.
            positions = [slot for slot in self.pos_map.keys() if self.pos_map[slot].upper() in positions]
        else:
            positions = list(self.pos_map.keys())

        if season is None:
            season = dt.now().year
        elif not isinstance(season, int):
            raise TypeError('Season must be an integer year e.g. 2018')

        # Validate owner up front instead of raising mid-scrape (the original raised
        # only after the first batch of pages had already been downloaded).
        if owner not in (None, 'all', 'last', 'owners-only'):
            raise ValueError("'owner' must be one of: None, 'all', 'last', 'owners-only'")
        final_week = max(rngloop) if rngloop else None

        for week in rngloop:
            dfWork = pd.DataFrame()
            for slot in positions:
                print("Getting {} Week {} {}'s".format(str(season), str(week), self.pos_map[slot]))
                # Ownership ('TYPE') is kept only where the owner mode requests it.
                if owner == 'all' or (owner == 'last' and week == final_week):
                    cols = ['PLAYER', 'POS', 'TYPE', 'PTS']
                elif owner == 'owners-only':
                    cols = ['PLAYER', 'POS', 'TYPE']
                else:
                    cols = ['PLAYER', 'POS', 'PTS']
                for page in range(0, 500, 50):
                    url = self.url_base.format(str(self.league_id), str(week), str(season), str(slot), str(page))
                    # Scrape the HTML at this URL
                    req = urlopen(url)
                    hot_soup = soup(req, 'html.parser')
                    data_rows = hot_soup.findAll('tr')[2:]  # Skip header row and spacers
                    # Pull Data: one list of cell texts per table row.
                    pulldata = [[td.getText() for td in row.findAll(['td'])] for row in data_rows]
                    try:
                        dfTemp = pd.DataFrame(pulldata[1:], columns=pulldata[0])[['PLAYER, TEAM POS', 'PTS', 'TYPE']]
                    except AssertionError:
                        # Rows for players on bye are one cell short; pad them and retry.
                        for row in range(1, len(pulldata)):
                            pulldata[row].insert(5, 'BYE_WEEK')
                        dfTemp = pd.DataFrame(pulldata[1:], columns=pulldata[0])[['PLAYER, TEAM POS', 'PTS', 'TYPE']]
                    except IndexError:
                        # Empty page (past the last player for this slot); skip it.
                        continue
                    # 'Player Name, TEAM POS' -> cleaned 'Player Name'.
                    dfTemp['PLAYER'] = (dfTemp['PLAYER, TEAM POS'].str.split(',').str[:1]
                                        .apply(lambda x: ' '.join(x))
                                        .apply(lambda x: self.text_pattern.sub('', x)))
                    dfTemp['POS'] = self.pos_map[slot]
                    dfTemp = dfTemp[cols].set_index(['PLAYER', 'POS'])
                    dfTemp.rename(mapper={'PTS': 'WEEK_{}'.format(week), 'TYPE': 'OWNER'}, axis=1, inplace=True)
                    # DataFrame.append was removed in pandas 2.0; concat is the supported form.
                    dfWork = pd.concat([dfWork, dfTemp])
            dfWork = dfWork[~dfWork.index.duplicated(keep='first')]
            if not self.web_data.empty:
                # suffixes must be strings; the original passed the int week.
                self.web_data = self.web_data.merge(dfWork, how='outer', on=['PLAYER', 'POS'],
                                                    suffixes=('', str(week)))
            else:
                self.web_data = dfWork.copy()

        numcols = [col for col in self.web_data.columns if 'WEEK' in col]
        self.web_data[numcols] = self.web_data[numcols].apply(lambda x: pd.to_numeric(x, errors='coerce'))
        # BUGFIX: dropna returns a new frame; the original discarded the result (no-op).
        self.web_data = self.web_data.dropna(how='all')
        if owner == 'last':
            self.web_data.set_index(['OWNER'], append=True, inplace=True)
        # Order WEEK columns numerically (WEEK_2 before WEEK_10).
        self.web_data = self.web_data[self._natural_sort(self.web_data.columns)]
        if reverse:
            self.web_data = self.web_data[self.web_data.columns[::-1]]

    def get_file(self, filepath, reverse=False):
        """Pulls in data from an Excel file at the specified file path.

        Args:
            filepath (str/None): File path of the data file; None returns an empty DataFrame
            reverse (bool): Same as reverse in class __init__

        Returns:
            pandas.DataFrame: Imported data indexed by ('PLAYER', 'POS')
        """
        if filepath is None:
            return pd.DataFrame()
        dfImport = pd.read_excel(filepath)
        dfImport.set_index(['PLAYER', 'POS'], inplace=True)
        if reverse:
            dfImport = dfImport[dfImport.columns[::-1]]
        return dfImport

    def _natural_sort(self, sort_list):
        """Overrides ASCII sorting in favor of a natural sorting order.

        Credit: https://blog.codinghorror.com/sorting-for-humans-natural-sort-order/

        Args:
            sort_list (list): List to be sorted naturally

        Returns:
            list: New list sorted with embedded numbers compared numerically
        """
        def convert(text):
            # Digit runs compare as ints so 'WEEK_10' sorts after 'WEEK_2'.
            return int(text) if text.isdigit() else text

        return sorted(sort_list, key=lambda key: [convert(c) for c in re.split('([0-9]+)', key)])
def main():
    """Prompt for a public-league ID and print a small sample of scraped data."""
    league_id = int(input('Enter league ID: '))
    ffl_test = ESPN_FFL(league_id=league_id)
    # Pull the first two weeks of QB scores, tagging each player's current owner.
    ffl_test.get_web_data(weeks=2, positions=['QB'], owner='last')
    print(ffl_test.web_data.head())


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Under construction! We still need to scrape for free-agency player data, | |
but this will make getting other data much faster. """ | |
import requests | |
espn_api_base = 'http://games.espn.com/ffl/api/v2/{}' | |
slots = {0: 'QB', 2: 'RB', 4: 'WR', 6: 'TE', 16: 'D/ST', 17: 'K', 20: 'BE', 23: 'FLEX'} | |
league_id = int(input('Enter LeagueID: ')) | |
# Get league team info | |
req = requests.get(espn_api_base.format('leagueSettings'), | |
params={'leagueId': league_id, 'seasonId': 2018}) | |
team_info = req.json()['leaguesettings']['teams'] | |
team_info = {team_id: {'name': team_info[team_id]['teamAbbrev'], | |
'wins': team_info[team_id]['record']['overallWins'], | |
'losses': team_info[team_id]['record']['overallLosses'], | |
'pts_for': team_info[team_id]['record']['pointsFor'], | |
'pts_against': team_info[team_id]['record']['pointsAgainst']} for team_id in team_info} | |
# Get player info | |
plyr_req = requests.get(espn_api_base.format('playerInfo'), | |
params={'leagueId': league_id, 'seasonId': 2018}) | |
plyr_info = plyr_req.json()['playerInfo']['players'] | |
# Get Points from players on rosters | |
player_pts = {} | |
for week in range(1, 10): | |
player_pts['Week_{}'.format(week)] = {} | |
for team_id in team_info.keys(): | |
pts_req = requests.get(espn_api_base.format('boxscore'), | |
params={'leagueId': league_id, 'seasonId': 2018, | |
'teamId': team_id, 'matchupPeriodId': week}) | |
player_pts['Week_{}'.format(week)][team_info[team_id]['name']] = pts_req.json()['boxscore']['teams'][0]['slots'] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment