@bendominguez0111
Last active January 26, 2023 21:58
Script to web scrape Fantasy Football data from pro-football-reference.com
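Usage: run the script and enter a season year at the prompt; it scrapes passing, rushing, receiving, and fumble stats for weeks 1-17 of that season. Pass --save on the command line to write each week to web_scraped_data/<year>/week<week>.csv; otherwise a preview of the merged table is printed. A minimal invocation, assuming the file is saved as scrape_pfr.py (the filename is hypothetical):

python scrape_pfr.py --save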
from bs4 import BeautifulSoup
import pandas as pd
import requests
from sys import argv
import os
pd.options.mode.chained_assignment = None
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_rows', 500)
passing_url = """
https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min={year}&year_max={year}&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99&week_num_min={week}&week_num_max={week}&game_day_of_week=&game_location=&game_result=&handedness=&is_active=&is_hof=&c1stat=pass_att&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val=&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val=&order_by=pass_rating&from_link=1&offset={offset}
"""
receiving_url = """
https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min={year}&year_max={year}&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99&week_num_min={week}&week_num_max={week}&game_day_of_week=&game_location=&game_result=&handedness=&is_active=&is_hof=&c1stat=rec&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val=&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val=&order_by=rec_yds&from_link=1&offset={offset}
"""
rushing_url = """
https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min={year}&year_max={year}&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99&week_num_min={week}&week_num_max={week}&game_day_of_week=&game_location=&game_result=&handedness=&is_active=&is_hof=&c1stat=rush_att&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val=&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val=&order_by=rush_yds&from_link=1&offset={offset}
"""
"""
https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min=2019&year_max=2019&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99&week_num_min=1&week_num_max=1&game_day_of_week=&game_location=&game_result=&handedness=&is_active=&is_hof=&c1stat=rush_att&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val=&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val=&order_by=rush_yds&from_link=1&offset=0
"""
fumbles_url = """
https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min={year}&year_max={year}&season_start=1&season_end=-1&pos%5B%5D=QB&pos%5B%5D=WR&pos%5B%5D=RB&pos%5B%5D=TE&pos%5B%5D=OL&pos%5B%5D=DL&pos%5B%5D=LB&pos%5B%5D=DB&is_starter=E&game_type=R&game_num_min=0&game_num_max=99&week_num_min={week}&week_num_max={week}&c1stat=fumbles&c1comp=gt&c1val=1&c5val=1.0&order_by=player
"""
urls = {
    'passing': passing_url,
    'rushing': rushing_url,
    'receiving': receiving_url,
    'fumbles': fumbles_url
}
def grab_df_from_url(url, offset, year, week):
    # strip the newlines that the triple-quoted templates carry before requesting
    r = requests.get(url.format(year=year, week=week, offset=offset).strip())
    soup = BeautifulSoup(r.content, 'html.parser')
    table = soup.find('table', {'id': 'results'})
    df = pd.read_html(str(table))[0]
    # the results table has a two-level header; keep only the lower level
    df.columns = df.columns.droplevel(level=0)
    df.drop(['Result', 'Week', 'G#', 'Opp', 'Unnamed: 7_level_1', 'Age', 'Rk', 'Lg', 'Date', 'Day'], axis=1, inplace=True)
    # drop the header rows that the table body repeats
    df = df[df['Pos'] != 'Pos']
    return df
def create_category_df(key, year, week):
    url = urls[key]
    num_offsets = 0
    df = grab_df_from_url(url, 0, year, week)
    # df.shape[0] is the number of rows in the DataFrame.
    # Each page holds at most 100 rows, so keep fetching while we still get a full set of rows:
    # if, for example, df.shape[0] == 232 and we have 2 offsets, then 232 - 2*100 = 32,
    # and we know we have reached the end of the data.
    if df.shape[0] == 100:
        while df.shape[0] - num_offsets*100 == 100:
            num_offsets = num_offsets + 1
            next_df = grab_df_from_url(url, num_offsets*100, year, week)
            # guard against a week with an exact multiple of 100 rows
            # (for example, exactly 200 pass catchers), which would yield an empty page
            if next_df.shape[0] == 0:
                break
            df = pd.concat([df, next_df])
    return df
def format_df(df, key):
    df.set_index(['Player', 'Pos', 'Tm'], inplace=True)
    # prefix the stat columns shared across categories (Yds, TD, Att)
    # so the outer joins in main() don't collide
    if key == 'passing':
        df = df[['Yds', 'TD', 'Int', 'Att', 'Cmp']]
        df.rename({'Yds': 'PassingYds', 'Att': 'PassingAtt', 'Y/A': 'Y/PassingAtt', 'TD': 'PassingTD'}, axis=1, inplace=True)
    elif key == 'receiving':
        df = df[['Rec', 'Tgt', 'Yds', 'TD']]
        df.rename({'Yds': 'ReceivingYds', 'TD': 'ReceivingTD'}, axis=1, inplace=True)
    elif key == 'rushing':
        df.drop('Y/A', axis=1, inplace=True)
        df.rename({'Att': 'RushingAtt', 'Yds': 'RushingYds', 'TD': 'RushingTD'}, axis=1, inplace=True)
    elif key == 'fumbles':
        df = df[['FL']]
    return df
def main(year, week):
    # create a DataFrame for each stat category, collect them in a list,
    # and outer join all of them on the (Player, Pos, Tm) index
    dfs = []
    for key in urls.keys():
        df = create_category_df(key, year, week)
        df = format_df(df, key)
        dfs.append(df)
    df = dfs[0]
    for next_df in dfs[1:]:
        df = df.merge(next_df, on=['Player', 'Pos', 'Tm'], how='outer')
    # players absent from a category (e.g. no carries) get 0 instead of NaN
    df.fillna(0, inplace=True)
    if '--save' in argv:
        dirname = 'web_scraped_data'
        dir_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), dirname)
        if not os.path.exists(dir_path):
            os.mkdir(dir_path)
        year_path = os.path.join(dir_path, str(year))
        if not os.path.exists(year_path):
            os.mkdir(year_path)
        # write inside the directory we just created; a cwd-relative path
        # could miss it if the script is run from another directory
        df.to_csv(os.path.join(year_path, 'week{}.csv'.format(week)))
    else:
        print(df.head())
year = input('Input a year: ')
for week in range(1, 18):
    main(year, week)
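# A minimal sketch of reading a saved week back into pandas (assumes the
# script was run with --save for 2019; the path mirrors the to_csv call above):
#   weekly = pd.read_csv('web_scraped_data/2019/week1.csv')
#   print(weekly.sort_values(by='PassingYds', ascending=False).head())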