Last active
January 26, 2023 21:58
-
-
Save bendominguez0111/c14d85711755010e0a12561f4de434a0 to your computer and use it in GitHub Desktop.
Script to web scrape Fantasy Football data from pro-football-reference.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape weekly Fantasy Football game logs from pro-football-reference.com
# and merge passing/rushing/receiving/fumble stats into one CSV per week.
from bs4 import BeautifulSoup
import pandas as pd
import requests
from sys import argv
import os
# Silence SettingWithCopyWarning: frames are sliced and then mutated below.
pd.options.mode.chained_assignment = None
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_rows', 500)
passing_url = """ | |
https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min={year}&year_max={year}&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99&week_num_min={week}&week_num_max={week}&game_day_of_week=&game_location=&game_result=&handedness=&is_active=&is_hof=&c1stat=pass_att&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val=&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val=&order_by=pass_rating&from_link=1&offset={offset} | |
""" | |
receiving_url = """ | |
https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min={year}&year_max={year}&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99&week_num_min={week}&week_num_max={week}&game_day_of_week=&game_location=&game_result=&handedness=&is_active=&is_hof=&c1stat=rec&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val=&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val=&order_by=rec_yds&from_link=1&offset={offset} | |
""" | |
rushing_url = """ | |
https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min={year}&year_max={year}&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99&week_num_min={week}&week_num_max={week}&game_day_of_week=&game_location=&game_result=&handedness=&is_active=&is_hof=&c1stat=rush_att&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val=&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val=&order_by=rush_yds&from_link=1&offset={offset} | |
""" | |
""" | |
https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min=2019&year_max=2019&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99&week_num_min=1&week_num_max=1&game_day_of_week=&game_location=&game_result=&handedness=&is_active=&is_hof=&c1stat=rush_att&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val=&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val=&order_by=rush_yds&from_link=1&offset=0 | |
""" | |
fumbles_url = """ | |
https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min={year}&year_max={year}&season_start=1&season_end=-1&pos%5B%5D=QB&pos%5B%5D=WR&pos%5B%5D=RB&pos%5B%5D=TE&pos%5B%5D=OL&pos%5B%5D=DL&pos%5B%5D=LB&pos%5B%5D=DB&is_starter=E&game_type=R&game_num_min=0&game_num_max=99&week_num_min={week}&week_num_max={week}&c1stat=fumbles&c1comp=gt&c1val=1&c5val=1.0&order_by=player | |
""" | |
urls = { | |
'passing': passing_url, | |
'rushing': rushing_url, | |
'receiving': receiving_url, | |
'fumbles': fumbles_url | |
} | |
def grab_df_from_url(url, offset, year, week):
    """Fetch one page of a Player Game Finder query as a DataFrame.

    Parameters
    ----------
    url : str
        URL template containing {year}, {week} and (usually) {offset}.
    offset : int
        Pagination offset; the site serves at most 100 rows per page.
    year, week : int or str
        Season and week substituted into the template.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status (the original parsed
        error pages silently and failed later with a confusing message).
    ValueError
        If the page contains no 'results' table.
    """
    # Timeout so a hung connection doesn't stall the whole scrape.
    r = requests.get(url.format(year=year, week=week, offset=offset), timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')
    table = soup.find('table', {'id': 'results'})
    if table is None:
        # Explicit error instead of the cryptic failure read_html(str(None))
        # would produce.
        raise ValueError('no results table found at offset {}'.format(offset))
    df = pd.read_html(str(table))[0]
    # The results table has a two-level header; keep only the bottom level.
    df.columns = df.columns.droplevel(level=0)
    # Drop game-metadata columns that are not fantasy-relevant.
    df.drop(['Result', 'Week', 'G#', 'Opp', 'Unnamed: 7_level_1', 'Age',
             'Rk', 'Lg', 'Date', 'Day'], axis=1, inplace=True)
    # The site repeats the header row inside the body; filter those out.
    df = df[df['Pos'] != 'Pos']
    return df
def create_category_df(key, year, week):
    """Download every result page for one stat category and stack them.

    The site paginates at 100 rows per page, so we keep requesting the next
    offset while the most recent page came back full. If the total row count
    is an exact multiple of 100, the page after the last one is empty; the
    empty-chunk check handles that case.
    """
    template = urls[key]
    pages = [grab_df_from_url(template, 0, year, week)]
    page_no = 0
    while pages[-1].shape[0] == 100:
        page_no += 1
        chunk = grab_df_from_url(template, page_no * 100, year, week)
        if chunk.shape[0] == 0:
            break
        pages.append(chunk)
    return pd.concat(pages) if len(pages) > 1 else pages[0]
def format_df(df, key):
    """Select and rename the fantasy-relevant columns for one stat category.

    Parameters
    ----------
    df : pd.DataFrame
        Raw category frame with 'Player', 'Pos', 'Tm' columns plus the raw
        stat columns for that category.
    key : str
        One of 'passing', 'receiving', 'rushing', 'fumbles'.

    Returns
    -------
    pd.DataFrame
        New frame indexed by (Player, Pos, Tm) with renamed stat columns.
        Unlike the original, the caller's frame is NOT mutated (the original
        used set_index(..., inplace=True), which is why chained-assignment
        warnings had to be silenced globally).
    """
    df = df.set_index(['Player', 'Pos', 'Tm'])
    if key == 'passing':
        df = df[['Yds', 'TD', 'Int', 'Att', 'Cmp']]
        # 'Y/A' was already excluded by the selection above, so the dead
        # 'Y/A' entry from the original rename map is removed.
        df = df.rename({'Yds': 'PassingYds', 'Att': 'PassingAtt',
                        'TD': 'PassingTD'}, axis=1)
    elif key == 'receiving':
        df = df[['Rec', 'Tgt', 'Yds', 'TD']]
        df = df.rename({'Yds': 'ReceivingYds', 'TD': 'ReceivingTD'}, axis=1)
    elif key == 'rushing':
        # Yards-per-attempt is derivable; drop it rather than carry it.
        df = df.drop('Y/A', axis=1)
        df = df.rename({'Att': 'RushingAtt', 'Yds': 'RushingYds',
                        'TD': 'RushingTD'}, axis=1)
    elif key == 'fumbles':
        df = df[['FL']]
    return df
def main(year, week):
    """Scrape one week of stats, outer-join all categories, save or print.

    With '--save' on the command line, writes
    web_scraped_data/<year>/week<week>.csv next to this script; otherwise
    prints a preview of the merged frame.
    """
    # One formatted DataFrame per stat category, each indexed by
    # (Player, Pos, Tm), then outer-joined so every player appears once.
    dfs = [format_df(create_category_df(key, year, week), key)
           for key in urls]
    df = dfs[0]
    for next_df in dfs[1:]:
        df = df.merge(next_df, on=['Player', 'Pos', 'Tm'], how='outer')
    # Players missing a category (e.g. a QB with no receptions) get zeros.
    df.fillna(0, inplace=True)
    if '--save' in argv:
        year_path = os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            'web_scraped_data', str(year))
        # makedirs creates both directory levels in one call and tolerates
        # pre-existing dirs, replacing two racy exists()/mkdir() pairs.
        os.makedirs(year_path, exist_ok=True)
        # BUG FIX: the original created directories relative to the script
        # but wrote the CSV to a CWD-relative path, so running from another
        # directory failed (or scattered files). Write into year_path.
        df.to_csv(os.path.join(year_path, 'week{}.csv'.format(week)))
    else:
        print(df.head())
if __name__ == '__main__':
    # Guarded entry point so importing this module does not trigger a scrape.
    # Scrapes every week of the requested season (weeks 1-17 inclusive).
    year = input('input a year ')
    for week in range(1, 18):
        main(year, week)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment