Last active
January 26, 2023 21:58
-
-
Save bendominguez0111/c14d85711755010e0a12561f4de434a0 to your computer and use it in GitHub Desktop.
Script to web scrape Fantasy Football data from pro-football-reference.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape weekly Fantasy Football game logs from pro-football-reference.com
# and merge passing/rushing/receiving/fumble stats into one CSV per week.
from bs4 import BeautifulSoup
import pandas as pd
import requests
from sys import argv
import os
# Silence SettingWithCopyWarning: frames are sliced and then mutated below.
pd.options.mode.chained_assignment = None
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_rows', 500)
passing_url = """ | |
https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min={year}&year_max={year}&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99&week_num_min={week}&week_num_max={week}&game_day_of_week=&game_location=&game_result=&handedness=&is_active=&is_hof=&c1stat=pass_att&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val=&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val=&order_by=pass_rating&from_link=1&offset={offset} | |
""" | |
receiving_url = """ | |
https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min={year}&year_max={year}&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99&week_num_min={week}&week_num_max={week}&game_day_of_week=&game_location=&game_result=&handedness=&is_active=&is_hof=&c1stat=rec&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val=&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val=&order_by=rec_yds&from_link=1&offset={offset} | |
""" | |
rushing_url = """ | |
https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min={year}&year_max={year}&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99&week_num_min={week}&week_num_max={week}&game_day_of_week=&game_location=&game_result=&handedness=&is_active=&is_hof=&c1stat=rush_att&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val=&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val=&order_by=rush_yds&from_link=1&offset={offset} | |
""" | |
""" | |
https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min=2019&year_max=2019&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99&week_num_min=1&week_num_max=1&game_day_of_week=&game_location=&game_result=&handedness=&is_active=&is_hof=&c1stat=rush_att&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val=&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val=&order_by=rush_yds&from_link=1&offset=0 | |
""" | |
fumbles_url = """ | |
https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min={year}&year_max={year}&season_start=1&season_end=-1&pos%5B%5D=QB&pos%5B%5D=WR&pos%5B%5D=RB&pos%5B%5D=TE&pos%5B%5D=OL&pos%5B%5D=DL&pos%5B%5D=LB&pos%5B%5D=DB&is_starter=E&game_type=R&game_num_min=0&game_num_max=99&week_num_min={week}&week_num_max={week}&c1stat=fumbles&c1comp=gt&c1val=1&c5val=1.0&order_by=player | |
""" | |
urls = { | |
'passing': passing_url, | |
'rushing': rushing_url, | |
'receiving': receiving_url, | |
'fumbles': fumbles_url | |
} | |
def grab_df_from_url(url, offset, year, week):
    """Fetch one page of a Player Game Finder query as a DataFrame.

    Parameters
    ----------
    url : str
        URL template containing {year}, {week} and (usually) {offset}.
    offset : int
        Pagination offset; the site serves at most 100 rows per page.
    year, week : int or str
        Season and week substituted into the template.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status (the original parsed
        error pages silently and failed later with a confusing message).
    ValueError
        If the page contains no 'results' table.
    """
    # Timeout so a hung connection doesn't stall the whole scrape.
    r = requests.get(url.format(year=year, week=week, offset=offset), timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')
    table = soup.find('table', {'id': 'results'})
    if table is None:
        # Explicit error instead of the cryptic failure read_html(str(None))
        # would produce.
        raise ValueError('no results table found at offset {}'.format(offset))
    df = pd.read_html(str(table))[0]
    # The results table has a two-level header; keep only the bottom level.
    df.columns = df.columns.droplevel(level=0)
    # Drop game-metadata columns that are not fantasy-relevant.
    df.drop(['Result', 'Week', 'G#', 'Opp', 'Unnamed: 7_level_1', 'Age',
             'Rk', 'Lg', 'Date', 'Day'], axis=1, inplace=True)
    # The site repeats the header row inside the body; filter those out.
    df = df[df['Pos'] != 'Pos']
    return df
def create_category_df(key, year, week):
    """Download every result page for one stat category and stack them.

    The site paginates at 100 rows per page, so we keep requesting the next
    offset while the most recent page came back full. If the total row count
    is an exact multiple of 100, the page after the last one is empty; the
    empty-chunk check handles that case.
    """
    template = urls[key]
    pages = [grab_df_from_url(template, 0, year, week)]
    page_no = 0
    while pages[-1].shape[0] == 100:
        page_no += 1
        chunk = grab_df_from_url(template, page_no * 100, year, week)
        if chunk.shape[0] == 0:
            break
        pages.append(chunk)
    return pd.concat(pages) if len(pages) > 1 else pages[0]
def format_df(df, key):
    """Select and rename the fantasy-relevant columns for one stat category.

    Parameters
    ----------
    df : pd.DataFrame
        Raw category frame with 'Player', 'Pos', 'Tm' columns plus the raw
        stat columns for that category.
    key : str
        One of 'passing', 'receiving', 'rushing', 'fumbles'.

    Returns
    -------
    pd.DataFrame
        New frame indexed by (Player, Pos, Tm) with renamed stat columns.
        Unlike the original, the caller's frame is NOT mutated (the original
        used set_index(..., inplace=True), which is why chained-assignment
        warnings had to be silenced globally).
    """
    df = df.set_index(['Player', 'Pos', 'Tm'])
    if key == 'passing':
        df = df[['Yds', 'TD', 'Int', 'Att', 'Cmp']]
        # 'Y/A' was already excluded by the selection above, so the dead
        # 'Y/A' entry from the original rename map is removed.
        df = df.rename({'Yds': 'PassingYds', 'Att': 'PassingAtt',
                        'TD': 'PassingTD'}, axis=1)
    elif key == 'receiving':
        df = df[['Rec', 'Tgt', 'Yds', 'TD']]
        df = df.rename({'Yds': 'ReceivingYds', 'TD': 'ReceivingTD'}, axis=1)
    elif key == 'rushing':
        # Yards-per-attempt is derivable; drop it rather than carry it.
        df = df.drop('Y/A', axis=1)
        df = df.rename({'Att': 'RushingAtt', 'Yds': 'RushingYds',
                        'TD': 'RushingTD'}, axis=1)
    elif key == 'fumbles':
        df = df[['FL']]
    return df
def main(year, week):
    """Scrape one week of stats, outer-join all categories, save or print.

    With '--save' on the command line, writes
    web_scraped_data/<year>/week<week>.csv next to this script; otherwise
    prints a preview of the merged frame.
    """
    # One formatted DataFrame per stat category, each indexed by
    # (Player, Pos, Tm), then outer-joined so every player appears once.
    dfs = [format_df(create_category_df(key, year, week), key)
           for key in urls]
    df = dfs[0]
    for next_df in dfs[1:]:
        df = df.merge(next_df, on=['Player', 'Pos', 'Tm'], how='outer')
    # Players missing a category (e.g. a QB with no receptions) get zeros.
    df.fillna(0, inplace=True)
    if '--save' in argv:
        year_path = os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            'web_scraped_data', str(year))
        # makedirs creates both directory levels in one call and tolerates
        # pre-existing dirs, replacing two racy exists()/mkdir() pairs.
        os.makedirs(year_path, exist_ok=True)
        # BUG FIX: the original created directories relative to the script
        # but wrote the CSV to a CWD-relative path, so running from another
        # directory failed (or scattered files). Write into year_path.
        df.to_csv(os.path.join(year_path, 'week{}.csv'.format(week)))
    else:
        print(df.head())
if __name__ == '__main__':
    # Guarded entry point so importing this module does not trigger a scrape.
    # Scrapes every week of the requested season (weeks 1-17 inclusive).
    year = input('input a year ')
    for week in range(1, 18):
        main(year, week)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment