Skip to content

Instantly share code, notes, and snippets.

@sakamer71
Created April 18, 2022 22:31
Show Gist options
  • Save sakamer71/f3415a952b67b606ac36ebe43caca15a to your computer and use it in GitHub Desktop.
Save sakamer71/f3415a952b67b606ac36ebe43caca15a to your computer and use it in GitHub Desktop.
import requests
from bs4 import BeautifulSoup
from pprint import pprint
#import re
import pandas as pd
# Root of the stats site; player-page stubs from the index are appended to this.
baseurl = 'https://www.pro-football-reference.com'
# Season to scrape.
year = 2021
# Output CSV: fantasy stats merged with per-player bio data.
playerDataFile = f"playerData_{year}.csv"
#ffFile = 'fffile.csv'
def getPlayerInfo(name, url, stub):
    """Scrape one player's bio page and return his basic attributes.

    Args:
        name: Player display name (passed through into the result tuple).
        url: Site base URL, e.g. 'https://www.pro-football-reference.com'.
        stub: Player page path appended to ``url``,
            e.g. '/players/T/TaylJo02.htm'.

    Returns:
        Tuple ``(name, height, weight, team, position, birthDate, college)``.
        Any field whose markup is missing falls back to 'unknown'
        ('NA' for team and college), matching the original behavior.
    """
    url += stub
    # timeout so a dead connection cannot hang the whole scrape
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'html.parser')

    def _scrape(getter, fallback):
        # Evaluate one scraping expression. Missing markup surfaces as
        # AttributeError/TypeError/IndexError/KeyError; catching only those
        # (instead of a bare except) keeps real bugs and Ctrl-C visible.
        try:
            return getter()
        except (AttributeError, TypeError, IndexError, KeyError):
            return fallback

    # Position is embedded in the meta Description, e.g. "Position: QB, ..."
    position = _scrape(
        lambda: soup.find("meta", {"name": "Description"})["content"]
                    .split(',')[0].split(':')[-1].strip(),
        "unknown")
    height = _scrape(lambda: soup.find("span", itemprop="height").text.strip(), "unknown")
    weight = _scrape(lambda: soup.find("span", itemprop="weight").text.strip(), "unknown")
    team = _scrape(lambda: soup.find("span", itemprop="affiliation").text.strip(), "NA")
    birthDate = _scrape(lambda: soup.find("span", itemprop="birthDate").text.strip(), "unknown")
    # The college name is the second sibling node after the "College" label.
    college = _scrape(
        lambda: list(soup.find("strong", text="College").next_siblings)[1].text.strip(),
        "NA")
    print(name, height, weight, team, position, birthDate, college)
    return (name, height, weight, team, position, birthDate, college)
def renameColumns(df):
    """Relabel the scraped fantasy table with flat, readable column names.

    Mutates *df* in place (assigns ``df.columns``) and returns it, so it
    works both as an in-place rename and in assignment style.
    """
    identity = ['OverallRank', 'Name', 'Team', 'Position', 'Age',
                'Games', 'GamesStarted']
    passing = ['PassingCmp', 'PassingAtt', 'PassingYds', 'PassingTD',
               'PassingInt']
    rushing = ['RushingAtt', 'RushingYds', 'RushingYPA', 'RushingTD']
    receiving = ['ReceivingTgt', 'ReceivingRec', 'ReceivingYds',
                 'ReceivingYPR', 'ReceivingTD']
    misc = ['Fumbles', 'FumblesLost', 'ScoringTD', 'Scoring2PM',
            'Scoring2PP']
    fantasy = ['FantasyFantPt', 'FantasyPPR', 'FantasyDKPt', 'FantasyFDPt',
               'FantasyVBD', 'FantasyPosRank', 'FantasyOvRank']
    df.columns = identity + passing + rushing + receiving + misc + fantasy
    return df
def main():
    """Scrape the season fantasy table, enrich each row with player bio
    data, and write the merged result to ``playerDataFile``.
    """
    playerData = []
    url = baseurl + '/years/' + str(year) + '/fantasy.htm'
    # Fetch the page once and reuse the response for both the stats table
    # and the link scraping (the original fetched the same URL twice).
    r = requests.get(url, timeout=30)
    ff_df = pd.read_html(r.text)[0]
    ff_df = renameColumns(ff_df)
    print(ff_df.head(5))
    # Strip the '*' (Pro Bowl) and '+' (All-Pro) markers from player names.
    # regex=False pins literal matching: with regex=True the pattern '*'
    # is an invalid regular expression, and the pandas default has changed
    # across versions.
    ff_df['Name'] = (ff_df['Name']
                     .str.replace('*', '', regex=False)
                     .str.replace('+', '', regex=False))
    print(ff_df.head(5))
    print(ff_df.columns)
    soup = BeautifulSoup(r.content, 'html.parser')
    parsed_table = soup.find_all('table')[0]
    # Skip the two header rows, then pull each player's name and page stub.
    # (The original enumerate index was unused and later shadowed.)
    for row in parsed_table.find_all('tr')[2:]:
        dat = row.find('td', attrs={'data-stat': 'player'})
        if dat:
            name = dat.a.get_text()
            stub = dat.a.get('href')
            playerData.append(getPlayerInfo(name, baseurl, stub))
    df = pd.DataFrame(
        playerData,
        columns=["Name", "Height", "Weight", "Team", "Position",
                 "BirthDate", "College"])
    # Left-join bio data onto the stats; duplicate join columns get the
    # '_remove' suffix and are dropped below.
    merged = pd.merge(df, ff_df, how='left', on='Name',
                      suffixes=('', '_remove'))
    merged['Height'] = merged['Height'].astype('string')
    merged.drop([col for col in merged.columns if '_remove' in col],
                axis=1, inplace=True)
    merged.to_csv(playerDataFile)
# Entry point: run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment