#Import Libraries & set headers
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Browser-style User-Agent header; it is passed as `headers=` to every
# requests.get call in this script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
    ),
}
#Start on league page & create a list of links to each team
page = "https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1"
# A timeout stops the script from hanging indefinitely on a stalled connection.
pageTree = requests.get(page, headers=headers, timeout=30)
# Fail loudly on an HTTP error status instead of parsing an error page.
pageTree.raise_for_status()
pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
# Keep only the first 60 club anchors found on the page.
# NOTE(review): presumably each club is linked three times in the league
# table (20 clubs x 3 anchors), which is why every third anchor is used
# below -- confirm against the current page markup.
teamLinks = pageSoup.find_all('a', class_='vereinprofil_tooltip')[0:60]
#Add domain to links
linksList = [
    'https://www.transfermarkt.com' + each['href']
    for each in teamLinks[::3]
]
#Set up empty lists that will become our dataframe
playerName = []
playerValue = []
playerPosition = []
playerTeam = []
#Go through each team, and get their page ready to scrape player info
for team in linksList:
    # Each link is 'https://www.transfermarkt.com' + href, so index 3 of the
    # '/'-split is the first path segment (assumes href starts with '/').
    teamName = team.split('/')[3]
    # A timeout stops the script from hanging on a stalled connection.
    teamPageTree = requests.get(team, headers=headers, timeout=30)
    # Fail loudly on an HTTP error status instead of parsing an error page.
    teamPageTree.raise_for_status()
    teamPageSoup = BeautifulSoup(teamPageTree.content, 'html.parser')
    teamTable = teamPageSoup.find_all("table", {"class": "items"})
    # Only the first table with class "items" is used.
    teamRows = teamTable[0].find_all("tr")
    #Go through each player and get their details, appending them to the relevant list
    # Row 0 is skipped and then every third row is taken.
    # NOTE(review): presumably row 0 is the header and each player spans
    # three <tr> elements -- confirm against the current page markup.
    for each in teamRows[1::3]:
        # Extract every field before appending anything, so a failure on one
        # field cannot leave the four lists with different lengths.
        rowValue = each.find_all('td', {'class': 'rechts'})[0].get_text().replace('\xa0', '')
        rowName = each.find_all('a', {'class': 'spielprofil_tooltip'})[0].get_text()
        rowPosition = each.find("td")['title']
        playerValue.append(rowValue)
        playerName.append(rowName)
        playerPosition.append(rowPosition)
        playerTeam.append(teamName)
#Group our lists into a dataframe
# One row per scraped player; the four lists are filled in lockstep above.
df = pd.DataFrame({
    'Player': playerName,
    'Team': playerTeam,
    'Position': playerPosition,
    'Value': playerValue,
})
#Function to parse the weird value format into an integer
# Multiplier for each recognised unit suffix.
_UNIT_MULTIPLIERS = {
    'bn': 1000000000,
    'm': 1000000,
    'Th': 1000,
    'k': 1000,
}


def calculateValue(value):
    """Convert a market-value string such as '€72.00m' into an integer.

    The unit is every alphabetic character in ``value`` joined together
    (so '€900Th.' has unit 'Th'); the amount is the first decimal number
    in the string, with either '.' or ',' accepted as decimal separator.

    Args:
        value: Scraped value text, e.g. '€72.00m', '€72.5m', '€900Th.'.

    Returns:
        The amount as an int, or 0 when the unit is not recognised or the
        string contains no number (for example '-' or '').
    """
    unit = ''.join(filter(str.isalpha, value))
    multiplier = _UNIT_MULTIPLIERS.get(unit)
    if multiplier is None:
        return 0
    match = re.search(r'\d+(?:[.,]\d+)?', value)
    if match is None:
        return 0
    # Parse the real decimal amount rather than stripping the separator, so
    # the result does not depend on how many decimal digits are shown.
    amount = float(match.group().replace(',', '.'))
    # round() absorbs binary floating-point error and returns an int.
    return round(amount * multiplier)
#Use this function to create a new column
# Apply the parser to the 'Value' column directly; this always yields a
# Series, unlike a row-wise DataFrame.apply.
df['NumValue'] = df['Value'].apply(calculateValue)
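# NOTE: the text below is GitHub page footer residue captured with the gist; it is not part of the script.
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment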