Last active
September 13, 2020 16:20
-
-
Save dosanjos44/6053d2e187647af57b589ccad212f00d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Import libraries & set request headers
import requests | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
# Browser-like User-Agent header sent with every request below.
headers = {'User-Agent':
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

#Start on league page & create a list of links to each team
page = "https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1"
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
pageTree = requests.get(page, headers=headers, timeout=30)
pageTree.raise_for_status()
pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
teamLinks = pageSoup.find_all('a', class_='vereinprofil_tooltip')[0:60]

#Add domain to links (only every third matching anchor is used)
linksList = ['https://www.transfermarkt.com' + each['href'] for each in teamLinks[::3]]

#Set up empty lists that will become our dataframe
playerName = []
playerValue = []
playerPosition = []
playerTeam = []

#Go through each team, and get their page ready to scrape player info
for team in linksList:
    # The fourth '/'-separated piece of the URL is used as the team name.
    teamName = team.split('/')[3]
    teamPageTree = requests.get(team, headers=headers, timeout=30)
    teamPageTree.raise_for_status()
    teamPageSoup = BeautifulSoup(teamPageTree.content, 'html.parser')
    teamTable = teamPageSoup.find_all("table", {"class": "items"})
    if not teamTable:
        # No table with class "items" on this page: skip the team instead of
        # failing with an IndexError on teamTable[0].
        continue
    teamRows = teamTable[0].find_all("tr")

    #Go through each player and get their details, appending them to the relevant list
    for each in teamRows[1::3]:
        # Read every field before appending anything, so a row that raises
        # part-way cannot leave the four lists with different lengths.
        value = each.find_all('td', {'class': 'rechts'})[0].get_text().replace('\xa0', '')
        name = each.find_all('a', {'class': 'spielprofil_tooltip'})[0].get_text()
        position = each.find("td")['title']

        playerValue.append(value)
        playerName.append(name)
        playerPosition.append(position)
        playerTeam.append(teamName)

#Group our lists into a dataframe
df = pd.DataFrame({'Player': playerName, 'Team': playerTeam, 'Position': playerPosition, 'Value': playerValue})
#Function to parse the weird value format into an integer
def calculateValue(value):
    """Convert a textual market value into a whole number of currency units.

    The magnitude is read as a decimal number and scaled by the unit suffix,
    so the result does not depend on how many decimal places are printed
    ('€7.5m', '€7.50m' and '€45m' are all handled).

    Args:
        value: Market value text such as '€45.00m' or '€500Th.'.

    Returns:
        The value as an int, or 0 when the text carries no recognised unit
        or no parseable number (for example '-').
    """
    multipliers = {'bn': 1000000000, 'm': 1000000, 'th': 1000, 'k': 1000}
    text = str(value)

    # Only the letters of the text form the unit; currency signs, digits and
    # punctuation are ignored. Matching is case-insensitive.
    unit = ''.join(filter(str.isalpha, text)).lower()
    multiplier = multipliers.get(unit)
    if multiplier is None:
        return 0

    # Keep digits and the decimal point; strip stray dots such as the
    # trailing one in 'Th.' so they are not read as part of the number.
    number = ''.join(ch for ch in text if ch in '0123456789.').strip('.')
    try:
        # round() guards against binary floating-point error before truncation.
        return int(round(float(number) * multiplier))
    except ValueError:
        # No digits at all, or more than one decimal point.
        return 0
#Use this function to create a new column
# Series.map applies calculateValue to each entry of the 'Value' column
# directly, so no per-row Series has to be built as with a row-wise apply.
df['NumValue'] = df['Value'].map(calculateValue)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment