Skip to content

Instantly share code, notes, and snippets.

@dosanjos44
Last active September 13, 2020 16:20
Show Gist options
  • Save dosanjos44/6053d2e187647af57b589ccad212f00d to your computer and use it in GitHub Desktop.
Save dosanjos44/6053d2e187647af57b589ccad212f00d to your computer and use it in GitHub Desktop.
#Import Libraries & set headers
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Browser-like User-Agent header sent with every request made by this script.
headers = {'User-Agent':
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

# Start on the league page and collect the anchors that point at team pages.
page = "https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1"
# The timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() stops early on an HTTP error instead of parsing an error page.
pageTree = requests.get(page, headers=headers, timeout=30)
pageTree.raise_for_status()
pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
teamLinks = pageSoup.find_all('a', class_='vereinprofil_tooltip')[0:60]

# Prefix the domain onto every third link to build absolute team URLs.
# NOTE(review): the [0:60] / [::3] slicing presumably assumes 20 clubs, each
# linked three times in a row on the league table -- confirm against the page.
linksList = ['https://www.transfermarkt.com' + link['href']
             for link in teamLinks[::3]]
# Parallel lists, one entry per player, that become the dataframe columns.
playerName = []
playerValue = []
playerPosition = []
playerTeam = []

# Go through each team and get its page ready to scrape player info.
for team in linksList:
    # Index 3 of the '/'-split URL is the first path segment after the domain.
    # NOTE(review): presumably this segment is the club's slug -- confirm.
    teamName = team.split('/')[3]
    # The timeout prevents an indefinite hang on a stalled connection;
    # raise_for_status() fails fast on an HTTP error rather than scraping
    # an error page and hitting an opaque IndexError below.
    teamPageTree = requests.get(team, headers=headers, timeout=30)
    teamPageTree.raise_for_status()
    teamPageSoup = BeautifulSoup(teamPageTree.content, 'html.parser')
    teamTable = teamPageSoup.find_all("table", {"class": "items"})
    teamRows = teamTable[0].find_all("tr")

    # Take every third row starting at index 1 and append that player's
    # details to the relevant lists.
    # NOTE(review): presumably row 0 is the table header and each player
    # spans three <tr> elements -- confirm against the page markup.
    for row in teamRows[1::3]:
        playerValue.append(row.find_all('td', {'class': 'rechts'})[0].get_text().replace('\xa0', ''))
        playerName.append(row.find_all('a', {'class': 'spielprofil_tooltip'})[0].get_text())
        playerPosition.append(row.find("td")['title'])
        playerTeam.append(teamName)

# Group our lists into a dataframe.
df = pd.DataFrame({'Player': playerName, 'Team': playerTeam, 'Position': playerPosition, 'Value': playerValue})
#Function to parse the weird value format into an integer
def calculateValue(value):
    """Convert a market-value string such as '€12.50m' or '€500Th.' to an int.

    The unit is the run of alphabetic characters in *value* and the amount is
    its decimal number. The decimal point is honoured, so '€12.50m', '€12.5m'
    and '€12m' all scale correctly regardless of how many decimals are shown.

    Args:
        value: The raw value text, e.g. '€12.50m', '€500Th.' or '-'.

    Returns:
        The value as a whole number of currency units, or 0 when the unit is
        not recognised or no number can be read from the text.
    """
    # Recognised unit suffixes and the factor each one stands for.
    multipliers = {'bn': 1000000000, 'm': 1000000, 'Th': 1000, 'k': 1000}

    unit = ''.join(filter(str.isalpha, value))
    multiplier = multipliers.get(unit)
    if multiplier is None:
        return 0

    # Keep digits and the decimal point; strip('.') drops the trailing
    # full stop of the 'Th.' abbreviation so it is not read as a decimal point.
    number = ''.join(ch for ch in value if ch in '0123456789.').strip('.')
    try:
        amount = float(number)
    except ValueError:
        # No digits, or more than one decimal point: treat as unknown.
        return 0

    # round() absorbs binary floating-point error before truncating to int.
    return int(round(amount * multiplier))
# Use calculateValue to create a numeric column from the raw 'Value' text.
# Applying it to the single column (rather than row-wise over the whole frame)
# always yields a Series, including when the dataframe is empty.
df['NumValue'] = df['Value'].apply(calculateValue)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment