Skip to content

Instantly share code, notes, and snippets.

@natarajanc
Created April 19, 2013 06:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save natarajanc/5418566 to your computer and use it in GitHub Desktop.
Save natarajanc/5418566 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>
# <headingcell level=2>
# Soccer 101 - Player Positions
# <codecell>
from IPython.core.display import Image
Image('http://www.soccer-training-guide.com/images/wikipedia_positions.png')
# <headingcell level=3>
# Parse Players data
# <codecell>
def get_players_data(players, rows):
    """Parse transfer-table rows and append one dict per row to ``players``.

    Each element of ``rows`` must offer ``find_all('td')`` returning cell
    objects with ``get_text()``; the cell at index 3 must also offer
    ``find('a').get('href')``.  Cells are identified purely by position.

    Keys written per row (all values are strings unless noted):
      * ``name``, ``handle``, ``id`` -- from cell 3; ``handle`` and ``id``
        are cut out of the link's href (segments 2 and 4 of the path).
      * ``club``, ``age``, ``position``, ``from``, ``to`` -- text of cells
        4, 5, 7, 9 and 11.
      * ``transfer_fee_pounds``, ``transfer_fee_euros`` -- from cell 12:
        the first space-separated token, and the second token minus its
        first character.  ``transfer_fee_euros`` is ``None`` when the cell
        has no second token.

    A dict is appended for every row, even when the row has fewer cells
    than expected (the dict then simply lacks the corresponding keys).
    ``players`` is mutated in place; nothing is returned.
    """
    # Cell index -> output key for the cells that are copied verbatim.
    text_columns = {
        4: 'club',
        5: 'age',
        7: 'position',
        9: 'from',
        11: 'to',
    }
    for row in rows:
        players_data = {}
        for i, item in enumerate(row.find_all('td')):
            if i == 3:
                players_data['name'] = item.get_text()
                # href is assumed to look like /<lang>/<handle>/<page>/<prefix>_<id>.html
                parts = item.find('a').get('href').split('/')
                players_data['handle'] = parts[2]
                players_data['id'] = parts[4].split('_')[1].split('.')[0]
            elif i in text_columns:
                players_data[text_columns[i]] = item.get_text()
            elif i == 12:
                fee = item.get_text().split(' ')
                players_data['transfer_fee_pounds'] = fee[0]
                # A cell with a single token (no second amount) used to raise
                # IndexError here; record the missing amount as None instead.
                # [1:] drops the token's leading character (presumably an
                # opening parenthesis -- confirm against the live page).
                if len(fee) > 1:
                    players_data['transfer_fee_euros'] = fee[1][1:]
                else:
                    players_data['transfer_fee_euros'] = None
        players.append(players_data)
# <headingcell level=3>
# Fetch Players data (transfers in a given year)
# <codecell>
def fetch_data(players, year):
    """Fetch the English Premier League transfer table for ``year``.

    Downloads the transfermarkt.co.uk transfer-record page for the given
    year, locates the ``tabelle_grafik`` table and hands its rows to
    ``get_players_data``, which appends one dict per row to ``players``.

    ``players`` is mutated in place; nothing is returned.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page contains no ``tabelle_grafik`` table.
    """
    url = 'http://www.transfermarkt.co.uk/en/premier-league/transferrekorde/wettbewerb_GB1_%s_default_default_default_alle.html' % year
    r = requests.get(url)
    # Fail loudly on HTTP errors rather than trying to parse an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text)
    table = soup.find('table', {'class': "tabelle_grafik"})
    if table is None:
        # Previously this surfaced as an AttributeError on None.
        raise ValueError('no "tabelle_grafik" table found at %s' % url)
    # Rows come in two CSS classes, "hell" and "dunkel" (presumably the
    # alternating light/dark row shading); all "hell" rows are parsed first.
    get_players_data(players, table.find_all('tr', {'class': "hell"}))
    # The first "dunkel" row is skipped -- presumably a non-player row such
    # as a header; TODO confirm against the page markup.
    get_players_data(players, table.find_all('tr', {'class': "dunkel"})[1:])
# <headingcell level=2>
# Get player performance data for a particular season in EPL
# <codecell>
def extend_data(players, new_players, year):
    """Add season performance figures to each player and collect them.

    For every dict in ``players`` (which must carry ``handle`` and ``id``),
    the player's performance page for ``year`` is downloaded and the first
    ``hell`` row of its ``standard_tabelle`` table is read.  Cells are
    picked by position and stored on the player dict under ``matches``,
    ``goals``, ``assists``, ``minutes_per_goal`` and ``minutes``.  A cell
    whose text is ``'-'`` is stored as the integer ``0``; any other value
    is stored as the raw cell text (a string).

    The player dicts are updated in place and the same objects are
    appended to ``new_players``; nothing is returned.

    Raises:
        requests.HTTPError: if a page request returns an error status.
        ValueError: if the expected table or row is missing from a page.
    """
    # Cell index in the stats row -> key stored on the player dict.
    stat_columns = {
        2: 'matches',
        3: 'goals',
        5: 'assists',
        11: 'minutes_per_goal',
        12: 'minutes',
    }
    for player in players:
        url = 'http://www.transfermarkt.co.uk/en/%s/leistungsdaten/spieler_%s_%s.html' % (player['handle'], player['id'], year)
        r = requests.get(url)
        # Fail loudly on HTTP errors rather than trying to parse an error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.text)
        table = soup.find('table', {'class': "standard_tabelle"})
        if table is None:
            # Previously this surfaced as an AttributeError on None.
            raise ValueError('no "standard_tabelle" table found at %s' % url)
        # Assumes the wanted competition is the first "hell" row.
        row = table.find('tr', {'class': "hell"})
        if row is None:
            raise ValueError('no "hell" row found in stats table at %s' % url)
        for i, item in enumerate(row.find_all('td')):
            key = stat_columns.get(i)
            if key is None:
                continue
            text = item.get_text()
            # '-' marks "no value" on the page; store it as numeric zero.
            player[key] = text if text != '-' else 0
        new_players.append(player)
# <headingcell level=2>
# Test code to dump player data by season onto files
# <codecell>
import pickle,requests
from bs4 import BeautifulSoup
import requests , re
# Scrape one season: first the transfer list, then each player's stats.
year = '2010'
players = []
new_players = []
fetch_data(players, year)
extend_data(players, new_players, year)
# Print the first enriched record as a quick sanity check.
print(new_players[0])
#pickle.dump( new_players, open( "players_data_2010.pkl", "wb" ) )
# <headingcell level=2>
# Load players dataframe
# <codecell>
import numpy as np
import pandas as pd
import pickle,matplotlib.pyplot as plt
from pandas import Series, DataFrame
def _load_season(path, season):
    """Load one pickled list of player dicts and tag it with ``season``.

    NOTE: ``pickle.load`` can execute arbitrary code embedded in the file;
    only load pickles produced by this notebook's own scraping step.
    """
    # Pickles must be read in binary mode; ``with`` closes the handle.
    with open(path, 'rb') as f:
        frame = DataFrame(pickle.load(f))
    frame['season'] = season
    return frame


# One frame per season, each labelled with its season string.
player_df_2010 = _load_season('players_data_2010.pkl', '10/11')
player_df_2012 = _load_season('players_data_2012.pkl', '12/13')
player_df_2011 = _load_season('players_data.pkl', '11/12')
print(player_df_2011.columns)

player_df = pd.concat([player_df_2010, player_df_2011, player_df_2012])

# Strip the '.' separators from the euro fee before casting to float.
# NOTE(review): relies on pandas treating the one-character pattern '.' as a
# literal rather than a regex wildcard -- confirm for the installed version.
player_df['transfer_fee_euros'] = player_df.transfer_fee_euros.str.replace('.', '')

# np.float64 is spelled out: the bare name ``float64`` only exists when the
# notebook runs with the pylab namespace injected.
for col in ('transfer_fee_euros', 'minutes', 'goals', 'age', 'assists'):
    player_df[col] = player_df[col].astype(np.float64)

# Index by player id but keep ``id`` as a regular column as well.
player_df.set_index(['id'], inplace=True, drop=False)

# Fee divided by each performance figure (division by zero yields inf/NaN).
for ratio, stat in (
    ('money_by_minutes', 'minutes'),
    ('money_by_goal_minutes', 'minutes_per_goal'),
    ('money_by_assists', 'assists'),
    ('money_by_goals', 'goals'),
    ('money_by_matches', 'matches'),
):
    player_df[ratio] = player_df['transfer_fee_euros'] / player_df[stat].astype(np.float64)

epl_teams = [ u'Southampton', u'Aston Villa', u'Man Utd', u'Swansea', u'Liverpool', u'Blackburn', u'Everton', u'Fulham', u'Newcastle', u'West Ham', u'QPR', u'Wolves', u'Chelsea', u'Spurs', u'Man City', u'Stoke City', u'Arsenal', u'Sunderland', u'Wigan', u'Birmingham', u'West Brom']

# Keep only transfers whose destination ("to") is one of the listed clubs.
epl_to = [player_df[player_df.to == team] for team in epl_teams]
player_df = pd.concat(epl_to)
print(len(player_df))
player_df = player_df.sort_index(by='season')
print(player_df)
# Unique ids of players who were transferred more than once across the
# seasons loaded into player_df (2010 to 2013).
# NOTE(review): the boolean mask comes from value_counts(), which is indexed
# by id value rather than by row; this presumably relies on .ix aligning the
# mask against player_df's id index -- verify it selects the intended rows.
transfr_more_than_once = player_df.ix[player_df.id.value_counts() > 1 ].sort_index(by=['id','season'])['id'].unique()
# Scratch: list every transfer of the players who moved more than once (2010-2013).
#print player_df.ix[transfr_more_than_once].sort_index(by=['id','season'])[['name','from','to','transfer_fee_euros','season']]
#[['name','position','from','to','transfer_fee_euros','season']]
# graph player transfer amount rise/fall through the 3 seasons.
def transfr_amt_by_season(p_id):
    """Return player ``p_id``'s euro transfer fees as a frame indexed by season.

    Reads the module-level ``player_df``; the result has a single
    ``transfer_fee_euros`` column and one row per matching transfer.
    """
    rows_for_player = player_df[player_df.id == p_id]
    fees = rows_for_player[['season', 'transfer_fee_euros']]
    return fees.set_index(keys='season')
# Fee history table: one row per season, one column per repeat-transfer player.
name_df = DataFrame(index=['10/11', '11/12', '12/13'])
for player_id in np.unique(transfr_more_than_once):
    # Column label(s): the distinct name value(s) stored under this id.
    player_names = np.unique(player_df.ix[player_id]['name'])
    name_df[player_names] = transfr_amt_by_season(player_id)
# Seasons without a transfer for a player become None instead of NaN.
name_df = name_df.where(pd.notnull(name_df), None)
#print name_df
#print name_df.plot(figsize=(10,7))
#########
# Season 11/12 transfers grouped by destination club ("to").
season_1112_rows = player_df[player_df['season'] == '11/12']
grouped = season_1112_rows.groupby(['to'])
# Scratch: total fee per minute of playing time, per club.
#(grouped['transfer_fee_euros'].sum()/ grouped['minutes'].sum()).plot(figsize=(10,10) ,kind='barh',rot=0)
# Original author's observation: Loko Moscow appears to have paid 9mil for a
# player who was on the field for only 131 minutes.
# print player_df.ix[grouped.groups['Loko Moscow']][['season','name','to','transfer_fee_euros','minutes']]
# Scratch: other aggregate plots that were tried.
#print grouped.aggregate(np.sum)[['money_by_minutes']].plot()
#print grouped['money_by_minutes'].mean().plot(figsize(10,10),kind='barh' , rot=0)
#grouped.groups.plot()
####
def compare_stat(col):
    """Plot total transfer fee divided by total ``col`` per club and season.

    ``col`` is the name of a numeric column of the module-level
    ``player_df`` (for example ``'minutes'`` or ``'goals'``).  For each of
    the seasons 10/11, 11/12 and 12/13 the transfers are grouped by
    destination club (``to``) and the summed ``transfer_fee_euros`` is
    divided by the summed ``col``.

    Two figures are produced with matplotlib (``plt``):
      * a 20x20 inch figure with one horizontal bar chart per season;
      * a 15x15 inch line plot of the transposed table (seasons on the
        x axis, one line per club).

    The season label of each bar chart is printed as it is drawn.
    Nothing is returned.
    """
    season_labels = ('10/11', '11/12', '12/13')

    def fee_per_stat(season):
        # Series of fee-sum / col-sum per destination club, named by season.
        grouped = player_df[player_df.season == season].groupby(['to'])
        ratio = grouped['transfer_fee_euros'].sum() / grouped[col].sum()
        ratio.name = season
        return ratio

    # The first season seeds the table; DataFrame.join defaults to a left
    # join, so only clubs present in that first season are kept.
    t1 = DataFrame(fee_per_stat(season_labels[0]))
    for season in season_labels[1:]:
        t1 = t1.join(fee_per_stat(season))
    # Clubs with no transfers in a later season get 0 instead of NaN.
    t1 = t1.where(pd.notnull(t1), 0)

    fig = plt.figure()
    fig.set_size_inches(20, 20)
    for i, season in enumerate(t1.columns, 1):
        print(season)
        ax = fig.add_subplot(2, 2, i)
        # Draw on the subplot explicitly.  The original passed the result of
        # a pylab-only ``figsize(5,5)`` call positionally, which needed the
        # pylab namespace and changed the global default figure size.
        t1[season].plot(ax=ax, kind='barh', title='season ' + season + ':fee by ' + col, rot=0)

    t1 = t1.transpose()
    # figsize is a keyword argument of plot(), not a function to call.
    t1.plot(figsize=(15, 15))
#for seasons in
#t1['10/11'].plot(figsize(10,10),kind='barh', rot=0)
#print player_df.ix['22491'] # 5 million paid by man utd , on bench for 2 matches - easy money!
# Per-season charts of fee per minute played and fee per goal scored.
compare_stat('minutes')
compare_stat('goals')
# Season 11/12 only: fee per goal broken down by destination club and age.
grouped = player_df[player_df['season'] == '11/12'].groupby(['to', 'age'])
#print grouped.groups
#print player_df.ix['4063']
fee_totals = grouped['transfer_fee_euros'].sum()
goal_totals = grouped['goals'].sum()
(fee_totals / goal_totals).plot(figsize=(10, 10), kind='barh', rot=0)
#name_df['Raul'] = name_sex_count_in_year('Raymond','M')
#player_df[player_df[player_df.id.value_counts() > 1].isin([True, False])]
#player_df['name'][player_df.groupby('id').size() > 1]
# <codecell>
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
#player_df.transfer_fee_euros.type
#player_df.position
#player_df.transfer_fee_euros.type
#player_df.position
print(len(player_df))
# Transfers ordered by fee; the result is not assigned, so this line only
# matters when evaluated interactively.
player_df.sort_index(by='transfer_fee_euros')[::][["name", "club",'transfer_fee_euros','from','to']]
grouped = player_df.groupby('club').size()
#grouped.order(ascending=True).plot()
club_counts = player_df['club'].value_counts()
#print club_counts
#club_counts.plot(kind='barh', rot=0)
pos_counts = player_df['position'].value_counts()  # players acquired by position - raw count
print(pos_counts)
#player_df.groupby('position')['transfer_fee_euros'].sum().plot(title='total price by position')
#print player_df.season.unique()
pieces = DataFrame()

# Average fee per position, one column per season.
# The table is seeded with whichever season comes first and the others are
# joined onto it.  The original seeded only when the season was literally
# '10/11', so any other iteration order silently discarded columns.
position_compare = None
for season in player_df.season.unique():
    p1 = player_df[player_df.season == season].groupby(['position'])['transfer_fee_euros'].mean()
    p1.name = season
    if position_compare is None:
        position_compare = DataFrame(p1)
    else:
        position_compare = position_compare.join(p1)
if position_compare is None:
    # No seasons at all: fall back to the empty frame the original started with.
    position_compare = DataFrame()
# Positions missing from a season get 0 instead of NaN.
position_compare = position_compare.where(pd.notnull(position_compare), 0)
#print position_compare
print(position_compare.plot(kind='barh'))
#player_df[player_df.season=='10/11'].groupby(['position'])['transfer_fee_euros'].mean().plot(title='avg price by position' , kind='barh', rot=0) # shows high price paid for Strikers CF
#pos_counts.plot(kind='barh', rot=0)
#player_df['transfer_fee_euros'].hist() # distribution of money spent
# <codecell>
# Distinct values of the "to" (destination club) column present in player_df;
# as the only expression in this cell, its value is what the notebook displays.
player_df.groupby('to').groups.keys()
# <codecell>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment