Skip to content

Instantly share code, notes, and snippets.

@StefanBelo
Created October 2, 2019 17:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save StefanBelo/7098d5f78a214497a1ed6afab7b56f63 to your computer and use it in GitHub Desktop.
Save StefanBelo/7098d5f78a214497a1ed6afab7b56f63 to your computer and use it in GitHub Desktop.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
### Data Acquisition ###
def get_data_urls(leagueId, fromYear, toYear):
    """Build the football-data.co.uk CSV URLs for a range of seasons.

    Args:
        leagueId: League code used as the CSV file name (e.g. ``'E0'``).
        fromYear: Two-digit starting year of the first season (inclusive).
        toYear: Starting year at which to stop (exclusive).

    Returns:
        A list of ``(season_label, url)`` tuples, one per season.
        ``season_label`` is ``'S'`` followed by the four-digit season id,
        e.g. ``'S0506'``. The list is empty when ``fromYear >= toYear``.
    """
    data_urls = []
    for start_year in range(fromYear, toYear):
        # Wrap both halves to two digits so the 1999/2000 season becomes
        # '9900' instead of the five-character '99100'.
        season_id = '{0:02d}{1:02d}'.format(start_year % 100, (start_year + 1) % 100)
        url = 'http://www.football-data.co.uk/mmz4281/{0}/{1}.csv'.format(season_id, leagueId)
        data_urls.append(('S{}'.format(season_id), url))
    return data_urls
# Smoke test: print every season label next to the URL it is fetched from
data_urls = get_data_urls('E0', 5, 20)
for entry in data_urls:
    # Each entry is a (season label, url) pair; unpack it straight into the template
    print('{}: {}'.format(*entry))
# Load data
def load_football_data(leagueId='E0', fromYear=5, toYear=20):
    """Download the per-season result files and combine them into one frame.

    Args:
        leagueId: League code passed through to ``get_data_urls``.
        fromYear: Two-digit starting year of the first season (inclusive).
        toYear: Starting year at which to stop (exclusive).

    Returns:
        A DataFrame of all seasons, sorted by ``Date``, with a ``Season``
        label column added. Only columns present and fully populated in
        every season are kept. An empty DataFrame is returned when the
        season range is empty.
    """
    season_frames = []
    for season_id, url in get_data_urls(leagueId, fromYear, toYear):
        season_data = pd.read_csv(url)
        season_data['Season'] = season_id
        season_frames.append(
            season_data
            # Keep only the columns missing in at most 30 rows of this season.
            .dropna(axis='columns', thresh=season_data.shape[0] - 30)
            # The data source publishes dates day-first (dd/mm/yy); without
            # dayfirst an ambiguous date such as 05/08 is read as 8 May.
            .assign(Date=lambda df: pd.to_datetime(df.Date, dayfirst=True))
            .dropna()
        )

    if not season_frames:
        # pd.concat raises on an empty list, so return an empty frame instead.
        return pd.DataFrame()

    # A single concat replaces the repeated DataFrame.append, which was
    # removed in pandas 2.0 and copied the accumulated frame on every pass.
    all_data = pd.concat(season_frames, sort=True)
    # Columns absent from any season are NaN after the concat; drop them.
    return all_data.dropna(axis=1).dropna().sort_values(by='Date')
df = load_football_data()
### Data Exploration ###
print(df.tail(3))
# Column names (the result is discarded outside an interactive session)
list(df)
df = df.drop(columns=['Div'])
# Create Home Win, Draw and Away Win indicator columns (1 if the outcome
# occurred, else 0). Comparing whole FTHG/FTAG columns replaces the
# row-by-row apply.
df = df.assign(
    HomeWin=lambda d: (d.FTHG > d.FTAG).astype(int),
    Draw=lambda d: (d.FTHG == d.FTAG).astype(int),
    AwayWin=lambda d: (d.FTHG < d.FTAG).astype(int),
)
# Select the indicator columns before averaging: averaging every column
# raises on text columns such as team names in pandas >= 2.0.
winRates = df.groupby('Season')[['HomeWin', 'Draw', 'AwayWin']].mean()
print(winRates)
### Findings ###
# Set the style
plt.style.use('ggplot')
fig = plt.figure()
ax = fig.add_subplot(111)
# One line per outcome, plotted against the Season index of winRates
homeLine = ax.plot(winRates.HomeWin, label='Home Win Rate')
awayLine = ax.plot(winRates.AwayWin, label='Away Win Rate')
drawLine = ax.plot(winRates.Draw, label='Draw Win Rate')
ax.set_xlabel('Season')
ax.set_ylabel('Win Rate')
plt.title('Win Rates', fontsize=16)
# Add the legend locations.
# Each legend() call replaces the axes' current legend, so the earlier ones
# are re-attached as plain artists to keep all three visible. `ax` is left
# pointing at the axes rather than being overwritten by add_artist's result.
home_legend = ax.legend(handles=homeLine, loc='upper right', bbox_to_anchor=(1, 1))
ax.add_artist(home_legend)
away_legend = ax.legend(handles=awayLine, loc='center right', bbox_to_anchor=(0.95, 0.4))
ax.add_artist(away_legend)
draw_legend = ax.legend(handles=drawLine, loc='center right', bbox_to_anchor=(0.95, 0.06))
# Render this figure now; otherwise later plotting calls draw on top of it.
plt.show()
# HGA as home win rate - away win rate.
home_win_rates = df.groupby(['HomeTeam']).HomeWin.mean()
away_win_rates = df.groupby(['AwayTeam']).AwayWin.mean()
# Give the away series the same index name as the home series. Otherwise the
# 'HomeTeam' label survives the subtraction only when both indexes happen to
# be identical and no alignment is needed.
hga = (
    (home_win_rates - away_win_rates.rename_axis('HomeTeam'))
    .reset_index()
    .rename(columns={0: 'HGA'})
    .sort_values(by='HGA', ascending=False)
)
# Top 10 teams by HGA (the result is discarded outside an interactive session)
hga.head(10)
# Big Clubs
big_clubs = ['Liverpool', 'Man City', 'Man United', 'Chelsea', 'Arsenal']
home_win_rates_5 = df[df.HomeTeam.isin(big_clubs)].groupby(['HomeTeam', 'Season']).HomeWin.mean()
away_win_rates_5 = df[df.AwayTeam.isin(big_clubs)].groupby(['AwayTeam', 'Season']).AwayWin.mean()
# Same index-name alignment as above, so 'HomeTeam' is always available to
# the rename below.
hga_top_5 = home_win_rates_5 - away_win_rates_5.rename_axis(['HomeTeam', 'Season'])
# Season x team table (the result is discarded outside an interactive session)
hga_top_5.unstack(level=0)
# Start a new figure so the lines are not drawn onto whichever axes are current.
plt.figure()
sns.lineplot(x='Season', y='HGA', hue='Team', data=hga_top_5.reset_index().rename(columns={0: 'HGA', 'HomeTeam': 'Team'}))
plt.legend(loc='lower center', ncol=6, bbox_to_anchor=(0.45, -0.2))
plt.title("HGA Among the top 5 clubs", fontsize=14)
plt.show()
# Plot one distribution figure per numeric column
numeric_data = df.select_dtypes('number')
for col, values in numeric_data.items():
    sns.distplot(values)
    plt.title(f"Distribution for {col}")
    # Show inside the loop so each column gets its own figure
    plt.show()
### Exploring Referee Home Ground Bias ###
overall_home_win_pct = df.HomeWin.mean() * 100
print('Overall Home Win Rate: {:.4}%'.format(overall_home_win_pct))
# The ten referees with the most matches in the data set
top_10_refs = df.Referee.value_counts().head(10).index
# Home win rate per referee, highest first, restricted to those ten
top_ref_games = df[df.Referee.isin(top_10_refs)]
top_ref_games.groupby('Referee').HomeWin.mean().sort_values(ascending=False)
### Variable Correlation With Margin ###
# Goal difference from the home side's point of view
df['Margin'] = df['FTHG'] - df['FTAG']
# Difference between the B365H and B365A columns
df['HomeWinMargin'] = df['B365H'] - df['B365A']
stat_cols = ['AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'HC', 'HF', 'HR', 'HS', 'HST', 'HTR', 'HY', 'Margin', 'HomeWinMargin']
# Restrict to numeric columns before correlating: corr() raises on text
# columns in pandas >= 2.0, where older versions silently skipped them.
# NOTE(review): the data source documents HTR as an H/D/A result code,
# so it is expected to drop out here - confirm against the loaded frame.
stat_correlations = df[stat_cols].select_dtypes('number').corr()
# Correlations with Margin (the result is discarded outside an interactive session)
stat_correlations['Margin'].sort_values()
sns.heatmap(stat_correlations, annot=True, annot_kws={'size': 10})
# Without an explicit show the heatmap is built but never displayed.
plt.show()
### Save DataFrame ###
# Only the match identifiers, score, outcome flags and HomeWinMargin are exported.
output_columns = ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'HomeWin', 'Draw', 'AwayWin', 'HomeWinMargin']
# index=False is the documented way to omit the row index (index is a bool flag).
df.to_csv(r'D:\Projects\Bfexplorer\Machine Learning\Football\Data\football_dataframe.csv', columns=output_columns, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment