Created
October 2, 2019 17:55
-
-
Save StefanBelo/7098d5f78a214497a1ed6afab7b56f63 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
### Data Acquisition ### | |
def get_data_urls(leagueId, fromYear, toYear):
    """Build the football-data.co.uk CSV URLs for a run of seasons.

    One entry is produced for every season start year from ``fromYear``
    up to, but not including, ``toYear``. A season id is the two-digit
    start year followed by the two-digit end year, e.g. ``'0506'``.

    Args:
        leagueId: Code used as the CSV file name in the URL, e.g. ``'E0'``.
        fromYear: Two-digit start year of the first season (inclusive).
        toYear: Two-digit start year at which to stop (exclusive).

    Returns:
        A list of ``(label, url)`` tuples, where ``label`` is ``'S'``
        followed by the season id. Empty when ``fromYear >= toYear``.
    """
    data_urls = []
    for season in range(fromYear, toYear):
        # Wrap at the century so a season starting in 99 yields '9900'
        # instead of the five-character '99100'.
        season_id = '{0:02d}{1:02d}'.format(season % 100, (season + 1) % 100)
        url = 'http://www.football-data.co.uk/mmz4281/{0}/{1}.csv'.format(season_id, leagueId)
        data_urls.append(('S{}'.format(season_id), url))
    return data_urls
# Smoke test: list each season label next to its download URL.
data_urls = get_data_urls('E0', 5, 20)
for season_id, url in data_urls:
    print(season_id, url, sep=': ')
# Load data | |
def load_football_data(leagueId='E0', fromYear=5, toYear=20):
    """Download the season CSVs and combine them into one cleaned DataFrame.

    Each season file is read from the URL produced by ``get_data_urls``,
    tagged with a ``Season`` column, stripped of sparsely populated columns
    and incomplete rows, and then stacked with the other seasons.

    Args:
        leagueId: League code passed to ``get_data_urls`` (default ``'E0'``).
        fromYear: Two-digit start year of the first season (inclusive).
        toYear: Two-digit start year at which to stop (exclusive).

    Returns:
        A DataFrame sorted by ``Date`` that contains only the columns with
        no missing values across all loaded seasons. An empty DataFrame is
        returned when the year range selects no seasons.
    """
    season_frames = []
    for season_id, url in get_data_urls(leagueId, fromYear, toYear):
        season_data = pd.read_csv(url)
        season_data['Season'] = season_id
        season_data = (
            season_data
            # Keep only the columns with at most 30 missing values this season.
            .dropna(axis='columns', thresh=season_data.shape[0] - 30)
            # The provider documents dates as dd/mm/yy; without dayfirst an
            # ambiguous date such as 01/02/06 is read as 2 January.
            .assign(Date=lambda df: pd.to_datetime(df.Date, dayfirst=True))
            .dropna()
        )
        season_frames.append(season_data)

    if not season_frames:
        # pd.concat raises on an empty list; nothing was requested.
        return pd.DataFrame()

    # Concatenate once: DataFrame.append was removed in pandas 2.0 and
    # re-copied the accumulated data on every iteration.
    all_data = pd.concat(season_frames, sort=True)
    # Columns missing from some seasons come out of the concat with NaNs;
    # drop those columns first, then any row that is still incomplete.
    return all_data.dropna(axis=1).dropna().sort_values(by='Date')
df = load_football_data()

### Data Exploration ###
print(df.tail(3))
# Print the column names; a bare ``list(df)`` only displays in a notebook.
print(list(df))
df = df.drop(columns=['Div'])

# Create Home Win, Draw and Away Win indicator columns (1 when the outcome
# occurred, else 0). Comparing whole columns replaces three row-by-row
# ``apply`` passes and gives the same 0/1 values.
df = df.assign(
    HomeWin=(df.FTHG > df.FTAG).astype('int64'),
    Draw=(df.FTHG == df.FTAG).astype('int64'),
    AwayWin=(df.FTHG < df.FTAG).astype('int64'),
)

# Select the indicator columns before averaging: taking the mean of every
# column raises on text columns (team names, referee) in pandas >= 2.0.
winRates = df.groupby('Season')[['HomeWin', 'Draw', 'AwayWin']].mean()
print(winRates)
### Findings ###
# Set the style
plt.style.use('ggplot')

fig = plt.figure()
ax = fig.add_subplot(111)

# Each ax.plot call returns a one-element list of line handles, which is
# what the legend calls below take as ``handles``.
homeLine = ax.plot(winRates.HomeWin, label='Home Win Rate')
awayLine = ax.plot(winRates.AwayWin, label='Away Win Rate')
drawLine = ax.plot(winRates.Draw, label='Draw Win Rate')

ax.set_xlabel('Season')
ax.set_ylabel('Win Rate')
ax.set_title('Win Rates', fontsize=16)

# Add the legend locations. Every legend() call replaces the axes' current
# legend, so the earlier ones are re-attached as plain artists. ``ax`` is
# deliberately not reassigned, so it keeps referring to the Axes.
home_legend = ax.legend(handles=homeLine, loc='upper right', bbox_to_anchor=(1, 1))
ax.add_artist(home_legend)
away_legend = ax.legend(handles=awayLine, loc='center right', bbox_to_anchor=(0.95, 0.4))
ax.add_artist(away_legend)
draw_legend = ax.legend(handles=drawLine, loc='center right', bbox_to_anchor=(0.95, 0.06))

# Show this figure now; otherwise later plotting calls that target the
# current axes would draw on top of the win-rate chart.
plt.show()
# HGA as home win rate - away win rate.
home_win_rates = df.groupby(['HomeTeam']).HomeWin.mean()
away_win_rates = df.groupby(['AwayTeam']).AwayWin.mean()
# Name the result and its index explicitly rather than renaming whatever
# default labels (0, 'index') the subtraction happens to leave behind.
hga = (
    (home_win_rates - away_win_rates)
    .rename('HGA')
    .rename_axis('HomeTeam')
    .reset_index()
    .sort_values(by='HGA', ascending=False)
)
print(hga.head(10))

# Big Clubs
big_clubs = ['Liverpool', 'Man City', 'Man United', 'Chelsea', 'Arsenal']
home_win_rates_5 = df[df.HomeTeam.isin(big_clubs)].groupby(['HomeTeam', 'Season']).HomeWin.mean()
away_win_rates_5 = df[df.AwayTeam.isin(big_clubs)].groupby(['AwayTeam', 'Season']).AwayWin.mean()
hga_top_5 = home_win_rates_5 - away_win_rates_5
# One column per club, one row per season.
print(hga_top_5.unstack(level=0))

# Start a fresh figure so the lines are not drawn onto whichever axes
# happen to be current from an earlier plot.
plt.figure()
sns.lineplot(
    x='Season',
    y='HGA',
    hue='Team',
    # Explicit names guarantee the 'Team', 'Season' and 'HGA' columns exist.
    data=hga_top_5.rename('HGA').rename_axis(['Team', 'Season']).reset_index(),
)
plt.legend(loc='lower center', ncol=6, bbox_to_anchor=(0.45, -0.2))
plt.title("HGA Among the top 5 clubs", fontsize=14)
plt.show()
# Plot the distribution of every numeric column, showing each plot before
# moving on to the next column.
numeric_df = df.select_dtypes('number')
for col in numeric_df:
    sns.distplot(numeric_df[col])
    plt.title(f"Distribution for {col}")
    plt.show()
### Exploring Referee Home Ground Bias ###
print('Overall Home Win Rate: {:.4}%'.format(df.HomeWin.mean() * 100))

# Get the top 10 refs based on games
top_10_refs = df.Referee.value_counts().head(10).index

# Home win rate for each of those referees, highest first. The result is
# printed because a bare expression is discarded when run as a script.
referee_home_win_rates = (
    df[df.Referee.isin(top_10_refs)]
    .groupby('Referee')
    .HomeWin.mean()
    .sort_values(ascending=False)
)
print(referee_home_win_rates)
### Variable Correlation With Margin ###
# Margin is the full-time goal difference (home goals minus away goals).
df['Margin'] = df['FTHG'] - df['FTAG']
# Difference between the B365H and B365A columns.
df['HomeWinMargin'] = df['B365H'] - df['B365A']

stat_cols = ['AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'HC', 'HF', 'HR', 'HS', 'HST', 'HTR', 'HY', 'Margin', 'HomeWinMargin']
# Restrict to numeric columns before correlating: a text column in the
# list (the data provider documents HTR as H/D/A) makes corr() raise in
# pandas >= 2.0, whereas older versions silently skipped it.
stat_correlations = df[stat_cols].select_dtypes('number').corr()
print(stat_correlations['Margin'].sort_values())

sns.heatmap(stat_correlations, annot=True, annot_kws={'size': 10})
# Without an explicit show() the heatmap is never displayed in a script.
plt.show()

### Save DataFrame ###
# NOTE(review): hard-coded absolute Windows path - confirm it exists on the
# machine running this script or make it configurable.
df.to_csv(
    r'D:\Projects\Bfexplorer\Machine Learning\Football\Data\football_dataframe.csv',
    columns=['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'HomeWin', 'Draw', 'AwayWin', 'HomeWinMargin'],
    index=False,
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment