Created
October 13, 2022 15:34
-
-
Save sertalpbilal/a8273d63e55a328220adf2b75027194f to your computer and use it in GitHub Desktop.
FPL Data Prep (Convert any type of prediction data into FPLReview format)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import pathlib | |
import os | |
import json | |
from unicodedata import combining, normalize | |
import requests | |
from fuzzywuzzy import fuzz | |
import numpy as np | |
from abc import ABC, abstractmethod | |
class DataSource(ABC):
    """Abstract base for a prediction-data provider.

    A concrete subclass wraps one input file (``source``) and implements
    :meth:`convert` to produce a DataFrame in FPLReview format.
    """

    def __init__(self, source):
        # Path of the raw data file this provider reads from.
        self.source = source

    @abstractmethod
    def convert(self, target):
        """Convert the raw data; optionally write CSV output to ``target``."""
        pass
# To remove accents in names
def fix_name_dialect(name):
    """Strip accents/diacritics from *name* (e.g. 'Sørloth' -> 'Sorloth')."""
    # NFKD splits each accented character into base letter + combining
    # mark(s); the combining marks are then dropped.
    stripped = ''.join(ch for ch in normalize('NFKD', name) if not combining(ch))
    # Ø/ø do not decompose under NFKD, so map them explicitly.
    for accented, plain in (('Ø', 'O'), ('ø', 'o'), ('ã', 'a')):
        stripped = stripped.replace(accented, plain)
    return stripped
def get_best_score(r):
    """Best fuzzy-match score for one candidate: the larger of its
    web-name score and its combined (first + last) name score."""
    wn, cn = r['wn_score'], r['cn_score']
    return wn if wn >= cn else cn
class Review(DataSource):
    """Data already in FPLReview format; convert is a pass-through."""

    def __init__(self, source):
        super().__init__(source)

    def convert(self, target=None):
        """Read the source CSV (first column is the index), optionally
        re-save it to ``target``, and return the DataFrame unchanged.
        """
        # BUG FIX: the original called pd.read_csv(index_col=0) without a
        # file path, which raises TypeError; read from self.source.
        df = pd.read_csv(self.source, index_col=0)
        if target:
            df.to_csv(target)
        return df
class Mikkel(DataSource):
    """Mikkel's spreadsheet -> FPLReview format.

    The raw sheet carries no FPL IDs, so players are fuzzy-matched against
    the official bootstrap-static element list by name, team and position.
    """

    def __init__(self, source):
        super().__init__(source)

    # To add FPL ID column to Mikkel's data and clean empty rows
    def fix_mikkel(self):
        """Clean the raw CSV and attach an 'FPL ID' column.

        Returns the cleaned DataFrame plus zero-filled rows for any FPL
        player missing from the sheet.
        """
        df = pd.read_csv(self.source, encoding='latin1')
        remove_accents = fix_name_dialect
        r = requests.get("https://fantasy.premierleague.com/api/bootstrap-static/")
        players = r.json()['elements']
        # Mikkel's sheet uses non-standard 3-letter team codes; map FPL's
        # codes onto his so team-based match filtering lines up.
        mikkel_team_dict = {
            'BHA': 'BRI',
            'CRY': 'CPL',
            'NFO': 'NOT',
            'SOU': 'SOT',
            'WHU': 'WHM'
        }
        teams = r.json()['teams']
        for t in teams:
            t['mikkel_short'] = mikkel_team_dict.get(t['short_name'], t['short_name'])
        # Rows whose ' BCV ' cell is not numeric, or with a placeholder name
        # or missing squad number, are blank/separator rows -- drop them.
        df['BCV_numeric'] = pd.to_numeric(df[' BCV '], errors='coerce')
        # FIX: the original filter repeated the df['No.'].isnull() clause.
        df_cleaned = df[~((df['Player'] == '0') | (df['No.'].isnull()) | (df['BCV_numeric'].isnull()))].copy()
        df_cleaned['Clean_Name'] = df_cleaned['Player'].apply(remove_accents)
        mikkel_team_fix = {'WHU': 'WHM'}
        df_cleaned['Team'] = df_cleaned['Team'].replace(mikkel_team_fix)
        df_cleaned['Position'] = df_cleaned['Position'].replace({'GK': 'G'})
        element_type_dict = {1: 'G', 2: 'D', 3: 'M', 4: 'F'}
        team_code_dict = {i['code']: i for i in teams}
        player_names = [{
            'id': e['id'],
            'web_name': e['web_name'],
            'combined': e['first_name'] + ' ' + e['second_name'],
            'team': team_code_dict[e['team_code']]['mikkel_short'],
            'position': element_type_dict[e['element_type']],
        } for e in players]
        for candidate in player_names:
            candidate['wn'] = remove_accents(candidate['web_name'])
            candidate['cn'] = remove_accents(candidate['combined'])
        entries = []
        for player in df_cleaned.iloc:
            # Only consider FPL players with the same team and position.
            possible_matches = [i for i in player_names if i['team'] == player['Team'] and i['position'] == player['Position']]
            # FIX: guard against an empty candidate list -- max() below would
            # raise ValueError. Mirrors the check in the Scout converter.
            if len(possible_matches) == 0:
                print("No matches for ", player['Player'])
                continue
            p = player['Clean_Name']
            for candidate in possible_matches:
                candidate['wn_score'] = fuzz.token_set_ratio(p, candidate['wn'])
                candidate['cn_score'] = fuzz.token_set_ratio(p, candidate['cn'])
            best_match = max(possible_matches, key=get_best_score)
            entries.append({'player_input': player['Player'], 'team_input': player['Team'], 'position_input': player['Position'], **best_match})
        entries_df = pd.DataFrame(entries)
        # 'name @ team' keys map the match results back onto sheet rows.
        entries_df['name_team'] = entries_df['player_input'] + ' @ ' + entries_df['team_input']
        entry_dict = entries_df.set_index('name_team')['id'].to_dict()
        df_cleaned['name_team'] = df_cleaned['Player'] + ' @ ' + df_cleaned['Team']
        df_cleaned['FPL ID'] = df_cleaned['name_team'].map(entry_dict)
        # Append any FPL player absent from the sheet with zeroed columns.
        existing_ids = set(df_cleaned['FPL ID'].tolist())
        missing_players = []
        for p in players:
            if p['id'] in existing_ids:
                continue
            missing_players.append({
                'Position': element_type_dict[p['element_type']],
                'Player': p['web_name'],
                ' Price ': p['now_cost'] / 10,
                'FPL ID': p['id'],
                ' Weighted minutes ': 0
            })
        df_full = pd.concat([df_cleaned, pd.DataFrame(missing_players)]).fillna(0)
        return df_full

    # To convert cleaned Mikkel data into Review format
    def convert(self, target=None):
        """Build the FPLReview-format DataFrame; write to ``target`` if given."""
        # Read and add ID column
        raw_data = self.fix_mikkel()
        static_url = 'https://fantasy.premierleague.com/api/bootstrap-static/'
        r = requests.get(static_url).json()
        teams = r['teams']
        # Headers in the sheet carry padding spaces (e.g. ' Price ').
        new_names = {i: i.strip() for i in raw_data.columns}
        raw_data.rename(columns=new_names, inplace=True)
        # Price >= 20 presumably marks junk/aggregate rows -- real FPL
        # prices are well below 20. TODO confirm against the sheet.
        df_clean = raw_data[raw_data['Price'] < 20].copy()
        # FIX: plain assignment instead of chained fillna(..., inplace=True),
        # which pandas warns about and may not write through.
        df_clean['Weighted minutes'] = df_clean['Weighted minutes'].fillna('90')
        df_clean['review_id'] = df_clean['FPL ID'].astype(int)
        pos_fix = {'GK': 'G'}
        df_clean['Pos'] = df_clean['Position'].replace(pos_fix)
        # Keepers and defenders are assumed to play full games.
        df_clean.loc[df_clean['Pos'].isin(['G', 'D']), 'Weighted minutes'] = '90'
        gws = []
        for col in df_clean.columns:
            try:
                int(col)
                df_clean[f'{col}_Pts'] = df_clean[col].str.strip().replace({'-': 0}).astype(float)
                df_clean[f'{col}_xMins'] = df_clean['Weighted minutes'].str.strip().replace({'-': 0}).astype(float).replace({np.nan: 0})
                gws.append(col)
            # FIX: was a bare `except:`. A non-integer header raises
            # ValueError; a non-string gameweek column raises AttributeError
            # on .str -- skip either way, as before.
            except (ValueError, AttributeError):
                continue
        df_clean['Name'] = df_clean['Player']
        df_clean['Value'] = df_clean['Price']
        df_final = df_clean[['review_id', 'Name', 'Pos', 'Value'] + [f'{gw}_{tag}' for gw in gws for tag in ['Pts', 'xMins']]].copy()
        df_final.replace({'-': 0}, inplace=True)
        elements_data = r['elements']
        player_names = {i['id']: i['web_name'] for i in elements_data}
        team_dict = {i['code']: i['name'] for i in teams}
        player_teams = {i['id']: team_dict[i['team_code']] for i in elements_data}
        # NOTE: fix_mikkel() already appends missing players, so the old
        # commented-out re-append of zero rows here (and the dead loop that
        # built them) was removed.
        df_final['Team'] = df_final['review_id'].map(player_teams)
        df_final['fpl_id'] = df_final['review_id']
        # replace (not map) so any unmatched id keeps its value, not NaN.
        df_final['Name'] = df_final['review_id'].replace(player_names)
        df_final.set_index('fpl_id', inplace=True)
        df_final.fillna(0, inplace=True)
        if target is not None:
            df_final.to_csv(target)
        return df_final
class Kiwi(DataSource):
    """Kiwi-format projections -> FPLReview format."""

    def __init__(self, source):
        super().__init__(source)

    def convert(self, target=None):
        """Reshape the Kiwi CSV into Review columns; write to ``target`` if given.

        Layout assumption (from the code): the first five columns are player
        info (including 'ID', 'Pos', 'Price'), gameweek xMins columns start
        at the first numeric header, and xPts columns follow the 'xPts'
        marker column.
        """
        df = pd.read_csv(self.source, index_col=0)
        df['id'] = df['ID']
        # Locate the first gameweek column: the first numeric header.
        pos = 0
        min_gw = 0
        for c in df.columns:
            try:
                min_gw = int(c)
                break
            # FIX: was a bare `except:`; only non-numeric headers are expected.
            except ValueError:
                pos += 1
                continue
        last_gw = 38
        col_count = last_gw - min_gw + 1
        # FIX: removed the original's no-op `df.iloc[:, pos:38-pos+1]`
        # statement -- its result was discarded.
        main_values = df.iloc[:, 0:5].copy()
        main_values['Pos'] = main_values['Pos'].str[0]
        main_values['Value'] = main_values['Price']
        xmin_values = df.iloc[:, pos:pos+col_count].copy()
        xmin_values = xmin_values.add_suffix("_xMins")
        # pandas renames duplicate headers '1' -> '1.1' etc., so strip the
        # '.N' suffix from the xPts block before tagging it.
        xp_start = list(df).index('xPts')
        xp_values = df.iloc[:, xp_start+1:xp_start+col_count+1]
        xp_values.columns = [i[0] for i in xp_values.columns.str.split('.')]
        xp_values = xp_values.add_suffix("_Pts")
        r = requests.get('https://fantasy.premierleague.com/api/bootstrap-static/').json()
        elements = r['elements']
        name_dict = {i['id']: i['web_name'] for i in elements}
        main_values['Name'] = main_values['ID'].map(name_dict)
        kiwi_data = pd.concat([main_values, xmin_values, xp_values], axis=1)
        kiwi_data['id'] = kiwi_data['ID']
        kiwi_data.set_index('ID', inplace=True)
        kiwi_data.sort_index(inplace=True)
        kiwi_data.reset_index(drop=True, inplace=True)
        if target:
            kiwi_data.to_csv(target)
        return kiwi_data
class Scout(DataSource):
    """Fantasy Football Scout projections -> FPLReview format.

    Scout rows carry full team names and player display names, so players
    are fuzzy-matched to FPL element IDs by name, team and position.
    """

    def __init__(self, source):
        super().__init__(source)

    def convert(self, target=None):
        """Match Scout rows to FPL IDs and rename columns to Review format;
        write to ``target`` if given."""
        raw_data = pd.read_csv(self.source, index_col=0)
        r = requests.get("https://fantasy.premierleague.com/api/bootstrap-static/")
        # Scout uses full team names; map them to FPL short codes.
        scout_team_dict = {
            'Man City': 'MCI',
            'Liverpool': 'LIV',
            'Tottenham': 'TOT',
            'Chelsea': 'CHE',
            'Man Utd': 'MUN',
            'Fulham': 'FUL',
            'Crystal Palace': 'CRY',
            'West Ham': 'WHU',
            'Leicester': 'LEI',
            'Arsenal': 'ARS',
            'Brentford': 'BRE',
            'Leeds': 'LEE',
            'Newcastle': 'NEW',
            'Everton': 'EVE',
            'Southampton': 'SOU',
            'Wolves': 'WOL',
            'Brighton': 'BHA',
            'Aston Villa': 'AVL',
            'Bournemouth': 'BOU',
            'Nottingham Forest': 'NFO'
        }
        raw_data['team_short'] = raw_data['Team'].map(scout_team_dict)
        raw_data['clean_name'] = raw_data['Player'].apply(fix_name_dialect)
        raw_data['Pos'] = raw_data['Pos'].replace({"GK": "G"})
        players = r.json()['elements']
        element_type_dict = {1: 'G', 2: 'D', 3: 'M', 4: 'F'}
        teams = r.json()['teams']
        team_code_dict = {i['code']: i for i in teams}
        player_names = [{
            'id': e['id'],
            'web_name': e['web_name'],
            'combined': e['first_name'] + ' ' + e['second_name'],
            'team': team_code_dict[e['team_code']]['short_name'],
            'position': element_type_dict[e['element_type']],
        } for e in players]
        # BUG FIX: the original used `target` as the loop variable here and
        # below, clobbering the `target` output-path parameter -- the final
        # `if target:` then saw a player dict and to_csv() failed.
        for candidate in player_names:
            candidate['wn'] = fix_name_dialect(candidate['web_name'])
            candidate['cn'] = fix_name_dialect(candidate['combined'])
        entries = []
        for player in raw_data.iloc:
            # Only consider FPL players with the same team and position.
            possible_matches = [i for i in player_names if i['team'] == player['team_short'] and i['position'] == player['Pos']]
            # BUG FIX: the original printed a warning but still called max()
            # on the empty list (ValueError); skip unmatched players instead.
            if len(possible_matches) == 0:
                print("No matches for ", player)
                continue
            p = player['clean_name']
            for candidate in possible_matches:
                candidate['wn_score'] = fuzz.token_set_ratio(p, candidate['wn'])
                candidate['cn_score'] = fuzz.token_set_ratio(p, candidate['cn'])
            best_match = max(possible_matches, key=get_best_score)
            entries.append({'player_input': player['Player'], 'team_input': player['team_short'], 'position_input': player['Pos'], **best_match})
        entries_df = pd.DataFrame(entries)
        # 'name @ team' keys map the match results back onto rows.
        entries_df['name_team'] = entries_df['player_input'] + ' @ ' + entries_df['team_input']
        entry_dict = entries_df.set_index('name_team')['id'].to_dict()
        raw_data['name_team'] = raw_data['Player'] + ' @ ' + raw_data['team_short']
        raw_data['review_id'] = raw_data['name_team'].map(entry_dict)
        # Unmatched players (no review_id) are dropped from the output.
        raw_data.dropna(subset=['review_id'], inplace=True)
        raw_data['review_id'] = raw_data['review_id'].astype(int)
        raw_data['Pos'] = raw_data['Pos'].str[0]
        raw_data['Value'] = raw_data['Price'].str.replace('m', '').astype(float)
        raw_data['Name'] = raw_data['Player']
        raw_data['pid'] = raw_data['review_id']

        def scout_fix_col_name(col):
            # 'GW7' -> '7_Pts'; 'GW7_xMin...' -> '7_xMins'; others unchanged.
            if 'GW' in col and '_' not in col:
                return col.split('GW')[1] + "_Pts"
            elif '_xMin' in col:
                return col.split('GW')[1] + 's'
            else:
                return col

        key_names = {i: scout_fix_col_name(i) for i in raw_data.keys()}
        raw_data.rename(columns=key_names, inplace=True)
        raw_data.set_index('pid', inplace=True)
        raw_data.sort_values(by='review_id', inplace=True)
        raw_data['id'] = raw_data['review_id']
        if target:
            raw_data.to_csv(target)
        return raw_data
class Fix(DataSource):
    """FPL Fix projections -> FPLReview format."""

    def __init__(self, source):
        super().__init__(source)

    def convert(self, target=None):
        """Rename Fix columns to Review format, add Pos and flat 90-minute
        projections, and return the summary; write to ``target`` if given."""
        raw_data = pd.read_csv(self.source, index_col=0)

        def _rename_column(col):
            # Map Fix's header names onto the Review-format ones.
            if "pts_" in col:
                return col.replace("pts_", "") + "_Pts"
            simple = {'price': 'Value', 'name': 'Name', 'id': 'fix_id', 'team': 'Team'}
            return simple.get(col, col)

        # NOTE(review): assumes Fix rows are ordered by FPL element id, so
        # id = row index + 1 -- confirm against the source file.
        raw_data['review_id'] = raw_data.index + 1
        # Fix provides no minutes projections; assume a full 90 every GW.
        for gw in range(1, 39):
            raw_data[f'{gw}_xMins'] = 90
        position_letters = {1: 'G', 2: 'D', 3: 'M', 4: 'F'}
        static_url = 'https://fantasy.premierleague.com/api/bootstrap-static/'
        r = requests.get(static_url).json()
        type_dict = {e['id']: position_letters[e['element_type']] for e in r['elements']}
        raw_data['Pos'] = raw_data['review_id'].map(type_dict)
        raw_data.rename(columns={c: _rename_column(c) for c in raw_data.keys()}, inplace=True)
        raw_data['id'] = raw_data['review_id'].astype(int)
        pts_cols = [c for c in raw_data.columns if '_Pts' in c]
        xmin_cols = [c for c in raw_data.columns if '_xMins' in c]
        summary_data = raw_data[['id', 'Name', 'Value', 'Pos', 'Team'] + pts_cols + xmin_cols].copy()
        summary_data.sort_values(by='id', inplace=True)
        if target:
            summary_data.to_csv(target)
        return summary_data
class Hub(DataSource):
    """Fantasy Football Hub projections -> FPLReview format."""

    def __init__(self, source):
        super().__init__(source)

    def convert(self, target=None):
        """Rename Hub columns to Review format and attach price/team info
        from the live FPL API; write to ``target`` if given."""
        raw_data = pd.read_csv(self.source, index_col=0)
        # Hub data already carries FPL ids.
        raw_data['review_id'] = raw_data['fpl_id']
        position_letters = {1: 'G', 2: 'D', 3: 'M', 4: 'F'}
        raw_data['Pos'] = raw_data['position_id'].map(position_letters)
        static_url = 'https://fantasy.premierleague.com/api/bootstrap-static/'
        r = requests.get(static_url).json()
        elements = r['elements']
        team_names = {t['code']: t['name'] for t in r['teams']}
        # Prices and team names come from the API, keyed by fpl_id.
        raw_data['Value'] = raw_data['fpl_id'].map({e['id']: e['now_cost'] / 10 for e in elements})
        raw_data['Team'] = raw_data['fpl_id'].map({e['id']: team_names[e['team_code']] for e in elements})

        def _rename_column(col):
            # '<gw>_pts' -> '<gw>_Pts', '<gw>_xmins' -> '<gw>_xMins',
            # 'player' -> 'Name'; everything else unchanged.
            if "_" in col:
                return col.replace("_pts", "_Pts").replace("_xmins", "_xMins")
            if col == 'player':
                return "Name"
            return col

        raw_data.rename(columns={c: _rename_column(c) for c in raw_data.keys()}, inplace=True)
        raw_data.set_index('fpl_id', inplace=True)
        raw_data['id'] = raw_data['review_id'].astype(int)
        raw_data.sort_values(by='id', inplace=True)
        if target:
            raw_data.to_csv(target)
        return raw_data
# Entry point intentionally empty: import this module and call the
# DataSource subclasses' convert() methods directly.
if __name__ == "__main__":
    pass
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment