Created
October 2, 2020 18:06
-
-
Save finlytics-hub/18ca52500cebb77760798ae592629196 to your computer and use it in GitHub Desktop.
ODI: Feature Engineering
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# create a new Match Month column | |
final_data['Match Month'] = final_data['Match Date'].dt.month | |
## win/loss ratio | |
# instantiate an empty list | |
win_loss_ratio = [] | |
# loop over the final_data DF for each playing Country in it | |
for c in final_data['Country'].unique(): | |
# slice all matches with 'c' country in the 'Country' column | |
df1 = final_data[final_data['Country'] == c].copy() | |
# calculate the number of matches won by 'c' up to, but not including, the current match's result | |
df1['Wins Count'] = ((df1['Result'] == 'Won') & (df1['Country'] == c)).shift(fill_value = 0).cumsum() | |
# calculate the number of matches lost by 'c' up to, but not including, the current match's result | |
df1['Losses Count'] = ((df1['Result'] == 'Lost') & (df1['Country'] == c)).shift(fill_value = 0).cumsum() | |
# append the resulting df to the win_loss_ratio list | |
win_loss_ratio.append(df1) | |
# concatenate the list in to a df | |
final_data = pd.concat(win_loss_ratio) | |
# convert the 2 new columns to numeric | |
final_data['Wins Count'] = pd.to_numeric(final_data['Wins Count'], downcast = 'integer') | |
final_data['Losses Count'] = pd.to_numeric(final_data['Losses Count'], downcast = 'integer') | |
# sort | |
final_data.sort_values(by = 'Match Date', inplace = True, ignore_index = True) | |
# calculate Win/Loss Ratio and replace 'inf' with 0 ('inf' would be the result of DivisionByZero error) | |
final_data['Win/Loss Ratio'] = (final_data['Wins Count'] / final_data['Losses Count']).replace(to_replace = {np.inf: 0, -np.inf: 0, np.nan: 0}) | |
# drop the now redundant 2 columns | |
final_data.drop(columns = ['Wins Count', 'Losses Count'], inplace = True) | |
# calculate batting/bowling averages and RPO | |
# work on a backup of the original DF | |
ODI_innings_for_averages = ODI_innings.copy() | |
# drop columns not required to calculate the 4 new features | |
ODI_innings_for_averages.drop(columns = ['Innings Player', 'Innings Runs Scored', 'Innings Minutes Batted', 'Innings Batted Flag', 'Innings Not Out Flag', 'Innings Boundary Fours', 'Innings Boundary Sixes', 'Innings Batting Strike Rate', 'Innings Number', '50\'s', '100\'s', 'Innings Runs Scored Buckets', 'Innings Bowled Flag', 'Innings Maidens Bowled', '4 Wickets', '5 Wickets', '10 Wickets', 'Innings Wickets Taken Buckets', 'Innings Economy Rate'], inplace = True) | |
# remove 'v' from the Opposition column | |
ODI_innings_for_averages['Opposition'] = ODI_innings_for_averages['Opposition'].str.replace('v ', '') | |
# replace certain other object dtypes data in the DF | |
ODI_innings_for_averages.replace(to_replace = {'-': 0, 'DNB': 0, 'TDNB': 0, 'sub': 0}, inplace = True) | |
# convert the relevant columns to numeric | |
col_to_convert = ['Innings Runs Scored Num', 'Innings Balls Faced', 'Innings Overs Bowled', 'Innings Runs Conceded', 'Innings Wickets Taken'] | |
ODI_innings_for_averages[col_to_convert] = ODI_innings_for_averages[col_to_convert].apply(pd.to_numeric) | |
# convert 'Innings Date' to datetime | |
ODI_innings_for_averages['Innings Date'] = pd.to_datetime(ODI_innings_for_averages['Innings Date']) | |
# define a function to calculate wickets lost using info from 'Innings Wickets Taken' | |
def wkts_lost(country, wickets_taken): | |
# instantiate an empty dictionary | |
wkt_lost_dict = {} | |
# append the name of the country (which will in fact be the name in the Opposition column) to the Country key | |
wkt_lost_dict.setdefault('Country', []).append(country) | |
# append the integer in Innings Wickets Taken to Innings Wicket Lost key | |
wkt_lost_dict.setdefault('Innings Wickets Lost', []).append(wickets_taken) | |
# convert the dictionary to a DF | |
wkt_lost_df = pd.DataFrame(wkt_lost_dict) | |
# return the DF | |
return wkt_lost_df | |
# instantiate an empty list | |
wkts_taken_list = [] | |
# next we loop over all records and store the wickets taken by the opposition to wickets lost by a team | |
for date in ODI_innings_for_averages['Innings Date'].unique(): | |
# select all records for this date | |
df1 = ODI_innings_for_averages[ODI_innings_for_averages['Innings Date'] == date] | |
# group by on Country and Opposition and summ all the other columns | |
grouped = df1.groupby(by = ['Country', 'Opposition']).sum().reset_index() | |
# list comprehension to apply our wkts_lost function on each row of the grouped dataframe. Then convert to a dataframe through pd.concat | |
result = pd.concat([wkts_lost(x, y) for x, y in zip(grouped['Opposition'], grouped['Innings Wickets Taken'])]) | |
# merge the resultant DF with the grouped dataframe | |
grouped = grouped.merge(result, how = 'left', left_on = 'Country', right_on = 'Country') | |
# add a new date column | |
grouped['Match Date'] = date | |
# append to the list | |
wkts_taken_list.append(grouped) | |
# concatenate the list of dataframes into a single dataframe | |
wkts_taken_df = pd.concat(wkts_taken_list).sort_values('Match Date', ignore_index = True) | |
# instantiate an empty list to hold the average statistics | |
ODI_team_averages_list = [] | |
# loop over each unique Country | |
for country in wkts_taken_df['Country'].unique(): | |
# slice on the country | |
df1 = wkts_taken_df[wkts_taken_df['Country'] == country].copy() | |
# calculate the cumulative runs scored up to, but not including the current date | |
df1['Cumulative Runs Scored'] = df1['Innings Runs Scored Num'].shift(fill_value = 0).cumsum() | |
# calculate the cumulative wickets lost up to, but not including the current date | |
df1['Cumulative Wickets Lost'] = df1['Innings Wickets Lost'].shift(fill_value = 0).cumsum() | |
# calculate the cumulative overs faced up to, but not including the current date. convert balls faced to overs faced first by dividing by 6 (each over is of 6 balls) | |
df1['Cumulative Overs Faced'] = df1['Innings Balls Faced'].shift(fill_value = 0).cumsum() / 6 | |
# calculate the cumulative runs conceded up to, but not including the current date | |
df1['Cumulative Runs Conceded'] = df1['Innings Runs Conceded'].shift(fill_value = 0).cumsum() | |
# calculate the cumulative overs bowled up to, but not including the current date | |
df1['Cumulative Overs Bowled'] = (df1['Innings Overs Bowled']).shift(fill_value = 0).cumsum() | |
# calculate the cumulative wickets taken up to, but not including the current date | |
df1['Cumulative Wickets Taken'] = (df1['Innings Wickets Taken']).shift(fill_value = 0).cumsum() | |
# append to the list | |
ODI_team_averages_list.append(df1) | |
# concatenate the list of DFs and sort by Match Date | |
ODI_team_averages_df = pd.concat(ODI_team_averages_list).sort_values('Match Date', ignore_index = True) | |
# calculate the 4 new features | |
ODI_team_averages_df['batting average'] = ODI_team_averages_df['Cumulative Runs Scored'] / ODI_team_averages_df['Cumulative Wickets Lost'] | |
ODI_team_averages_df['bowling average'] = ODI_team_averages_df['Cumulative Runs Conceded'] / ODI_team_averages_df['Cumulative Wickets Taken'] | |
ODI_team_averages_df['batting RPO'] = ODI_team_averages_df['Cumulative Runs Scored'] / ODI_team_averages_df['Cumulative Overs Faced'] | |
ODI_team_averages_df['bowling RPO'] = ODI_team_averages_df['Cumulative Runs Conceded'] / ODI_team_averages_df['Cumulative Overs Bowled'] | |
# replace empty values with 0 | |
ODI_team_averages_df.replace(np.nan, 0, inplace = True) | |
# drop the redundant columns to retain only the 4 new features (averages & RPO), Country and Match Date | |
ODI_team_averages_df.drop(columns = ['Opposition', 'Innings Runs Scored Num', 'Innings Balls Faced', 'Innings Overs Bowled', 'Innings Runs Conceded', 'Innings Wickets Taken', 'Innings Wickets Lost', 'Cumulative Runs Scored', 'Cumulative Wickets Lost', 'Cumulative Overs Faced', 'Cumulative Runs Conceded', 'Cumulative Overs Bowled', 'Cumulative Wickets Taken'], inplace = True) | |
# merge these 4 features in our main final_data DF | |
final_data = pd.merge(final_data, ODI_team_averages_df, how = 'inner', on = ['Country', 'Match Date']) | |
# Toss Won By | |
# rename the column | |
final_data.rename(columns = {'Toss Won By': 'Toss Won?'}, inplace = True) | |
# replace the country name in this column by Yes or No based on whether the column had the same name as in the Country column or not | |
final_data['Toss Won?'] = np.where(final_data['Toss Won?'] == final_data['Country'], 'Yes', 'No') | |
# Player Rankings | |
# loop over all row indices that are not null in the 'Country_Player_12' column | |
for idx in final_data[(~(final_data['Country_Player_12'].isnull()))].index: | |
final_data.iloc[idx, final_data.columns.get_loc('Country_Bowling Rank_11')] = final_data.iloc[idx, final_data.columns.get_loc('Country_Bowling Rank_1'):final_data.columns.get_loc('Country_Bowling Rank_12')].mean() | |
final_data.iloc[idx, final_data.columns.get_loc('Opposition_Bowling Rank_11')] = final_data.iloc[idx, final_data.columns.get_loc('Opposition_Bowling Rank_1'):final_data.columns.get_loc('Opposition_Bowling Rank_12')].mean() | |
final_data.iloc[idx, final_data.columns.get_loc('Country_Batting Rank_11')] = final_data.iloc[idx, final_data.columns.get_loc('Country_Batting Rank_1'):final_data.columns.get_loc('Country_Batting Rank_12')].mean() | |
final_data.iloc[idx, final_data.columns.get_loc('Opposition_Batting Rank_11')] = final_data.iloc[idx, final_data.columns.get_loc('Opposition_Batting Rank_1'):final_data.columns.get_loc('Opposition_Batting Rank_12')].mean() | |
# Drop the 4 ranking columns for the 12th player | |
final_data.drop(columns = ['Country_Bowling Rank_12', 'Country_Batting Rank_12', 'Opposition_Bowling Rank_12', 'Opposition_Batting Rank_12'], inplace = True) | |
# fill missing ranks with 101 | |
final_data.fillna(101, inplace = True) | |
# Create four new features equal to the sum of the relevant rankings | |
final_data['Country Total Bowling Rank'] = final_data[['Country_Bowling Rank_1', 'Country_Bowling Rank_2', 'Country_Bowling Rank_3', 'Country_Bowling Rank_4', 'Country_Bowling Rank_5', 'Country_Bowling Rank_6', 'Country_Bowling Rank_7', 'Country_Bowling Rank_8', 'Country_Bowling Rank_9', 'Country_Bowling Rank_10', 'Country_Bowling Rank_11']].sum(axis = 1) | |
final_data['Country Total Batting Rank'] = final_data[['Country_Batting Rank_1', 'Country_Batting Rank_2', 'Country_Batting Rank_3', 'Country_Batting Rank_4', 'Country_Batting Rank_5', 'Country_Batting Rank_6', 'Country_Batting Rank_7', 'Country_Batting Rank_8', 'Country_Batting Rank_9', 'Country_Batting Rank_10', 'Country_Batting Rank_11']].sum(axis = 1) | |
final_data['Opposition Total Bowling Rank'] = final_data[['Opposition_Bowling Rank_1', 'Opposition_Bowling Rank_2', 'Opposition_Bowling Rank_3', 'Opposition_Bowling Rank_4', 'Opposition_Bowling Rank_5', 'Opposition_Bowling Rank_6', 'Opposition_Bowling Rank_7', 'Opposition_Bowling Rank_8', 'Opposition_Bowling Rank_9', 'Opposition_Bowling Rank_10', 'Opposition_Bowling Rank_11']].sum(axis = 1) | |
final_data['Opposition Total Batting Rank'] = final_data[['Opposition_Batting Rank_1', 'Opposition_Batting Rank_2', 'Opposition_Batting Rank_3', 'Opposition_Batting Rank_4', 'Opposition_Batting Rank_5', 'Opposition_Batting Rank_6', 'Opposition_Batting Rank_7', 'Opposition_Batting Rank_8', 'Opposition_Batting Rank_9', 'Opposition_Batting Rank_10', 'Opposition_Batting Rank_11']].sum(axis = 1) | |
# Create four new features equal to the mean of the relevant rankings | |
final_data['Country Average Bowling Rank'] = final_data[['Country_Bowling Rank_1', 'Country_Bowling Rank_2', 'Country_Bowling Rank_3', 'Country_Bowling Rank_4', 'Country_Bowling Rank_5', 'Country_Bowling Rank_6', 'Country_Bowling Rank_7', 'Country_Bowling Rank_8', 'Country_Bowling Rank_9', 'Country_Bowling Rank_10', 'Country_Bowling Rank_11']].mean(axis = 1) | |
final_data['Country Average Batting Rank'] = final_data[['Country_Batting Rank_1', 'Country_Batting Rank_2', 'Country_Batting Rank_3', 'Country_Batting Rank_4', 'Country_Batting Rank_5', 'Country_Batting Rank_6', 'Country_Batting Rank_7', 'Country_Batting Rank_8', 'Country_Batting Rank_9', 'Country_Batting Rank_10', 'Country_Batting Rank_11']].mean(axis = 1) | |
final_data['Opposition Average Bowling Rank'] = final_data[['Opposition_Bowling Rank_1', 'Opposition_Bowling Rank_2', 'Opposition_Bowling Rank_3', 'Opposition_Bowling Rank_4', 'Opposition_Bowling Rank_5', 'Opposition_Bowling Rank_6', 'Opposition_Bowling Rank_7', 'Opposition_Bowling Rank_8', 'Opposition_Bowling Rank_9', 'Opposition_Bowling Rank_10', 'Opposition_Bowling Rank_11']].mean(axis = 1) | |
final_data['Opposition Average Batting Rank'] = final_data[['Opposition_Batting Rank_1', 'Opposition_Batting Rank_2', 'Opposition_Batting Rank_3', 'Opposition_Batting Rank_4', 'Opposition_Batting Rank_5', 'Opposition_Batting Rank_6', 'Opposition_Batting Rank_7', 'Opposition_Batting Rank_8', 'Opposition_Batting Rank_9', 'Opposition_Batting Rank_10', 'Opposition_Batting Rank_11']].mean(axis = 1) | |
# Create four new features equal to the median of the relevant rankings | |
final_data['Country Median Bowling Rank'] = final_data[['Country_Bowling Rank_1', 'Country_Bowling Rank_2', 'Country_Bowling Rank_3', 'Country_Bowling Rank_4', 'Country_Bowling Rank_5', 'Country_Bowling Rank_6', 'Country_Bowling Rank_7', 'Country_Bowling Rank_8', 'Country_Bowling Rank_9', 'Country_Bowling Rank_10', 'Country_Bowling Rank_11']].median(axis = 1) | |
final_data['Country Median Batting Rank'] = final_data[['Country_Batting Rank_1', 'Country_Batting Rank_2', 'Country_Batting Rank_3', 'Country_Batting Rank_4', 'Country_Batting Rank_5', 'Country_Batting Rank_6', 'Country_Batting Rank_7', 'Country_Batting Rank_8', 'Country_Batting Rank_9', 'Country_Batting Rank_10', 'Country_Batting Rank_11']].median(axis = 1) | |
final_data['Opposition Median Bowling Rank'] = final_data[['Opposition_Bowling Rank_1', 'Opposition_Bowling Rank_2', 'Opposition_Bowling Rank_3', 'Opposition_Bowling Rank_4', 'Opposition_Bowling Rank_5', 'Opposition_Bowling Rank_6', 'Opposition_Bowling Rank_7', 'Opposition_Bowling Rank_8', 'Opposition_Bowling Rank_9', 'Opposition_Bowling Rank_10', 'Opposition_Bowling Rank_11']].median(axis = 1) | |
final_data['Opposition Median Batting Rank'] = final_data[['Opposition_Batting Rank_1', 'Opposition_Batting Rank_2', 'Opposition_Batting Rank_3', 'Opposition_Batting Rank_4', 'Opposition_Batting Rank_5', 'Opposition_Batting Rank_6', 'Opposition_Batting Rank_7', 'Opposition_Batting Rank_8', 'Opposition_Batting Rank_9', 'Opposition_Batting Rank_10', 'Opposition_Batting Rank_11']].median(axis = 1) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment