Skip to content

Instantly share code, notes, and snippets.

@finlytics-hub
Created October 2, 2020 18:06
Show Gist options
  • Save finlytics-hub/18ca52500cebb77760798ae592629196 to your computer and use it in GitHub Desktop.
Save finlytics-hub/18ca52500cebb77760798ae592629196 to your computer and use it in GitHub Desktop.
ODI: Feature Engineering
# create a new Match Month column
final_data['Match Month'] = final_data['Match Date'].dt.month
## win/loss ratio
# instantiate an empty list
win_loss_ratio = []
# loop over the final_data DF for each playing Country in it
for c in final_data['Country'].unique():
# slice all matches with 'c' country in the 'Country' column
df1 = final_data[final_data['Country'] == c].copy()
# calculate the number of matches won by 'c' up to, but not including, the current match's result
df1['Wins Count'] = ((df1['Result'] == 'Won') & (df1['Country'] == c)).shift(fill_value = 0).cumsum()
# calculate the number of matches lost by 'c' up to, but not including, the current match's result
df1['Losses Count'] = ((df1['Result'] == 'Lost') & (df1['Country'] == c)).shift(fill_value = 0).cumsum()
# append the resulting df to the win_loss_ratio list
win_loss_ratio.append(df1)
# concatenate the list in to a df
final_data = pd.concat(win_loss_ratio)
# convert the 2 new columns to numeric
final_data['Wins Count'] = pd.to_numeric(final_data['Wins Count'], downcast = 'integer')
final_data['Losses Count'] = pd.to_numeric(final_data['Losses Count'], downcast = 'integer')
# sort
final_data.sort_values(by = 'Match Date', inplace = True, ignore_index = True)
# calculate Win/Loss Ratio and replace 'inf' with 0 ('inf' would be the result of DivisionByZero error)
final_data['Win/Loss Ratio'] = (final_data['Wins Count'] / final_data['Losses Count']).replace(to_replace = {np.inf: 0, -np.inf: 0, np.nan: 0})
# drop the now redundant 2 columns
final_data.drop(columns = ['Wins Count', 'Losses Count'], inplace = True)
# calculate batting/bowling averages and RPO
# work on a backup of the original DF
ODI_innings_for_averages = ODI_innings.copy()
# drop columns not required to calculate the 4 new features
ODI_innings_for_averages.drop(columns = ['Innings Player', 'Innings Runs Scored', 'Innings Minutes Batted', 'Innings Batted Flag', 'Innings Not Out Flag', 'Innings Boundary Fours', 'Innings Boundary Sixes', 'Innings Batting Strike Rate', 'Innings Number', '50\'s', '100\'s', 'Innings Runs Scored Buckets', 'Innings Bowled Flag', 'Innings Maidens Bowled', '4 Wickets', '5 Wickets', '10 Wickets', 'Innings Wickets Taken Buckets', 'Innings Economy Rate'], inplace = True)
# remove 'v' from the Opposition column
ODI_innings_for_averages['Opposition'] = ODI_innings_for_averages['Opposition'].str.replace('v ', '')
# replace certain other object dtypes data in the DF
ODI_innings_for_averages.replace(to_replace = {'-': 0, 'DNB': 0, 'TDNB': 0, 'sub': 0}, inplace = True)
# convert the relevant columns to numeric
col_to_convert = ['Innings Runs Scored Num', 'Innings Balls Faced', 'Innings Overs Bowled', 'Innings Runs Conceded', 'Innings Wickets Taken']
ODI_innings_for_averages[col_to_convert] = ODI_innings_for_averages[col_to_convert].apply(pd.to_numeric)
# convert 'Innings Date' to datetime
ODI_innings_for_averages['Innings Date'] = pd.to_datetime(ODI_innings_for_averages['Innings Date'])
# define a function to calculate wickets lost using info from 'Innings Wickets Taken'
def wkts_lost(country, wickets_taken):
# instantiate an empty dictionary
wkt_lost_dict = {}
# append the name of the country (which will in fact be the name in the Opposition column) to the Country key
wkt_lost_dict.setdefault('Country', []).append(country)
# append the integer in Innings Wickets Taken to Innings Wicket Lost key
wkt_lost_dict.setdefault('Innings Wickets Lost', []).append(wickets_taken)
# convert the dictionary to a DF
wkt_lost_df = pd.DataFrame(wkt_lost_dict)
# return the DF
return wkt_lost_df
# instantiate an empty list
wkts_taken_list = []
# next we loop over all records and store the wickets taken by the opposition to wickets lost by a team
for date in ODI_innings_for_averages['Innings Date'].unique():
# select all records for this date
df1 = ODI_innings_for_averages[ODI_innings_for_averages['Innings Date'] == date]
# group by on Country and Opposition and summ all the other columns
grouped = df1.groupby(by = ['Country', 'Opposition']).sum().reset_index()
# list comprehension to apply our wkts_lost function on each row of the grouped dataframe. Then convert to a dataframe through pd.concat
result = pd.concat([wkts_lost(x, y) for x, y in zip(grouped['Opposition'], grouped['Innings Wickets Taken'])])
# merge the resultant DF with the grouped dataframe
grouped = grouped.merge(result, how = 'left', left_on = 'Country', right_on = 'Country')
# add a new date column
grouped['Match Date'] = date
# append to the list
wkts_taken_list.append(grouped)
# concatenate the list of dataframes into a single dataframe
wkts_taken_df = pd.concat(wkts_taken_list).sort_values('Match Date', ignore_index = True)
# instantiate an empty list to hold the average statistics
ODI_team_averages_list = []
# loop over each unique Country
for country in wkts_taken_df['Country'].unique():
# slice on the country
df1 = wkts_taken_df[wkts_taken_df['Country'] == country].copy()
# calculate the cumulative runs scored up to, but not including the current date
df1['Cumulative Runs Scored'] = df1['Innings Runs Scored Num'].shift(fill_value = 0).cumsum()
# calculate the cumulative wickets lost up to, but not including the current date
df1['Cumulative Wickets Lost'] = df1['Innings Wickets Lost'].shift(fill_value = 0).cumsum()
# calculate the cumulative overs faced up to, but not including the current date. convert balls faced to overs faced first by dividing by 6 (each over is of 6 balls)
df1['Cumulative Overs Faced'] = df1['Innings Balls Faced'].shift(fill_value = 0).cumsum() / 6
# calculate the cumulative runs conceded up to, but not including the current date
df1['Cumulative Runs Conceded'] = df1['Innings Runs Conceded'].shift(fill_value = 0).cumsum()
# calculate the cumulative overs bowled up to, but not including the current date
df1['Cumulative Overs Bowled'] = (df1['Innings Overs Bowled']).shift(fill_value = 0).cumsum()
# calculate the cumulative wickets taken up to, but not including the current date
df1['Cumulative Wickets Taken'] = (df1['Innings Wickets Taken']).shift(fill_value = 0).cumsum()
# append to the list
ODI_team_averages_list.append(df1)
# concatenate the list of DFs and sort by Match Date
ODI_team_averages_df = pd.concat(ODI_team_averages_list).sort_values('Match Date', ignore_index = True)
# calculate the 4 new features
ODI_team_averages_df['batting average'] = ODI_team_averages_df['Cumulative Runs Scored'] / ODI_team_averages_df['Cumulative Wickets Lost']
ODI_team_averages_df['bowling average'] = ODI_team_averages_df['Cumulative Runs Conceded'] / ODI_team_averages_df['Cumulative Wickets Taken']
ODI_team_averages_df['batting RPO'] = ODI_team_averages_df['Cumulative Runs Scored'] / ODI_team_averages_df['Cumulative Overs Faced']
ODI_team_averages_df['bowling RPO'] = ODI_team_averages_df['Cumulative Runs Conceded'] / ODI_team_averages_df['Cumulative Overs Bowled']
# replace empty values with 0
ODI_team_averages_df.replace(np.nan, 0, inplace = True)
# drop the redundant columns to retain only the 4 new features (averages & RPO), Country and Match Date
ODI_team_averages_df.drop(columns = ['Opposition', 'Innings Runs Scored Num', 'Innings Balls Faced', 'Innings Overs Bowled', 'Innings Runs Conceded', 'Innings Wickets Taken', 'Innings Wickets Lost', 'Cumulative Runs Scored', 'Cumulative Wickets Lost', 'Cumulative Overs Faced', 'Cumulative Runs Conceded', 'Cumulative Overs Bowled', 'Cumulative Wickets Taken'], inplace = True)
# merge these 4 features in our main final_data DF
final_data = pd.merge(final_data, ODI_team_averages_df, how = 'inner', on = ['Country', 'Match Date'])
# Toss Won By
# rename the column
final_data.rename(columns = {'Toss Won By': 'Toss Won?'}, inplace = True)
# replace the country name in this column by Yes or No based on whether the column had the same name as in the Country column or not
final_data['Toss Won?'] = np.where(final_data['Toss Won?'] == final_data['Country'], 'Yes', 'No')
# Player Rankings
# loop over all row indices that are not null in the 'Country_Player_12' column
for idx in final_data[(~(final_data['Country_Player_12'].isnull()))].index:
final_data.iloc[idx, final_data.columns.get_loc('Country_Bowling Rank_11')] = final_data.iloc[idx, final_data.columns.get_loc('Country_Bowling Rank_1'):final_data.columns.get_loc('Country_Bowling Rank_12')].mean()
final_data.iloc[idx, final_data.columns.get_loc('Opposition_Bowling Rank_11')] = final_data.iloc[idx, final_data.columns.get_loc('Opposition_Bowling Rank_1'):final_data.columns.get_loc('Opposition_Bowling Rank_12')].mean()
final_data.iloc[idx, final_data.columns.get_loc('Country_Batting Rank_11')] = final_data.iloc[idx, final_data.columns.get_loc('Country_Batting Rank_1'):final_data.columns.get_loc('Country_Batting Rank_12')].mean()
final_data.iloc[idx, final_data.columns.get_loc('Opposition_Batting Rank_11')] = final_data.iloc[idx, final_data.columns.get_loc('Opposition_Batting Rank_1'):final_data.columns.get_loc('Opposition_Batting Rank_12')].mean()
# Drop the 4 ranking columns for the 12th player
final_data.drop(columns = ['Country_Bowling Rank_12', 'Country_Batting Rank_12', 'Opposition_Bowling Rank_12', 'Opposition_Batting Rank_12'], inplace = True)
# fill missing ranks with 101
final_data.fillna(101, inplace = True)
# Create four new features equal to the sum of the relevant rankings
final_data['Country Total Bowling Rank'] = final_data[['Country_Bowling Rank_1', 'Country_Bowling Rank_2', 'Country_Bowling Rank_3', 'Country_Bowling Rank_4', 'Country_Bowling Rank_5', 'Country_Bowling Rank_6', 'Country_Bowling Rank_7', 'Country_Bowling Rank_8', 'Country_Bowling Rank_9', 'Country_Bowling Rank_10', 'Country_Bowling Rank_11']].sum(axis = 1)
final_data['Country Total Batting Rank'] = final_data[['Country_Batting Rank_1', 'Country_Batting Rank_2', 'Country_Batting Rank_3', 'Country_Batting Rank_4', 'Country_Batting Rank_5', 'Country_Batting Rank_6', 'Country_Batting Rank_7', 'Country_Batting Rank_8', 'Country_Batting Rank_9', 'Country_Batting Rank_10', 'Country_Batting Rank_11']].sum(axis = 1)
final_data['Opposition Total Bowling Rank'] = final_data[['Opposition_Bowling Rank_1', 'Opposition_Bowling Rank_2', 'Opposition_Bowling Rank_3', 'Opposition_Bowling Rank_4', 'Opposition_Bowling Rank_5', 'Opposition_Bowling Rank_6', 'Opposition_Bowling Rank_7', 'Opposition_Bowling Rank_8', 'Opposition_Bowling Rank_9', 'Opposition_Bowling Rank_10', 'Opposition_Bowling Rank_11']].sum(axis = 1)
final_data['Opposition Total Batting Rank'] = final_data[['Opposition_Batting Rank_1', 'Opposition_Batting Rank_2', 'Opposition_Batting Rank_3', 'Opposition_Batting Rank_4', 'Opposition_Batting Rank_5', 'Opposition_Batting Rank_6', 'Opposition_Batting Rank_7', 'Opposition_Batting Rank_8', 'Opposition_Batting Rank_9', 'Opposition_Batting Rank_10', 'Opposition_Batting Rank_11']].sum(axis = 1)
# Create four new features equal to the mean of the relevant rankings
final_data['Country Average Bowling Rank'] = final_data[['Country_Bowling Rank_1', 'Country_Bowling Rank_2', 'Country_Bowling Rank_3', 'Country_Bowling Rank_4', 'Country_Bowling Rank_5', 'Country_Bowling Rank_6', 'Country_Bowling Rank_7', 'Country_Bowling Rank_8', 'Country_Bowling Rank_9', 'Country_Bowling Rank_10', 'Country_Bowling Rank_11']].mean(axis = 1)
final_data['Country Average Batting Rank'] = final_data[['Country_Batting Rank_1', 'Country_Batting Rank_2', 'Country_Batting Rank_3', 'Country_Batting Rank_4', 'Country_Batting Rank_5', 'Country_Batting Rank_6', 'Country_Batting Rank_7', 'Country_Batting Rank_8', 'Country_Batting Rank_9', 'Country_Batting Rank_10', 'Country_Batting Rank_11']].mean(axis = 1)
final_data['Opposition Average Bowling Rank'] = final_data[['Opposition_Bowling Rank_1', 'Opposition_Bowling Rank_2', 'Opposition_Bowling Rank_3', 'Opposition_Bowling Rank_4', 'Opposition_Bowling Rank_5', 'Opposition_Bowling Rank_6', 'Opposition_Bowling Rank_7', 'Opposition_Bowling Rank_8', 'Opposition_Bowling Rank_9', 'Opposition_Bowling Rank_10', 'Opposition_Bowling Rank_11']].mean(axis = 1)
final_data['Opposition Average Batting Rank'] = final_data[['Opposition_Batting Rank_1', 'Opposition_Batting Rank_2', 'Opposition_Batting Rank_3', 'Opposition_Batting Rank_4', 'Opposition_Batting Rank_5', 'Opposition_Batting Rank_6', 'Opposition_Batting Rank_7', 'Opposition_Batting Rank_8', 'Opposition_Batting Rank_9', 'Opposition_Batting Rank_10', 'Opposition_Batting Rank_11']].mean(axis = 1)
# Create four new features equal to the median of the relevant rankings
final_data['Country Median Bowling Rank'] = final_data[['Country_Bowling Rank_1', 'Country_Bowling Rank_2', 'Country_Bowling Rank_3', 'Country_Bowling Rank_4', 'Country_Bowling Rank_5', 'Country_Bowling Rank_6', 'Country_Bowling Rank_7', 'Country_Bowling Rank_8', 'Country_Bowling Rank_9', 'Country_Bowling Rank_10', 'Country_Bowling Rank_11']].median(axis = 1)
final_data['Country Median Batting Rank'] = final_data[['Country_Batting Rank_1', 'Country_Batting Rank_2', 'Country_Batting Rank_3', 'Country_Batting Rank_4', 'Country_Batting Rank_5', 'Country_Batting Rank_6', 'Country_Batting Rank_7', 'Country_Batting Rank_8', 'Country_Batting Rank_9', 'Country_Batting Rank_10', 'Country_Batting Rank_11']].median(axis = 1)
final_data['Opposition Median Bowling Rank'] = final_data[['Opposition_Bowling Rank_1', 'Opposition_Bowling Rank_2', 'Opposition_Bowling Rank_3', 'Opposition_Bowling Rank_4', 'Opposition_Bowling Rank_5', 'Opposition_Bowling Rank_6', 'Opposition_Bowling Rank_7', 'Opposition_Bowling Rank_8', 'Opposition_Bowling Rank_9', 'Opposition_Bowling Rank_10', 'Opposition_Bowling Rank_11']].median(axis = 1)
final_data['Opposition Median Batting Rank'] = final_data[['Opposition_Batting Rank_1', 'Opposition_Batting Rank_2', 'Opposition_Batting Rank_3', 'Opposition_Batting Rank_4', 'Opposition_Batting Rank_5', 'Opposition_Batting Rank_6', 'Opposition_Batting Rank_7', 'Opposition_Batting Rank_8', 'Opposition_Batting Rank_9', 'Opposition_Batting Rank_10', 'Opposition_Batting Rank_11']].median(axis = 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment