finlytics-hub/ODI_feature_engineering.py

## ODI_feature_engineering.py
# create a new Match Month column
final_data['Match Month'] = final_data['Match Date'].dt.month


## win/loss ratio
# instantiate an empty list
win_loss_ratio = []
# loop over the final_data DF for each playing Country in it
for c in final_data['Country'].unique():
    # slice all matches with 'c' country in the 'Country' column
    df1 = final_data[final_data['Country'] == c].copy()
    # calculate the number of matches won by 'c' up to, but not including, the current match's result
    df1['Wins Count'] = ((df1['Result'] == 'Won') & (df1['Country'] == c)).shift(fill_value = 0).cumsum()
    # calculate the number of matches lost by 'c' up to, but not including, the current match's result
    df1['Losses Count'] = ((df1['Result'] == 'Lost') & (df1['Country'] == c)).shift(fill_value = 0).cumsum()
    # append the resulting df to the win_loss_ratio list
    win_loss_ratio.append(df1)

# concatenate the list in to a df
final_data = pd.concat(win_loss_ratio)
# convert the 2 new columns to numeric
final_data['Wins Count'] = pd.to_numeric(final_data['Wins Count'], downcast = 'integer')
final_data['Losses Count'] = pd.to_numeric(final_data['Losses Count'], downcast = 'integer')
# sort
final_data.sort_values(by = 'Match Date', inplace = True, ignore_index = True)
# calculate Win/Loss Ratio and replace 'inf' with 0 ('inf' would be the result of DivisionByZero error)
final_data['Win/Loss Ratio'] = (final_data['Wins Count'] / final_data['Losses Count']).replace(to_replace = {np.inf: 0, -np.inf: 0, np.nan: 0})
# drop the now redundant 2 columns
final_data.drop(columns = ['Wins Count', 'Losses Count'], inplace = True)


# calculate batting/bowling averages and RPO
# work on a backup of the original DF
ODI_innings_for_averages = ODI_innings.copy()
# drop columns not required to calculate the 4 new features
ODI_innings_for_averages.drop(columns = ['Innings Player', 'Innings Runs Scored', 'Innings Minutes Batted', 'Innings Batted Flag', 'Innings Not Out Flag', 'Innings Boundary Fours', 'Innings Boundary Sixes', 'Innings Batting Strike Rate', 'Innings Number', '50\'s', '100\'s', 'Innings Runs Scored Buckets', 'Innings Bowled Flag', 'Innings Maidens Bowled', '4 Wickets', '5 Wickets', '10 Wickets', 'Innings Wickets Taken Buckets', 'Innings Economy Rate'], inplace = True)
# remove 'v' from the Opposition column
ODI_innings_for_averages['Opposition'] = ODI_innings_for_averages['Opposition'].str.replace('v ', '')
# replace certain other object dtypes data in the DF
ODI_innings_for_averages.replace(to_replace = {'-': 0, 'DNB': 0, 'TDNB': 0, 'sub': 0}, inplace = True)
# convert the relevant columns to numeric
col_to_convert = ['Innings Runs Scored Num', 'Innings Balls Faced', 'Innings Overs Bowled', 'Innings Runs Conceded', 'Innings Wickets Taken']
ODI_innings_for_averages[col_to_convert] = ODI_innings_for_averages[col_to_convert].apply(pd.to_numeric)
# convert 'Innings Date' to datetime
ODI_innings_for_averages['Innings Date'] = pd.to_datetime(ODI_innings_for_averages['Innings Date'])

# define a function to calculate wickets lost using info from 'Innings Wickets Taken'
def wkts_lost(country, wickets_taken):
    # instantiate an empty dictionary
    wkt_lost_dict = {}
    # append the name of the country (which will in fact be the name in the Opposition column) to the Country key
    wkt_lost_dict.setdefault('Country', []).append(country)
    # append the integer in Innings Wickets Taken to Innings Wicket Lost key
    wkt_lost_dict.setdefault('Innings Wickets Lost', []).append(wickets_taken)
    # convert the dictionary to a DF
    wkt_lost_df = pd.DataFrame(wkt_lost_dict)
    # return the DF
    return wkt_lost_df

# instantiate an empty list
wkts_taken_list = []

# next we loop over all records and store the wickets taken by the opposition to wickets lost by a team
for date in ODI_innings_for_averages['Innings Date'].unique():
    # select all records for this date
    df1 = ODI_innings_for_averages[ODI_innings_for_averages['Innings Date'] == date]
    # group by on Country and Opposition and summ all the other columns
    grouped = df1.groupby(by = ['Country', 'Opposition']).sum().reset_index()
    # list comprehension to apply our wkts_lost function on each row of the grouped dataframe. Then convert to a dataframe through pd.concat
    result = pd.concat([wkts_lost(x, y) for x, y in zip(grouped['Opposition'], grouped['Innings Wickets Taken'])])
    # merge the resultant DF with the grouped dataframe
    grouped = grouped.merge(result, how = 'left', left_on = 'Country', right_on = 'Country')
    # add a new date column
    grouped['Match Date'] = date
    # append to the list
    wkts_taken_list.append(grouped)

# concatenate the list of dataframes into a single dataframe
wkts_taken_df = pd.concat(wkts_taken_list).sort_values('Match Date', ignore_index = True)

# instantiate an empty list to hold the average statistics
ODI_team_averages_list = []

# loop over each unique Country
for country in wkts_taken_df['Country'].unique():
    # slice on the country
    df1 = wkts_taken_df[wkts_taken_df['Country'] == country].copy()
    # calculate the cumulative runs scored up to, but not including the current date
    df1['Cumulative Runs Scored'] = df1['Innings Runs Scored Num'].shift(fill_value = 0).cumsum()
    # calculate the cumulative wickets lost up to, but not including the current date
    df1['Cumulative Wickets Lost'] = df1['Innings Wickets Lost'].shift(fill_value = 0).cumsum()
    # calculate the cumulative overs faced up to, but not including the current date. convert balls faced to overs faced first by dividing by 6 (each over is of 6 balls)
    df1['Cumulative Overs Faced'] = df1['Innings Balls Faced'].shift(fill_value = 0).cumsum() / 6
    # calculate the cumulative runs conceded up to, but not including the current date
    df1['Cumulative Runs Conceded'] = df1['Innings Runs Conceded'].shift(fill_value = 0).cumsum()
    # calculate the cumulative overs bowled up to, but not including the current date
    df1['Cumulative Overs Bowled'] = (df1['Innings Overs Bowled']).shift(fill_value = 0).cumsum()
    # calculate the cumulative wickets taken up to, but not including the current date
    df1['Cumulative Wickets Taken'] = (df1['Innings Wickets Taken']).shift(fill_value = 0).cumsum()
    # append to the list
    ODI_team_averages_list.append(df1)

# concatenate the list of DFs and sort by Match Date
ODI_team_averages_df = pd.concat(ODI_team_averages_list).sort_values('Match Date', ignore_index = True)
# calculate the 4 new features
ODI_team_averages_df['batting average'] = ODI_team_averages_df['Cumulative Runs Scored'] / ODI_team_averages_df['Cumulative Wickets Lost']
ODI_team_averages_df['bowling average'] = ODI_team_averages_df['Cumulative Runs Conceded'] / ODI_team_averages_df['Cumulative Wickets Taken']
ODI_team_averages_df['batting RPO'] = ODI_team_averages_df['Cumulative Runs Scored'] / ODI_team_averages_df['Cumulative Overs Faced']
ODI_team_averages_df['bowling RPO'] = ODI_team_averages_df['Cumulative Runs Conceded'] / ODI_team_averages_df['Cumulative Overs Bowled']
# replace empty values with 0
ODI_team_averages_df.replace(np.nan, 0, inplace = True)

# drop the redundant columns to retain only the 4 new features (averages & RPO), Country and Match Date
ODI_team_averages_df.drop(columns = ['Opposition', 'Innings Runs Scored Num', 'Innings Balls Faced', 'Innings Overs Bowled', 'Innings Runs Conceded', 'Innings Wickets Taken', 'Innings Wickets Lost', 'Cumulative Runs Scored', 'Cumulative Wickets Lost', 'Cumulative Overs Faced', 'Cumulative Runs Conceded', 'Cumulative Overs Bowled', 'Cumulative Wickets Taken'], inplace = True)
# merge these 4 features in our main final_data DF
final_data = pd.merge(final_data, ODI_team_averages_df, how = 'inner', on = ['Country', 'Match Date'])


# Toss Won By
# rename the column
final_data.rename(columns = {'Toss Won By': 'Toss Won?'}, inplace = True)
# replace the country name in this column by Yes or No based on whether the column had the same name as in the Country column or not
final_data['Toss Won?'] = np.where(final_data['Toss Won?'] == final_data['Country'], 'Yes', 'No')


# Player Rankings
# loop over all row indices that are not null in the 'Country_Player_12' column
for idx in final_data[(~(final_data['Country_Player_12'].isnull()))].index:
    final_data.iloc[idx, final_data.columns.get_loc('Country_Bowling Rank_11')] = final_data.iloc[idx, final_data.columns.get_loc('Country_Bowling Rank_1'):final_data.columns.get_loc('Country_Bowling Rank_12')].mean()
    final_data.iloc[idx, final_data.columns.get_loc('Opposition_Bowling Rank_11')] = final_data.iloc[idx, final_data.columns.get_loc('Opposition_Bowling Rank_1'):final_data.columns.get_loc('Opposition_Bowling Rank_12')].mean()
    final_data.iloc[idx, final_data.columns.get_loc('Country_Batting Rank_11')] = final_data.iloc[idx, final_data.columns.get_loc('Country_Batting Rank_1'):final_data.columns.get_loc('Country_Batting Rank_12')].mean()
    final_data.iloc[idx, final_data.columns.get_loc('Opposition_Batting Rank_11')] = final_data.iloc[idx, final_data.columns.get_loc('Opposition_Batting Rank_1'):final_data.columns.get_loc('Opposition_Batting Rank_12')].mean()

# Drop the 4 ranking columns for the 12th player
final_data.drop(columns = ['Country_Bowling Rank_12', 'Country_Batting Rank_12', 'Opposition_Bowling Rank_12', 'Opposition_Batting Rank_12'], inplace = True)

# fill missing ranks with 101
final_data.fillna(101, inplace = True)

# Create four new features equal to the sum of the relevant rankings
final_data['Country Total Bowling Rank'] = final_data[['Country_Bowling Rank_1', 'Country_Bowling Rank_2', 'Country_Bowling Rank_3', 'Country_Bowling Rank_4', 'Country_Bowling Rank_5', 'Country_Bowling Rank_6', 'Country_Bowling Rank_7', 'Country_Bowling Rank_8', 'Country_Bowling Rank_9', 'Country_Bowling Rank_10', 'Country_Bowling Rank_11']].sum(axis = 1)
final_data['Country Total Batting Rank'] = final_data[['Country_Batting Rank_1', 'Country_Batting Rank_2', 'Country_Batting Rank_3', 'Country_Batting Rank_4', 'Country_Batting Rank_5', 'Country_Batting Rank_6', 'Country_Batting Rank_7', 'Country_Batting Rank_8', 'Country_Batting Rank_9', 'Country_Batting Rank_10', 'Country_Batting Rank_11']].sum(axis = 1)
final_data['Opposition Total Bowling Rank'] = final_data[['Opposition_Bowling Rank_1', 'Opposition_Bowling Rank_2', 'Opposition_Bowling Rank_3', 'Opposition_Bowling Rank_4', 'Opposition_Bowling Rank_5', 'Opposition_Bowling Rank_6', 'Opposition_Bowling Rank_7', 'Opposition_Bowling Rank_8', 'Opposition_Bowling Rank_9', 'Opposition_Bowling Rank_10', 'Opposition_Bowling Rank_11']].sum(axis = 1)
final_data['Opposition Total Batting Rank'] = final_data[['Opposition_Batting Rank_1', 'Opposition_Batting Rank_2', 'Opposition_Batting Rank_3', 'Opposition_Batting Rank_4', 'Opposition_Batting Rank_5', 'Opposition_Batting Rank_6', 'Opposition_Batting Rank_7', 'Opposition_Batting Rank_8', 'Opposition_Batting Rank_9', 'Opposition_Batting Rank_10', 'Opposition_Batting Rank_11']].sum(axis = 1)

# Create four new features equal to the mean of the relevant rankings
final_data['Country Average Bowling Rank'] = final_data[['Country_Bowling Rank_1', 'Country_Bowling Rank_2', 'Country_Bowling Rank_3', 'Country_Bowling Rank_4', 'Country_Bowling Rank_5', 'Country_Bowling Rank_6', 'Country_Bowling Rank_7', 'Country_Bowling Rank_8', 'Country_Bowling Rank_9', 'Country_Bowling Rank_10', 'Country_Bowling Rank_11']].mean(axis = 1)
final_data['Country Average Batting Rank'] = final_data[['Country_Batting Rank_1', 'Country_Batting Rank_2', 'Country_Batting Rank_3', 'Country_Batting Rank_4', 'Country_Batting Rank_5', 'Country_Batting Rank_6', 'Country_Batting Rank_7', 'Country_Batting Rank_8', 'Country_Batting Rank_9', 'Country_Batting Rank_10', 'Country_Batting Rank_11']].mean(axis = 1)
final_data['Opposition Average Bowling Rank'] = final_data[['Opposition_Bowling Rank_1', 'Opposition_Bowling Rank_2', 'Opposition_Bowling Rank_3', 'Opposition_Bowling Rank_4', 'Opposition_Bowling Rank_5', 'Opposition_Bowling Rank_6', 'Opposition_Bowling Rank_7', 'Opposition_Bowling Rank_8', 'Opposition_Bowling Rank_9', 'Opposition_Bowling Rank_10', 'Opposition_Bowling Rank_11']].mean(axis = 1)
final_data['Opposition Average Batting Rank'] = final_data[['Opposition_Batting Rank_1', 'Opposition_Batting Rank_2', 'Opposition_Batting Rank_3', 'Opposition_Batting Rank_4', 'Opposition_Batting Rank_5', 'Opposition_Batting Rank_6', 'Opposition_Batting Rank_7', 'Opposition_Batting Rank_8', 'Opposition_Batting Rank_9', 'Opposition_Batting Rank_10', 'Opposition_Batting Rank_11']].mean(axis = 1)

# Create four new features equal to the median of the relevant rankings
final_data['Country Median Bowling Rank'] = final_data[['Country_Bowling Rank_1', 'Country_Bowling Rank_2', 'Country_Bowling Rank_3', 'Country_Bowling Rank_4', 'Country_Bowling Rank_5', 'Country_Bowling Rank_6', 'Country_Bowling Rank_7', 'Country_Bowling Rank_8', 'Country_Bowling Rank_9', 'Country_Bowling Rank_10', 'Country_Bowling Rank_11']].median(axis = 1)
final_data['Country Median Batting Rank'] = final_data[['Country_Batting Rank_1', 'Country_Batting Rank_2', 'Country_Batting Rank_3', 'Country_Batting Rank_4', 'Country_Batting Rank_5', 'Country_Batting Rank_6', 'Country_Batting Rank_7', 'Country_Batting Rank_8', 'Country_Batting Rank_9', 'Country_Batting Rank_10', 'Country_Batting Rank_11']].median(axis = 1)
final_data['Opposition Median Bowling Rank'] = final_data[['Opposition_Bowling Rank_1', 'Opposition_Bowling Rank_2', 'Opposition_Bowling Rank_3', 'Opposition_Bowling Rank_4', 'Opposition_Bowling Rank_5', 'Opposition_Bowling Rank_6', 'Opposition_Bowling Rank_7', 'Opposition_Bowling Rank_8', 'Opposition_Bowling Rank_9', 'Opposition_Bowling Rank_10', 'Opposition_Bowling Rank_11']].median(axis = 1)
final_data['Opposition Median Batting Rank'] = final_data[['Opposition_Batting Rank_1', 'Opposition_Batting Rank_2', 'Opposition_Batting Rank_3', 'Opposition_Batting Rank_4', 'Opposition_Batting Rank_5', 'Opposition_Batting Rank_6', 'Opposition_Batting Rank_7', 'Opposition_Batting Rank_8', 'Opposition_Batting Rank_9', 'Opposition_Batting Rank_10', 'Opposition_Batting Rank_11']].median(axis = 1)
	# create a new Match Month column
	final_data['Match Month'] = final_data['Match Date'].dt.month


	## win/loss ratio
	# instantiate an empty list
	win_loss_ratio = []
	# loop over the final_data DF for each playing Country in it
	for c in final_data['Country'].unique():
	# slice all matches with 'c' country in the 'Country' column
	df1 = final_data[final_data['Country'] == c].copy()
	# calculate the number of matches won by 'c' up to, but not including, the current match's result
	df1['Wins Count'] = ((df1['Result'] == 'Won') & (df1['Country'] == c)).shift(fill_value = 0).cumsum()
	# calculate the number of matches lost by 'c' up to, but not including, the current match's result
	df1['Losses Count'] = ((df1['Result'] == 'Lost') & (df1['Country'] == c)).shift(fill_value = 0).cumsum()
	# append the resulting df to the win_loss_ratio list
	win_loss_ratio.append(df1)

	# concatenate the list in to a df
	final_data = pd.concat(win_loss_ratio)
	# convert the 2 new columns to numeric
	final_data['Wins Count'] = pd.to_numeric(final_data['Wins Count'], downcast = 'integer')
	final_data['Losses Count'] = pd.to_numeric(final_data['Losses Count'], downcast = 'integer')
	# sort
	final_data.sort_values(by = 'Match Date', inplace = True, ignore_index = True)
	# calculate Win/Loss Ratio and replace 'inf' with 0 ('inf' would be the result of DivisionByZero error)
	final_data['Win/Loss Ratio'] = (final_data['Wins Count'] / final_data['Losses Count']).replace(to_replace = {np.inf: 0, -np.inf: 0, np.nan: 0})
	# drop the now redundant 2 columns
	final_data.drop(columns = ['Wins Count', 'Losses Count'], inplace = True)


	# calculate batting/bowling averages and RPO
	# work on a backup of the original DF
	ODI_innings_for_averages = ODI_innings.copy()
	# drop columns not required to calculate the 4 new features
	ODI_innings_for_averages.drop(columns = ['Innings Player', 'Innings Runs Scored', 'Innings Minutes Batted', 'Innings Batted Flag', 'Innings Not Out Flag', 'Innings Boundary Fours', 'Innings Boundary Sixes', 'Innings Batting Strike Rate', 'Innings Number', '50\'s', '100\'s', 'Innings Runs Scored Buckets', 'Innings Bowled Flag', 'Innings Maidens Bowled', '4 Wickets', '5 Wickets', '10 Wickets', 'Innings Wickets Taken Buckets', 'Innings Economy Rate'], inplace = True)
	# remove 'v' from the Opposition column
	ODI_innings_for_averages['Opposition'] = ODI_innings_for_averages['Opposition'].str.replace('v ', '')
	# replace certain other object dtypes data in the DF
	ODI_innings_for_averages.replace(to_replace = {'-': 0, 'DNB': 0, 'TDNB': 0, 'sub': 0}, inplace = True)
	# convert the relevant columns to numeric
	col_to_convert = ['Innings Runs Scored Num', 'Innings Balls Faced', 'Innings Overs Bowled', 'Innings Runs Conceded', 'Innings Wickets Taken']
	ODI_innings_for_averages[col_to_convert] = ODI_innings_for_averages[col_to_convert].apply(pd.to_numeric)
	# convert 'Innings Date' to datetime
	ODI_innings_for_averages['Innings Date'] = pd.to_datetime(ODI_innings_for_averages['Innings Date'])

	# define a function to calculate wickets lost using info from 'Innings Wickets Taken'
	def wkts_lost(country, wickets_taken):
	# instantiate an empty dictionary
	wkt_lost_dict = {}
	# append the name of the country (which will in fact be the name in the Opposition column) to the Country key
	wkt_lost_dict.setdefault('Country', []).append(country)
	# append the integer in Innings Wickets Taken to Innings Wicket Lost key
	wkt_lost_dict.setdefault('Innings Wickets Lost', []).append(wickets_taken)
	# convert the dictionary to a DF
	wkt_lost_df = pd.DataFrame(wkt_lost_dict)
	# return the DF
	return wkt_lost_df

	# instantiate an empty list
	wkts_taken_list = []

	# next we loop over all records and store the wickets taken by the opposition to wickets lost by a team
	for date in ODI_innings_for_averages['Innings Date'].unique():
	# select all records for this date
	df1 = ODI_innings_for_averages[ODI_innings_for_averages['Innings Date'] == date]
	# group by on Country and Opposition and summ all the other columns
	grouped = df1.groupby(by = ['Country', 'Opposition']).sum().reset_index()
	# list comprehension to apply our wkts_lost function on each row of the grouped dataframe. Then convert to a dataframe through pd.concat
	result = pd.concat([wkts_lost(x, y) for x, y in zip(grouped['Opposition'], grouped['Innings Wickets Taken'])])
	# merge the resultant DF with the grouped dataframe
	grouped = grouped.merge(result, how = 'left', left_on = 'Country', right_on = 'Country')
	# add a new date column
	grouped['Match Date'] = date
	# append to the list
	wkts_taken_list.append(grouped)

	# concatenate the list of dataframes into a single dataframe
	wkts_taken_df = pd.concat(wkts_taken_list).sort_values('Match Date', ignore_index = True)

	# instantiate an empty list to hold the average statistics
	ODI_team_averages_list = []

	# loop over each unique Country
	for country in wkts_taken_df['Country'].unique():
	# slice on the country
	df1 = wkts_taken_df[wkts_taken_df['Country'] == country].copy()
	# calculate the cumulative runs scored up to, but not including the current date
	df1['Cumulative Runs Scored'] = df1['Innings Runs Scored Num'].shift(fill_value = 0).cumsum()
	# calculate the cumulative wickets lost up to, but not including the current date
	df1['Cumulative Wickets Lost'] = df1['Innings Wickets Lost'].shift(fill_value = 0).cumsum()
	# calculate the cumulative overs faced up to, but not including the current date. convert balls faced to overs faced first by dividing by 6 (each over is of 6 balls)
	df1['Cumulative Overs Faced'] = df1['Innings Balls Faced'].shift(fill_value = 0).cumsum() / 6
	# calculate the cumulative runs conceded up to, but not including the current date
	df1['Cumulative Runs Conceded'] = df1['Innings Runs Conceded'].shift(fill_value = 0).cumsum()
	# calculate the cumulative overs bowled up to, but not including the current date
	df1['Cumulative Overs Bowled'] = (df1['Innings Overs Bowled']).shift(fill_value = 0).cumsum()
	# calculate the cumulative wickets taken up to, but not including the current date
	df1['Cumulative Wickets Taken'] = (df1['Innings Wickets Taken']).shift(fill_value = 0).cumsum()
	# append to the list
	ODI_team_averages_list.append(df1)

	# concatenate the list of DFs and sort by Match Date
	ODI_team_averages_df = pd.concat(ODI_team_averages_list).sort_values('Match Date', ignore_index = True)
	# calculate the 4 new features
	ODI_team_averages_df['batting average'] = ODI_team_averages_df['Cumulative Runs Scored'] / ODI_team_averages_df['Cumulative Wickets Lost']
	ODI_team_averages_df['bowling average'] = ODI_team_averages_df['Cumulative Runs Conceded'] / ODI_team_averages_df['Cumulative Wickets Taken']
	ODI_team_averages_df['batting RPO'] = ODI_team_averages_df['Cumulative Runs Scored'] / ODI_team_averages_df['Cumulative Overs Faced']
	ODI_team_averages_df['bowling RPO'] = ODI_team_averages_df['Cumulative Runs Conceded'] / ODI_team_averages_df['Cumulative Overs Bowled']
	# replace empty values with 0
	ODI_team_averages_df.replace(np.nan, 0, inplace = True)

	# drop the redundant columns to retain only the 4 new features (averages & RPO), Country and Match Date
	ODI_team_averages_df.drop(columns = ['Opposition', 'Innings Runs Scored Num', 'Innings Balls Faced', 'Innings Overs Bowled', 'Innings Runs Conceded', 'Innings Wickets Taken', 'Innings Wickets Lost', 'Cumulative Runs Scored', 'Cumulative Wickets Lost', 'Cumulative Overs Faced', 'Cumulative Runs Conceded', 'Cumulative Overs Bowled', 'Cumulative Wickets Taken'], inplace = True)
	# merge these 4 features in our main final_data DF
	final_data = pd.merge(final_data, ODI_team_averages_df, how = 'inner', on = ['Country', 'Match Date'])


	# Toss Won By
	# rename the column
	final_data.rename(columns = {'Toss Won By': 'Toss Won?'}, inplace = True)
	# replace the country name in this column by Yes or No based on whether the column had the same name as in the Country column or not
	final_data['Toss Won?'] = np.where(final_data['Toss Won?'] == final_data['Country'], 'Yes', 'No')


	# Player Rankings
	# loop over all row indices that are not null in the 'Country_Player_12' column
	for idx in final_data[(~(final_data['Country_Player_12'].isnull()))].index:
	final_data.iloc[idx, final_data.columns.get_loc('Country_Bowling Rank_11')] = final_data.iloc[idx, final_data.columns.get_loc('Country_Bowling Rank_1'):final_data.columns.get_loc('Country_Bowling Rank_12')].mean()
	final_data.iloc[idx, final_data.columns.get_loc('Opposition_Bowling Rank_11')] = final_data.iloc[idx, final_data.columns.get_loc('Opposition_Bowling Rank_1'):final_data.columns.get_loc('Opposition_Bowling Rank_12')].mean()
	final_data.iloc[idx, final_data.columns.get_loc('Country_Batting Rank_11')] = final_data.iloc[idx, final_data.columns.get_loc('Country_Batting Rank_1'):final_data.columns.get_loc('Country_Batting Rank_12')].mean()
	final_data.iloc[idx, final_data.columns.get_loc('Opposition_Batting Rank_11')] = final_data.iloc[idx, final_data.columns.get_loc('Opposition_Batting Rank_1'):final_data.columns.get_loc('Opposition_Batting Rank_12')].mean()

	# Drop the 4 ranking columns for the 12th player
	final_data.drop(columns = ['Country_Bowling Rank_12', 'Country_Batting Rank_12', 'Opposition_Bowling Rank_12', 'Opposition_Batting Rank_12'], inplace = True)

	# fill missing ranks with 101
	final_data.fillna(101, inplace = True)

	# Create four new features equal to the sum of the relevant rankings
	final_data['Country Total Bowling Rank'] = final_data[['Country_Bowling Rank_1', 'Country_Bowling Rank_2', 'Country_Bowling Rank_3', 'Country_Bowling Rank_4', 'Country_Bowling Rank_5', 'Country_Bowling Rank_6', 'Country_Bowling Rank_7', 'Country_Bowling Rank_8', 'Country_Bowling Rank_9', 'Country_Bowling Rank_10', 'Country_Bowling Rank_11']].sum(axis = 1)
	final_data['Country Total Batting Rank'] = final_data[['Country_Batting Rank_1', 'Country_Batting Rank_2', 'Country_Batting Rank_3', 'Country_Batting Rank_4', 'Country_Batting Rank_5', 'Country_Batting Rank_6', 'Country_Batting Rank_7', 'Country_Batting Rank_8', 'Country_Batting Rank_9', 'Country_Batting Rank_10', 'Country_Batting Rank_11']].sum(axis = 1)
	final_data['Opposition Total Bowling Rank'] = final_data[['Opposition_Bowling Rank_1', 'Opposition_Bowling Rank_2', 'Opposition_Bowling Rank_3', 'Opposition_Bowling Rank_4', 'Opposition_Bowling Rank_5', 'Opposition_Bowling Rank_6', 'Opposition_Bowling Rank_7', 'Opposition_Bowling Rank_8', 'Opposition_Bowling Rank_9', 'Opposition_Bowling Rank_10', 'Opposition_Bowling Rank_11']].sum(axis = 1)
	final_data['Opposition Total Batting Rank'] = final_data[['Opposition_Batting Rank_1', 'Opposition_Batting Rank_2', 'Opposition_Batting Rank_3', 'Opposition_Batting Rank_4', 'Opposition_Batting Rank_5', 'Opposition_Batting Rank_6', 'Opposition_Batting Rank_7', 'Opposition_Batting Rank_8', 'Opposition_Batting Rank_9', 'Opposition_Batting Rank_10', 'Opposition_Batting Rank_11']].sum(axis = 1)

	# Create four new features equal to the mean of the relevant rankings
	final_data['Country Average Bowling Rank'] = final_data[['Country_Bowling Rank_1', 'Country_Bowling Rank_2', 'Country_Bowling Rank_3', 'Country_Bowling Rank_4', 'Country_Bowling Rank_5', 'Country_Bowling Rank_6', 'Country_Bowling Rank_7', 'Country_Bowling Rank_8', 'Country_Bowling Rank_9', 'Country_Bowling Rank_10', 'Country_Bowling Rank_11']].mean(axis = 1)
	final_data['Country Average Batting Rank'] = final_data[['Country_Batting Rank_1', 'Country_Batting Rank_2', 'Country_Batting Rank_3', 'Country_Batting Rank_4', 'Country_Batting Rank_5', 'Country_Batting Rank_6', 'Country_Batting Rank_7', 'Country_Batting Rank_8', 'Country_Batting Rank_9', 'Country_Batting Rank_10', 'Country_Batting Rank_11']].mean(axis = 1)
	final_data['Opposition Average Bowling Rank'] = final_data[['Opposition_Bowling Rank_1', 'Opposition_Bowling Rank_2', 'Opposition_Bowling Rank_3', 'Opposition_Bowling Rank_4', 'Opposition_Bowling Rank_5', 'Opposition_Bowling Rank_6', 'Opposition_Bowling Rank_7', 'Opposition_Bowling Rank_8', 'Opposition_Bowling Rank_9', 'Opposition_Bowling Rank_10', 'Opposition_Bowling Rank_11']].mean(axis = 1)
	final_data['Opposition Average Batting Rank'] = final_data[['Opposition_Batting Rank_1', 'Opposition_Batting Rank_2', 'Opposition_Batting Rank_3', 'Opposition_Batting Rank_4', 'Opposition_Batting Rank_5', 'Opposition_Batting Rank_6', 'Opposition_Batting Rank_7', 'Opposition_Batting Rank_8', 'Opposition_Batting Rank_9', 'Opposition_Batting Rank_10', 'Opposition_Batting Rank_11']].mean(axis = 1)

	# Create four new features equal to the median of the relevant rankings
	final_data['Country Median Bowling Rank'] = final_data[['Country_Bowling Rank_1', 'Country_Bowling Rank_2', 'Country_Bowling Rank_3', 'Country_Bowling Rank_4', 'Country_Bowling Rank_5', 'Country_Bowling Rank_6', 'Country_Bowling Rank_7', 'Country_Bowling Rank_8', 'Country_Bowling Rank_9', 'Country_Bowling Rank_10', 'Country_Bowling Rank_11']].median(axis = 1)
	final_data['Country Median Batting Rank'] = final_data[['Country_Batting Rank_1', 'Country_Batting Rank_2', 'Country_Batting Rank_3', 'Country_Batting Rank_4', 'Country_Batting Rank_5', 'Country_Batting Rank_6', 'Country_Batting Rank_7', 'Country_Batting Rank_8', 'Country_Batting Rank_9', 'Country_Batting Rank_10', 'Country_Batting Rank_11']].median(axis = 1)
	final_data['Opposition Median Bowling Rank'] = final_data[['Opposition_Bowling Rank_1', 'Opposition_Bowling Rank_2', 'Opposition_Bowling Rank_3', 'Opposition_Bowling Rank_4', 'Opposition_Bowling Rank_5', 'Opposition_Bowling Rank_6', 'Opposition_Bowling Rank_7', 'Opposition_Bowling Rank_8', 'Opposition_Bowling Rank_9', 'Opposition_Bowling Rank_10', 'Opposition_Bowling Rank_11']].median(axis = 1)
	final_data['Opposition Median Batting Rank'] = final_data[['Opposition_Batting Rank_1', 'Opposition_Batting Rank_2', 'Opposition_Batting Rank_3', 'Opposition_Batting Rank_4', 'Opposition_Batting Rank_5', 'Opposition_Batting Rank_6', 'Opposition_Batting Rank_7', 'Opposition_Batting Rank_8', 'Opposition_Batting Rank_9', 'Opposition_Batting Rank_10', 'Opposition_Batting Rank_11']].median(axis = 1)