finlytics-hub/ODI_data_munging_1.py

## ODI_data_munging_1.py
# ODI_results DF
# create a copy of the DF
ODI_results_clean = ODI_results.copy()
# split Match column into two separate columns
ODI_results_clean[['Country 1', 'Country 2']] = ODI_results_clean['Match'].str.split('v', expand = True)
# strip out all the leading and trailing whitespaces
ODI_results_clean['Country 1'] = ODI_results_clean['Country 1'].str.strip()
ODI_results_clean['Country 2'] = ODI_results_clean['Country 2'].str.strip()
# create the Opposition column using information from the 2 newly created columns
ODI_results_clean['Opposition'] = np.where(ODI_results_clean['Country 1'] == ODI_results_clean['Country'], ODI_results_clean['Country 2'], ODI_results_clean['Country 1'])
# drop the 2 new columns together with other unwanted columns
ODI_results_clean.drop(columns = ['Margin', 'Match', 'Match Month', 'Match Year', 'Match Period', 'Matches', 'Country 1', 'Country 2'], inplace = True)
# filter out any match results other than 'Won' or 'Lost' so as not to consider matches not played, tied and with no result in our modeling
ODI_results_clean = ODI_results_clean[ODI_results_clean['Result'].isin(['Won', 'Lost'])]
# convert the Match Date column to a datetime format
ODI_results_clean['Match Date'] = pd.to_datetime(ODI_results_clean['Match Date'])
# sort by Match Date
ODI_results_clean.sort_values(by = 'Match Date', inplace = True, ignore_index = True)


# ODI_innings DF
# create a copy of the DF
ODI_innings_clean = ODI_innings.copy()
# drop duplicate records based on a subset of columns only
ODI_innings_clean.drop_duplicates(subset = ['Innings Player', 'Country', 'Opposition', 'Innings Date', 'Ground'], inplace = True)
# drop all the unwanted columns
col_to_drop = ODI_innings_clean.iloc[:, np.r_[1:11, 15:28]].columns.values
ODI_innings_clean.drop(columns = col_to_drop, inplace = True)
# remove 'v' from the Opposition columns
ODI_innings_clean['Opposition'] = ODI_innings_clean['Opposition'].str.replace('v', '')
# rename certain columns
ODI_innings_clean.rename(columns = {'Innings Date': 'Match Date', 'Innings Player': 'Players'}, inplace = True)
# convert the Match Date column to a datetime format
ODI_innings_clean['Match Date'] = pd.to_datetime(ODI_innings_clean['Match Date'])
# sort by Match Date
ODI_innings_clean.sort_values(by = 'Match Date', inplace = True, ignore_index = True)

# first merge the 2 on Country level to get Country's players merged with ODI_results_clean. note the use of 'left' join here and not an 'inner' join as we want to retain all the results information from the 'left' DF even if the corresponding team's players information is not available
ODI_country_players = pd.merge(ODI_results_clean, ODI_innings_clean, how = 'left', left_on = ['Match Date', 'Ground', 'Country'], right_on = ['Match Date', 'Ground', 'Country']).drop(columns = ['Opposition_y'])
# rename columns
ODI_country_players.rename(columns = {'Players': 'Country Players', 'Opposition_x': 'Opposition'}, inplace = True)

# next merge on the Opposition columns to get the names of the Opposition players
ODI_opposition_players = pd.merge(ODI_results_clean, ODI_innings_clean, how = 'left', left_on = ['Match Date', 'Ground', 'Opposition'], right_on = ['Match Date', 'Ground', 'Country']).drop(columns = ['Country_y', 'Opposition_y'])
# rename columns
ODI_opposition_players.rename(columns = {'Players': 'Opposition Players', 'Country_x': 'Country', 'Opposition_x': 'Opposition'}, inplace = True)

# define a function to change (or unstack) our DFs
def df_unstack(df, players_col, col_name):
    grouped = df.groupby(by = ['Match Date', 'Country', 'Opposition', 'Ground', 'Home/Away', 'Result'])
    df_flat = grouped[players_col].apply(lambda x: pd.Series(x.values)).unstack().reset_index()
    df_flat.rename(columns = {i: col_name + '_{}'.format(i + 1) for i in range(df_flat.shape[1])}, inplace = True)
    return df_flat

# apply the function to the 2 DFs
ODI_country_players_flat = df_unstack(ODI_country_players, 'Country Players', 'Country_Player')
ODI_opposition_players_flat = df_unstack(ODI_opposition_players, 'Opposition Players', 'Opposition_Player')

# drop records with no players information
ODI_country_players_flat.dropna(subset = ['Country_Player_1'], inplace = True)
ODI_opposition_players_flat.dropna(subset = ['Opposition_Player_1'], inplace = True)

# another merge
ODI_matches = pd.merge(ODI_country_players_flat, ODI_opposition_players_flat, how = 'inner', on = ['Match Date', 'Country', 'Opposition', 'Ground', 'Home/Away', 'Result'])

# replace some of the players' names that I found through trial and error to make sure they are same as per the ICC Rankings data
ODI_matches.replace(to_replace = {'Sir RJ Hadlee': 'RJ Hadlee', 'Saleem Altaf': 'Salim Altaf', 'Saleem Pervez': 'Salim Pervez', 'Saleem Malik': 'Salim Malik', 'Saleem Yousuf': 'Salim Yousuf', 'Saleem Jaffar': 'Salim Jaffar', 'Saleem Elahi': 'Salim Elahi', 'Aamer Sohail': 'Aamir Sohail', 'Ijaz Ahmed': 'Ijaz Ahmed, sen.', 'Aaqib Javed': 'Aqib Javed', 'Rameez Raja': 'Ramiz Raja', 'Haroon Rasheed': 'Haroon Rashid', 'Shakeel Khan': 'Shakil Khan', 'Shaheen Shah Afridi': 'Shaheen Afridi', 'Z Khan': 'Zaheer Khan', 'Mohammed Shami': 'Mohammad Shami', 'Mehidy Hasan Miraz': 'Mehidy Hasan'}, inplace = True)
	# ODI_results DF
	# create a copy of the DF
	ODI_results_clean = ODI_results.copy()
	# split Match column into two separate columns
	ODI_results_clean[['Country 1', 'Country 2']] = ODI_results_clean['Match'].str.split('v', expand = True)
	# strip out all the leading and trailing whitespaces
	ODI_results_clean['Country 1'] = ODI_results_clean['Country 1'].str.strip()
	ODI_results_clean['Country 2'] = ODI_results_clean['Country 2'].str.strip()
	# create the Opposition column using information from the 2 newly created columns
	ODI_results_clean['Opposition'] = np.where(ODI_results_clean['Country 1'] == ODI_results_clean['Country'], ODI_results_clean['Country 2'], ODI_results_clean['Country 1'])
	# drop the 2 new columns together with other unwanted columns
	ODI_results_clean.drop(columns = ['Margin', 'Match', 'Match Month', 'Match Year', 'Match Period', 'Matches', 'Country 1', 'Country 2'], inplace = True)
	# filter out any match results other than 'Won' or 'Lost' so as not to consider matches not played, tied and with no result in our modeling
	ODI_results_clean = ODI_results_clean[ODI_results_clean['Result'].isin(['Won', 'Lost'])]
	# convert the Match Date column to a datetime format
	ODI_results_clean['Match Date'] = pd.to_datetime(ODI_results_clean['Match Date'])
	# sort by Match Date
	ODI_results_clean.sort_values(by = 'Match Date', inplace = True, ignore_index = True)


	# ODI_innings DF
	# create a copy of the DF
	ODI_innings_clean = ODI_innings.copy()
	# drop duplicate records based on a subset of columns only
	ODI_innings_clean.drop_duplicates(subset = ['Innings Player', 'Country', 'Opposition', 'Innings Date', 'Ground'], inplace = True)
	# drop all the unwanted columns
	col_to_drop = ODI_innings_clean.iloc[:, np.r_[1:11, 15:28]].columns.values
	ODI_innings_clean.drop(columns = col_to_drop, inplace = True)
	# remove 'v' from the Opposition columns
	ODI_innings_clean['Opposition'] = ODI_innings_clean['Opposition'].str.replace('v', '')
	# rename certain columns
	ODI_innings_clean.rename(columns = {'Innings Date': 'Match Date', 'Innings Player': 'Players'}, inplace = True)
	# convert the Match Date column to a datetime format
	ODI_innings_clean['Match Date'] = pd.to_datetime(ODI_innings_clean['Match Date'])
	# sort by Match Date
	ODI_innings_clean.sort_values(by = 'Match Date', inplace = True, ignore_index = True)

	# first merge the 2 on Country level to get Country's players merged with ODI_results_clean. note the use of 'left' join here and not an 'inner' join as we want to retain all the results information from the 'left' DF even if the corresponding team's players information is not available
	ODI_country_players = pd.merge(ODI_results_clean, ODI_innings_clean, how = 'left', left_on = ['Match Date', 'Ground', 'Country'], right_on = ['Match Date', 'Ground', 'Country']).drop(columns = ['Opposition_y'])
	# rename columns
	ODI_country_players.rename(columns = {'Players': 'Country Players', 'Opposition_x': 'Opposition'}, inplace = True)

	# next merge on the Opposition columns to get the names of the Opposition players
	ODI_opposition_players = pd.merge(ODI_results_clean, ODI_innings_clean, how = 'left', left_on = ['Match Date', 'Ground', 'Opposition'], right_on = ['Match Date', 'Ground', 'Country']).drop(columns = ['Country_y', 'Opposition_y'])
	# rename columns
	ODI_opposition_players.rename(columns = {'Players': 'Opposition Players', 'Country_x': 'Country', 'Opposition_x': 'Opposition'}, inplace = True)

	# define a function to change (or unstack) our DFs
	def df_unstack(df, players_col, col_name):
	grouped = df.groupby(by = ['Match Date', 'Country', 'Opposition', 'Ground', 'Home/Away', 'Result'])
	df_flat = grouped[players_col].apply(lambda x: pd.Series(x.values)).unstack().reset_index()
	df_flat.rename(columns = {i: col_name + '_{}'.format(i + 1) for i in range(df_flat.shape[1])}, inplace = True)
	return df_flat

	# apply the function to the 2 DFs
	ODI_country_players_flat = df_unstack(ODI_country_players, 'Country Players', 'Country_Player')
	ODI_opposition_players_flat = df_unstack(ODI_opposition_players, 'Opposition Players', 'Opposition_Player')

	# drop records with no players information
	ODI_country_players_flat.dropna(subset = ['Country_Player_1'], inplace = True)
	ODI_opposition_players_flat.dropna(subset = ['Opposition_Player_1'], inplace = True)

	# another merge
	ODI_matches = pd.merge(ODI_country_players_flat, ODI_opposition_players_flat, how = 'inner', on = ['Match Date', 'Country', 'Opposition', 'Ground', 'Home/Away', 'Result'])

	# replace some of the players' names that I found through trial and error to make sure they are same as per the ICC Rankings data
	ODI_matches.replace(to_replace = {'Sir RJ Hadlee': 'RJ Hadlee', 'Saleem Altaf': 'Salim Altaf', 'Saleem Pervez': 'Salim Pervez', 'Saleem Malik': 'Salim Malik', 'Saleem Yousuf': 'Salim Yousuf', 'Saleem Jaffar': 'Salim Jaffar', 'Saleem Elahi': 'Salim Elahi', 'Aamer Sohail': 'Aamir Sohail', 'Ijaz Ahmed': 'Ijaz Ahmed, sen.', 'Aaqib Javed': 'Aqib Javed', 'Rameez Raja': 'Ramiz Raja', 'Haroon Rasheed': 'Haroon Rashid', 'Shakeel Khan': 'Shakil Khan', 'Shaheen Shah Afridi': 'Shaheen Afridi', 'Z Khan': 'Zaheer Khan', 'Mohammed Shami': 'Mohammad Shami', 'Mehidy Hasan Miraz': 'Mehidy Hasan'}, inplace = True)