Skip to content

Instantly share code, notes, and snippets.

@finlytics-hub
Last active October 2, 2020 17:21
Show Gist options
  • Save finlytics-hub/07b878a33fdc85af1c7bfa4bc57cb9b1 to your computer and use it in GitHub Desktop.
Save finlytics-hub/07b878a33fdc85af1c7bfa4bc57cb9b1 to your computer and use it in GitHub Desktop.
ODI: data munging Part 1
# ODI_results DF
# create a copy of the DF
ODI_results_clean = ODI_results.copy()
# split Match column into two separate columns
ODI_results_clean[['Country 1', 'Country 2']] = ODI_results_clean['Match'].str.split('v', expand = True)
# strip out all the leading and trailing whitespaces
ODI_results_clean['Country 1'] = ODI_results_clean['Country 1'].str.strip()
ODI_results_clean['Country 2'] = ODI_results_clean['Country 2'].str.strip()
# create the Opposition column using information from the 2 newly created columns
ODI_results_clean['Opposition'] = np.where(ODI_results_clean['Country 1'] == ODI_results_clean['Country'], ODI_results_clean['Country 2'], ODI_results_clean['Country 1'])
# drop the 2 new columns together with other unwanted columns
ODI_results_clean.drop(columns = ['Margin', 'Match', 'Match Month', 'Match Year', 'Match Period', 'Matches', 'Country 1', 'Country 2'], inplace = True)
# filter out any match results other than 'Won' or 'Lost' so as not to consider matches not played, tied and with no result in our modeling
ODI_results_clean = ODI_results_clean[ODI_results_clean['Result'].isin(['Won', 'Lost'])]
# convert the Match Date column to a datetime format
ODI_results_clean['Match Date'] = pd.to_datetime(ODI_results_clean['Match Date'])
# sort by Match Date
ODI_results_clean.sort_values(by = 'Match Date', inplace = True, ignore_index = True)
# ODI_innings DF
# create a copy of the DF
ODI_innings_clean = ODI_innings.copy()
# drop duplicate records based on a subset of columns only
ODI_innings_clean.drop_duplicates(subset = ['Innings Player', 'Country', 'Opposition', 'Innings Date', 'Ground'], inplace = True)
# drop all the unwanted columns
col_to_drop = ODI_innings_clean.iloc[:, np.r_[1:11, 15:28]].columns.values
ODI_innings_clean.drop(columns = col_to_drop, inplace = True)
# remove 'v' from the Opposition columns
ODI_innings_clean['Opposition'] = ODI_innings_clean['Opposition'].str.replace('v', '')
# rename certain columns
ODI_innings_clean.rename(columns = {'Innings Date': 'Match Date', 'Innings Player': 'Players'}, inplace = True)
# convert the Match Date column to a datetime format
ODI_innings_clean['Match Date'] = pd.to_datetime(ODI_innings_clean['Match Date'])
# sort by Match Date
ODI_innings_clean.sort_values(by = 'Match Date', inplace = True, ignore_index = True)
# first merge the 2 on Country level to get Country's players merged with ODI_results_clean. note the use of 'left' join here and not an 'inner' join as we want to retain all the results information from the 'left' DF even if the corresponding team's players information is not available
ODI_country_players = pd.merge(ODI_results_clean, ODI_innings_clean, how = 'left', left_on = ['Match Date', 'Ground', 'Country'], right_on = ['Match Date', 'Ground', 'Country']).drop(columns = ['Opposition_y'])
# rename columns
ODI_country_players.rename(columns = {'Players': 'Country Players', 'Opposition_x': 'Opposition'}, inplace = True)
# next merge on the Opposition columns to get the names of the Opposition players
ODI_opposition_players = pd.merge(ODI_results_clean, ODI_innings_clean, how = 'left', left_on = ['Match Date', 'Ground', 'Opposition'], right_on = ['Match Date', 'Ground', 'Country']).drop(columns = ['Country_y', 'Opposition_y'])
# rename columns
ODI_opposition_players.rename(columns = {'Players': 'Opposition Players', 'Country_x': 'Country', 'Opposition_x': 'Opposition'}, inplace = True)
# define a function to change (or unstack) our DFs
def df_unstack(df, players_col, col_name):
grouped = df.groupby(by = ['Match Date', 'Country', 'Opposition', 'Ground', 'Home/Away', 'Result'])
df_flat = grouped[players_col].apply(lambda x: pd.Series(x.values)).unstack().reset_index()
df_flat.rename(columns = {i: col_name + '_{}'.format(i + 1) for i in range(df_flat.shape[1])}, inplace = True)
return df_flat
# apply the function to the 2 DFs
ODI_country_players_flat = df_unstack(ODI_country_players, 'Country Players', 'Country_Player')
ODI_opposition_players_flat = df_unstack(ODI_opposition_players, 'Opposition Players', 'Opposition_Player')
# drop records with no players information
ODI_country_players_flat.dropna(subset = ['Country_Player_1'], inplace = True)
ODI_opposition_players_flat.dropna(subset = ['Opposition_Player_1'], inplace = True)
# another merge
ODI_matches = pd.merge(ODI_country_players_flat, ODI_opposition_players_flat, how = 'inner', on = ['Match Date', 'Country', 'Opposition', 'Ground', 'Home/Away', 'Result'])
# replace some of the players' names that I found through trial and error to make sure they are same as per the ICC Rankings data
ODI_matches.replace(to_replace = {'Sir RJ Hadlee': 'RJ Hadlee', 'Saleem Altaf': 'Salim Altaf', 'Saleem Pervez': 'Salim Pervez', 'Saleem Malik': 'Salim Malik', 'Saleem Yousuf': 'Salim Yousuf', 'Saleem Jaffar': 'Salim Jaffar', 'Saleem Elahi': 'Salim Elahi', 'Aamer Sohail': 'Aamir Sohail', 'Ijaz Ahmed': 'Ijaz Ahmed, sen.', 'Aaqib Javed': 'Aqib Javed', 'Rameez Raja': 'Ramiz Raja', 'Haroon Rasheed': 'Haroon Rashid', 'Shakeel Khan': 'Shakil Khan', 'Shaheen Shah Afridi': 'Shaheen Afridi', 'Z Khan': 'Zaheer Khan', 'Mohammed Shami': 'Mohammad Shami', 'Mehidy Hasan Miraz': 'Mehidy Hasan'}, inplace = True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment