Last active
October 2, 2020 17:21
-
-
Save finlytics-hub/07b878a33fdc85af1c7bfa4bc57cb9b1 to your computer and use it in GitHub Desktop.
ODI: data munging Part 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ODI_results DF | |
# create a copy of the DF | |
ODI_results_clean = ODI_results.copy() | |
# split Match column into two separate columns | |
ODI_results_clean[['Country 1', 'Country 2']] = ODI_results_clean['Match'].str.split('v', expand = True) | |
# strip out all the leading and trailing whitespaces | |
ODI_results_clean['Country 1'] = ODI_results_clean['Country 1'].str.strip() | |
ODI_results_clean['Country 2'] = ODI_results_clean['Country 2'].str.strip() | |
# create the Opposition column using information from the 2 newly created columns | |
ODI_results_clean['Opposition'] = np.where(ODI_results_clean['Country 1'] == ODI_results_clean['Country'], ODI_results_clean['Country 2'], ODI_results_clean['Country 1']) | |
# drop the 2 new columns together with other unwanted columns | |
ODI_results_clean.drop(columns = ['Margin', 'Match', 'Match Month', 'Match Year', 'Match Period', 'Matches', 'Country 1', 'Country 2'], inplace = True) | |
# filter out any match results other than 'Won' or 'Lost' so as not to consider matches not played, tied and with no result in our modeling | |
ODI_results_clean = ODI_results_clean[ODI_results_clean['Result'].isin(['Won', 'Lost'])] | |
# convert the Match Date column to a datetime format | |
ODI_results_clean['Match Date'] = pd.to_datetime(ODI_results_clean['Match Date']) | |
# sort by Match Date | |
ODI_results_clean.sort_values(by = 'Match Date', inplace = True, ignore_index = True) | |
# ODI_innings DF | |
# create a copy of the DF | |
ODI_innings_clean = ODI_innings.copy() | |
# drop duplicate records based on a subset of columns only | |
ODI_innings_clean.drop_duplicates(subset = ['Innings Player', 'Country', 'Opposition', 'Innings Date', 'Ground'], inplace = True) | |
# drop all the unwanted columns | |
col_to_drop = ODI_innings_clean.iloc[:, np.r_[1:11, 15:28]].columns.values | |
ODI_innings_clean.drop(columns = col_to_drop, inplace = True) | |
# remove 'v' from the Opposition columns | |
ODI_innings_clean['Opposition'] = ODI_innings_clean['Opposition'].str.replace('v', '') | |
# rename certain columns | |
ODI_innings_clean.rename(columns = {'Innings Date': 'Match Date', 'Innings Player': 'Players'}, inplace = True) | |
# convert the Match Date column to a datetime format | |
ODI_innings_clean['Match Date'] = pd.to_datetime(ODI_innings_clean['Match Date']) | |
# sort by Match Date | |
ODI_innings_clean.sort_values(by = 'Match Date', inplace = True, ignore_index = True) | |
# first merge the 2 on Country level to get Country's players merged with ODI_results_clean. note the use of 'left' join here and not an 'inner' join as we want to retain all the results information from the 'left' DF even if the corresponding team's players information is not available | |
ODI_country_players = pd.merge(ODI_results_clean, ODI_innings_clean, how = 'left', left_on = ['Match Date', 'Ground', 'Country'], right_on = ['Match Date', 'Ground', 'Country']).drop(columns = ['Opposition_y']) | |
# rename columns | |
ODI_country_players.rename(columns = {'Players': 'Country Players', 'Opposition_x': 'Opposition'}, inplace = True) | |
# next merge on the Opposition columns to get the names of the Opposition players | |
ODI_opposition_players = pd.merge(ODI_results_clean, ODI_innings_clean, how = 'left', left_on = ['Match Date', 'Ground', 'Opposition'], right_on = ['Match Date', 'Ground', 'Country']).drop(columns = ['Country_y', 'Opposition_y']) | |
# rename columns | |
ODI_opposition_players.rename(columns = {'Players': 'Opposition Players', 'Country_x': 'Country', 'Opposition_x': 'Opposition'}, inplace = True) | |
# define a function to change (or unstack) our DFs | |
def df_unstack(df, players_col, col_name): | |
grouped = df.groupby(by = ['Match Date', 'Country', 'Opposition', 'Ground', 'Home/Away', 'Result']) | |
df_flat = grouped[players_col].apply(lambda x: pd.Series(x.values)).unstack().reset_index() | |
df_flat.rename(columns = {i: col_name + '_{}'.format(i + 1) for i in range(df_flat.shape[1])}, inplace = True) | |
return df_flat | |
# apply the function to the 2 DFs | |
ODI_country_players_flat = df_unstack(ODI_country_players, 'Country Players', 'Country_Player') | |
ODI_opposition_players_flat = df_unstack(ODI_opposition_players, 'Opposition Players', 'Opposition_Player') | |
# drop records with no players information | |
ODI_country_players_flat.dropna(subset = ['Country_Player_1'], inplace = True) | |
ODI_opposition_players_flat.dropna(subset = ['Opposition_Player_1'], inplace = True) | |
# another merge | |
ODI_matches = pd.merge(ODI_country_players_flat, ODI_opposition_players_flat, how = 'inner', on = ['Match Date', 'Country', 'Opposition', 'Ground', 'Home/Away', 'Result']) | |
# replace some of the players' names that I found through trial and error to make sure they are same as per the ICC Rankings data | |
ODI_matches.replace(to_replace = {'Sir RJ Hadlee': 'RJ Hadlee', 'Saleem Altaf': 'Salim Altaf', 'Saleem Pervez': 'Salim Pervez', 'Saleem Malik': 'Salim Malik', 'Saleem Yousuf': 'Salim Yousuf', 'Saleem Jaffar': 'Salim Jaffar', 'Saleem Elahi': 'Salim Elahi', 'Aamer Sohail': 'Aamir Sohail', 'Ijaz Ahmed': 'Ijaz Ahmed, sen.', 'Aaqib Javed': 'Aqib Javed', 'Rameez Raja': 'Ramiz Raja', 'Haroon Rasheed': 'Haroon Rashid', 'Shakeel Khan': 'Shakil Khan', 'Shaheen Shah Afridi': 'Shaheen Afridi', 'Z Khan': 'Zaheer Khan', 'Mohammed Shami': 'Mohammad Shami', 'Mehidy Hasan Miraz': 'Mehidy Hasan'}, inplace = True) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment