finlytics-hub/ODI_data_munging_2.py

## ODI_data_munging_2.py
# decompose ODI_matches into a long format
melted_data = pd.melt(ODI_matches, id_vars = ['Country', 'Opposition', 'Home/Away', 'Ground', 'Match Date', 'Result'], value_name = 'Players').sort_values(by = ['Match Date', 'Country']).reset_index(drop = True).drop(columns = ['variable'])

# WEB SCRAPING FOR RANKINGS
# instantiate an empty list to store the date-wise DFs
df_list = []

for d in ODI_matches['Match Date'].unique():
    # convert each unique date to a timestamp
    date = pd.Timestamp(d)
    # extract day, month and year information to be used in the URL
    day = date.strftime('%d')
    month = date.strftime('%m')
    year = date.strftime('%Y')
    # an empty dictionary to store the date-wise scrapped batting rankings data
    batting_data = {}
    # base URL to scrape batting ranks
    URL = f'http://www.relianceiccrankings.com/datespecific/odi/?stattype=batting&day={day}&month={month}&year={year}'
    # request page URL
    page = requests.get(URL)
    # get page contents and parse them
    soup = BeautifulSoup(page.content, 'html.parser')
    # find tha rankings table
    results = soup.find(id='top100battest')
    # find ranks from the rankings table
    rank_elems = results.find_all('td', class_ = 'top100id')
    # find player names from the rankings table
    name_elems = results.find_all('td', class_ = 'top100name')
    # append each rank to our batting_data dictionary
    for rank in rank_elems:
        batting_data.setdefault('Batting Rank', []).append(rank.text)
    # append each player name to our batting_data dictionary
    for name in name_elems:
        batting_data.setdefault('Players', []).append(name.text.strip())
    # convert dictionary to a DF
    batting = pd.DataFrame(batting_data)
    # add a date column
    batting['Match Date'] = d
    # convert ranks to numeric
    batting['Batting Rank'] = pd.to_numeric(batting['Batting Rank'])
    # remove '.' from names to match them with our ODI_matches DF
    batting['Players'] = batting['Players'].str.replace('.', '')

    # an empty dictionary to store the date-wise bowling rankings data
    bowling_data = {}
    # base URL to scrape the bowling ranks
    URL = f'http://www.relianceiccrankings.com/datespecific/odi/?stattype=bowling&day={day}&month={month}&year={year}'
    # request page URL
    page = requests.get(URL)
    # get page contents and parse them
    soup = BeautifulSoup(page.content, 'html.parser')
    # find the rankings table
    results = soup.find(id='top100battest')
    # find ranks from the ranking table
    rank_elems = results.find_all('td', class_ = 'top100id')
    # find player names from the ranking table
    name_elems = results.find_all('td', class_ = 'top100name')
    # append each rank to our bowling_data dictionary
    for rank in rank_elems:
        bowling_data.setdefault('Bowling Rank', []).append(rank.text)
    # append each name to our bowling_data dictionary
    for name in name_elems:
        bowling_data.setdefault('Players', []).append(name.text.strip())
    # convert the dictionary to a DF
    bowling = pd.DataFrame(bowling_data)
    # add a date column
    bowling['Match Date'] = d
    # convert ranks to numeric
    bowling['Bowling Rank'] = pd.to_numeric(bowling['Bowling Rank'])
    # remove '.' from names to match them with our ODI_matches DF
    bowling['Players'] = bowling['Players'].str.replace('.', '')

    # get match(es) details played on that date
    date_specific_data = melted_data[melted_data['Match Date'] == d]

    # merge the match details with bowling and batting DFs
    data_rankings = pd.merge(date_specific_data, bowling, how = 'left', left_on = ['Match Date','Players'], right_on = ['Match Date','Players']).merge(batting, how = 'left', left_on = ['Match Date','Players'], right_on = ['Match Date','Players'])

    # create a pandas GroupBy object that will allow proper unstacking
    grouped = data_rankings.groupby(by = ['Country', 'Opposition', 'Home/Away', 'Ground', 'Match Date', 'Result'])

    # unstack names of the players into columns and save in a DF
    players_df = grouped['Players'].apply(lambda x: pd.Series(x.values)).unstack()
    # rename the 1st 12 column names to this format: Country_Player_x
    players_df = players_df.rename(columns = {i: 'Country_Player_{}'.format(i + 1) for i in range(0, 12, 1)})
    # rename the next 12 column names to this format: Opposition_Player_x
    players_df = players_df.rename(columns = {i: 'Opposition_Player_{}'.format(i - 11) for i in range(12, 24, 1)})

    # unstack bowling ranks into columns and save in a DF
    bowling_rank_df = grouped['Bowling Rank'].apply(lambda x: pd.Series(x.values)).unstack()
    # rename the 1st 12 column names to this format: Country_Bowling_Rank_x
    bowling_rank_df = bowling_rank_df.rename(columns = {i: 'Country_Bowling Rank_{}'.format(i + 1) for i in range(0, 12, 1)})
    # rename the next 12 column names to this format: Opposition_Bowling_Rank_x
    bowling_rank_df = bowling_rank_df.rename(columns = {i: 'Opposition_Bowling Rank_{}'.format(i - 11) for i in range(12, 24, 1)})

    # unstack batting ranks into columns and save in a DF
    batting_rank_df = grouped['Batting Rank'].apply(lambda x: pd.Series(x.values)).unstack()
    # rename the 1st 12 column names to this format: Country_Batting_Rank_x
    batting_rank_df = batting_rank_df.rename(columns = {i: 'Country_Batting Rank_{}'.format(i + 1) for i in range(0, 12, 1)})
    # rename the next 12 column names to this format: Opposition_Batting_Rank_x
    batting_rank_df = batting_rank_df.rename(columns = {i: 'Opposition_Batting Rank_{}'.format(i - 11) for i in range(12, 24, 1)})

    # concatenate the 3 DFs along columns and reset index
    result = pd.concat([players_df, bowling_rank_df, batting_rank_df], axis = 1)
    result = result.reset_index()

    # append the final date-specific DF to our df_list
    df_list.append(result)

# concatenate the list of DFs into a single DF and reset index
final_result = pd.concat(df_list).reset_index(drop = True)
# convert Match Date to datetime
final_result['Match Date'] = pd.to_datetime(final_result['Match Date'])


## toss_df operations
# convert Match Date column to a datetime format
toss_df['Match Date'] = pd.to_datetime(toss_df['Match Date'], infer_datetime_format = True)
# split 'Match' column on 'v'
toss_df[['Country 1', 'Country 2']] = toss_df['Match'].str.split(' v ', expand = True)
# remove any trailing and leading whitespaces
toss_df['Country 1'] = toss_df['Country 1'].str.strip()
toss_df['Country 2'] = toss_df['Country 2'].str.strip()
# replace some other special characters like line breaks
toss_df['Country 2'] = toss_df['Country 2'].replace('\r\n', '').replace('\t', '')
# drop the redundant 'Match' column
toss_df.drop(columns = ['Match'], inplace = True)
# if we look closely at the 'Match' column, our new 'Country 2' column will have certain extra information after the country name. So let's retain the country name only
toss_df['Country 2'] = toss_df['Country 2'].str.split().str[0].str.strip()
# redo a similar operation on 'Country 1' column.
toss_df['Country 1'] = toss_df['Country 1'].str.split().str[-1].str.strip()
# since we retained only the last word of the 'Country 1' column, we need to rename the countries with multi-word names
toss_df['Country 1'].replace(to_replace = {'Zealand': 'New Zealand', 'Lanka': 'Sri Lanka', 'Africa': 'South Africa', 'Indies': 'West Indies', 'Emirates': 'United Arab Emirates', 'Kong': 'Hong Kong', 'Guinea': 'Papua New Guinea', 'States': 'United States'}, inplace = True)
# strip Ground name from the 'Ground' column (the last word or 2 basically) after creating its copy
toss_df.rename(columns = {'Ground': 'Ground_old'}, inplace = True)
toss_df['Ground'] = toss_df['Ground_old'].str.split().str[-1]
# what follows now is a a result of some manual trial and error to ensure the Ground names in our toss_df is the same as those in final_result
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Lord\'s, London'].index, 6] = 'Lord\'s'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Kennington Oval, London'].index, 6] = 'The Oval'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Buffalo Park, East London'].index, 6] = 'East London'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Albion Sports Complex, Albion, Berbice, Guyana'].index, 6] = 'Albion'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Bourda, Georgetown, Guyana'].index, 6] = 'Georgetown'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Providence Stadium, Providence, Guyana'].index, 6] = 'Providence'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Antigua Recreation Ground, St. John\'s, Antigua'].index, 6] = 'St John\'s'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Sir Vivian Richards Stadium, North Sound, Antigua'].index, 6] = 'North Sound'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Sinhalese Sports Club Ground, Colombo'].index, 6] = 'Colombo (SSC)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'P Sara Oval, Colombo'].index, 6] = 'Colombo (PSS)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'R Premadasa Stadium, Colombo'].index, 6] = 'Colombo (RPS)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Jawaharlal Nehru Stadium, New Delhi, Delhi'].index, 6] = 'New Delhi'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh'].index, 6] = 'Mohali'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Brabourne Stadium, Mumbai'].index, 6] = 'Mumbai (BS)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'The Village, Dublin'].index, 6] = 'Dublin (Malahide)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Dubai International Cricket Stadium, Dubai'].index, 6] = 'Dubai (DSC)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'ICC Global Cricket Academy, Dubai'].index, 6] = 'ICCA Dubai'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Gymkhana Club Ground, Nairobi'].index, 6] = 'Nairobi (Gym)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Nairobi Club Ground, Nairobi'].index, 6] = 'Nairobi (Club)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Aga Khan Sports Club Ground, Nairobi'].index, 6] = 'Nairobi (Aga)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Simba Union Ground, Nairobi'].index, 6] = 'Nairobi'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Jaffrey Sports Club Ground, Nairobi'].index, 6] = 'Nairobi (Jaff)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Ruaraka Sports Club Ground, Nairobi'].index, 6] = 'Nairobi (Ruaraka)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Darren Sammy National Cricket Stadium, Gros Islet, St Lucia'].index, 6] = 'Gros Islet'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Mindoo Philip Park, Castries, St. Lucia'].index, 6] = 'Castries'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Pallekele International Cricket Stadium, Pallekele, Kandy'].index, 6] = 'Pallekele'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Etihad Stadium, Melbourne'].index, 6] = 'Melbourne (Docklands)'
toss_df['Ground'].replace(to_replace = {'(Jullundur)': 'Jullundur', '(Sind)': 'Hyderabad (Sind)', 'Grenada': 'St George\'s', 'Wells': 'Tunbridge Wells', '(Deccan)': 'Hyderabad (Deccan)', 'Hyderabad': 'Hyderabad (Deccan)', '(Baroda)': 'Vadodara', 'Jamaica': 'Kingston', '(Lyallpur)': 'Faisalabad', 'Barbados': 'Bridgetown', '(Dacca)': 'Dhaka', 'Kitts': 'Basseterre', 'Lumpur': 'Kuala Lumpur', 'Ontario': 'King City (NW)', 'Dominica': 'Roseau', 'Voorburg': 'The Hague', 'Maunganui': 'Mount Maunganui', 'Noida': 'Greater Noida', 'Vincent': 'Kingstown', 'Dun': 'Dehradun', 'Bangalore': 'Bengaluru', 'Gujwranwala': 'Gujranwala', 'Trinidad': 'Port of Spain', 'Chittagong': 'Chattogram', 'Plymouth': 'New Plymouth', 'Town': 'Cape Town', 'Elizabeth': 'Port Elizabeth', 'Dhabi': 'Abu Dhabi', 'Chester-Le-Street': 'Chester-le-Street'}, inplace = True)
# remove all records that have 'Result:' in the 'Toss Won By' column
idx = toss_df[toss_df['Toss Won By'] == 'Result'].index
toss_df.drop(index = idx, inplace = True)
# sort by date
toss_df.sort_values(by = 'Match Date', ignore_index = True, inplace = True)

# next we will merge our toss_df with final_result after dropping the redundant columns
toss_df.drop(columns = ['Match Code', 'Ground_old', 'Country 1', 'Country 2'], inplace = True)
final_data = pd.merge(final_result, toss_df, how = 'left', left_on = ['Match Date', 'Ground'], right_on = ['Match Date', 'Ground'])
	# decompose ODI_matches into a long format
	melted_data = pd.melt(ODI_matches, id_vars = ['Country', 'Opposition', 'Home/Away', 'Ground', 'Match Date', 'Result'], value_name = 'Players').sort_values(by = ['Match Date', 'Country']).reset_index(drop = True).drop(columns = ['variable'])

	# WEB SCRAPING FOR RANKINGS
	# instantiate an empty list to store the date-wise DFs
	df_list = []

	for d in ODI_matches['Match Date'].unique():
	# convert each unique date to a timestamp
	date = pd.Timestamp(d)
	# extract day, month and year information to be used in the URL
	day = date.strftime('%d')
	month = date.strftime('%m')
	year = date.strftime('%Y')
	# an empty dictionary to store the date-wise scrapped batting rankings data
	batting_data = {}
	# base URL to scrape batting ranks
	URL = f'http://www.relianceiccrankings.com/datespecific/odi/?stattype=batting&day={day}&month={month}&year={year}'
	# request page URL
	page = requests.get(URL)
	# get page contents and parse them
	soup = BeautifulSoup(page.content, 'html.parser')
	# find tha rankings table
	results = soup.find(id='top100battest')
	# find ranks from the rankings table
	rank_elems = results.find_all('td', class_ = 'top100id')
	# find player names from the rankings table
	name_elems = results.find_all('td', class_ = 'top100name')
	# append each rank to our batting_data dictionary
	for rank in rank_elems:
	batting_data.setdefault('Batting Rank', []).append(rank.text)
	# append each player name to our batting_data dictionary
	for name in name_elems:
	batting_data.setdefault('Players', []).append(name.text.strip())
	# convert dictionary to a DF
	batting = pd.DataFrame(batting_data)
	# add a date column
	batting['Match Date'] = d
	# convert ranks to numeric
	batting['Batting Rank'] = pd.to_numeric(batting['Batting Rank'])
	# remove '.' from names to match them with our ODI_matches DF
	batting['Players'] = batting['Players'].str.replace('.', '')

	# an empty dictionary to store the date-wise bowling rankings data
	bowling_data = {}
	# base URL to scrape the bowling ranks
	URL = f'http://www.relianceiccrankings.com/datespecific/odi/?stattype=bowling&day={day}&month={month}&year={year}'
	# request page URL
	page = requests.get(URL)
	# get page contents and parse them
	soup = BeautifulSoup(page.content, 'html.parser')
	# find the rankings table
	results = soup.find(id='top100battest')
	# find ranks from the ranking table
	rank_elems = results.find_all('td', class_ = 'top100id')
	# find player names from the ranking table
	name_elems = results.find_all('td', class_ = 'top100name')
	# append each rank to our bowling_data dictionary
	for rank in rank_elems:
	bowling_data.setdefault('Bowling Rank', []).append(rank.text)
	# append each name to our bowling_data dictionary
	for name in name_elems:
	bowling_data.setdefault('Players', []).append(name.text.strip())
	# convert the dictionary to a DF
	bowling = pd.DataFrame(bowling_data)
	# add a date column
	bowling['Match Date'] = d
	# convert ranks to numeric
	bowling['Bowling Rank'] = pd.to_numeric(bowling['Bowling Rank'])
	# remove '.' from names to match them with our ODI_matches DF
	bowling['Players'] = bowling['Players'].str.replace('.', '')

	# get match(es) details played on that date
	date_specific_data = melted_data[melted_data['Match Date'] == d]

	# merge the match details with bowling and batting DFs
	data_rankings = pd.merge(date_specific_data, bowling, how = 'left', left_on = ['Match Date','Players'], right_on = ['Match Date','Players']).merge(batting, how = 'left', left_on = ['Match Date','Players'], right_on = ['Match Date','Players'])

	# create a pandas GroupBy object that will allow proper unstacking
	grouped = data_rankings.groupby(by = ['Country', 'Opposition', 'Home/Away', 'Ground', 'Match Date', 'Result'])

	# unstack names of the players into columns and save in a DF
	players_df = grouped['Players'].apply(lambda x: pd.Series(x.values)).unstack()
	# rename the 1st 12 column names to this format: Country_Player_x
	players_df = players_df.rename(columns = {i: 'Country_Player_{}'.format(i + 1) for i in range(0, 12, 1)})
	# rename the next 12 column names to this format: Opposition_Player_x
	players_df = players_df.rename(columns = {i: 'Opposition_Player_{}'.format(i - 11) for i in range(12, 24, 1)})

	# unstack bowling ranks into columns and save in a DF
	bowling_rank_df = grouped['Bowling Rank'].apply(lambda x: pd.Series(x.values)).unstack()
	# rename the 1st 12 column names to this format: Country_Bowling_Rank_x
	bowling_rank_df = bowling_rank_df.rename(columns = {i: 'Country_Bowling Rank_{}'.format(i + 1) for i in range(0, 12, 1)})
	# rename the next 12 column names to this format: Opposition_Bowling_Rank_x
	bowling_rank_df = bowling_rank_df.rename(columns = {i: 'Opposition_Bowling Rank_{}'.format(i - 11) for i in range(12, 24, 1)})

	# unstack batting ranks into columns and save in a DF
	batting_rank_df = grouped['Batting Rank'].apply(lambda x: pd.Series(x.values)).unstack()
	# rename the 1st 12 column names to this format: Country_Batting_Rank_x
	batting_rank_df = batting_rank_df.rename(columns = {i: 'Country_Batting Rank_{}'.format(i + 1) for i in range(0, 12, 1)})
	# rename the next 12 column names to this format: Opposition_Batting_Rank_x
	batting_rank_df = batting_rank_df.rename(columns = {i: 'Opposition_Batting Rank_{}'.format(i - 11) for i in range(12, 24, 1)})

	# concatenate the 3 DFs along columns and reset index
	result = pd.concat([players_df, bowling_rank_df, batting_rank_df], axis = 1)
	result = result.reset_index()

	# append the final date-specific DF to our df_list
	df_list.append(result)

	# concatenate the list of DFs into a single DF and reset index
	final_result = pd.concat(df_list).reset_index(drop = True)
	# convert Match Date to datetime
	final_result['Match Date'] = pd.to_datetime(final_result['Match Date'])


	## toss_df operations
	# convert Match Date column to a datetime format
	toss_df['Match Date'] = pd.to_datetime(toss_df['Match Date'], infer_datetime_format = True)
	# split 'Match' column on 'v'
	toss_df[['Country 1', 'Country 2']] = toss_df['Match'].str.split(' v ', expand = True)
	# remove any trailing and leading whitespaces
	toss_df['Country 1'] = toss_df['Country 1'].str.strip()
	toss_df['Country 2'] = toss_df['Country 2'].str.strip()
	# replace some other special characters like line breaks
	toss_df['Country 2'] = toss_df['Country 2'].replace('\r\n', '').replace('\t', '')
	# drop the redundant 'Match' column
	toss_df.drop(columns = ['Match'], inplace = True)
	# if we look closely at the 'Match' column, our new 'Country 2' column will have certain extra information after the country name. So let's retain the country name only
	toss_df['Country 2'] = toss_df['Country 2'].str.split().str[0].str.strip()
	# redo a similar operation on 'Country 1' column.
	toss_df['Country 1'] = toss_df['Country 1'].str.split().str[-1].str.strip()
	# since we retained only the last word of the 'Country 1' column, we need to rename the countries with multi-word names
	toss_df['Country 1'].replace(to_replace = {'Zealand': 'New Zealand', 'Lanka': 'Sri Lanka', 'Africa': 'South Africa', 'Indies': 'West Indies', 'Emirates': 'United Arab Emirates', 'Kong': 'Hong Kong', 'Guinea': 'Papua New Guinea', 'States': 'United States'}, inplace = True)
	# strip Ground name from the 'Ground' column (the last word or 2 basically) after creating its copy
	toss_df.rename(columns = {'Ground': 'Ground_old'}, inplace = True)
	toss_df['Ground'] = toss_df['Ground_old'].str.split().str[-1]
	# what follows now is a a result of some manual trial and error to ensure the Ground names in our toss_df is the same as those in final_result
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Lord\'s, London'].index, 6] = 'Lord\'s'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Kennington Oval, London'].index, 6] = 'The Oval'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Buffalo Park, East London'].index, 6] = 'East London'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Albion Sports Complex, Albion, Berbice, Guyana'].index, 6] = 'Albion'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Bourda, Georgetown, Guyana'].index, 6] = 'Georgetown'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Providence Stadium, Providence, Guyana'].index, 6] = 'Providence'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Antigua Recreation Ground, St. John\'s, Antigua'].index, 6] = 'St John\'s'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Sir Vivian Richards Stadium, North Sound, Antigua'].index, 6] = 'North Sound'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Sinhalese Sports Club Ground, Colombo'].index, 6] = 'Colombo (SSC)'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'P Sara Oval, Colombo'].index, 6] = 'Colombo (PSS)'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'R Premadasa Stadium, Colombo'].index, 6] = 'Colombo (RPS)'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Jawaharlal Nehru Stadium, New Delhi, Delhi'].index, 6] = 'New Delhi'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh'].index, 6] = 'Mohali'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Brabourne Stadium, Mumbai'].index, 6] = 'Mumbai (BS)'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'The Village, Dublin'].index, 6] = 'Dublin (Malahide)'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Dubai International Cricket Stadium, Dubai'].index, 6] = 'Dubai (DSC)'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'ICC Global Cricket Academy, Dubai'].index, 6] = 'ICCA Dubai'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Gymkhana Club Ground, Nairobi'].index, 6] = 'Nairobi (Gym)'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Nairobi Club Ground, Nairobi'].index, 6] = 'Nairobi (Club)'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Aga Khan Sports Club Ground, Nairobi'].index, 6] = 'Nairobi (Aga)'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Simba Union Ground, Nairobi'].index, 6] = 'Nairobi'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Jaffrey Sports Club Ground, Nairobi'].index, 6] = 'Nairobi (Jaff)'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Ruaraka Sports Club Ground, Nairobi'].index, 6] = 'Nairobi (Ruaraka)'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Darren Sammy National Cricket Stadium, Gros Islet, St Lucia'].index, 6] = 'Gros Islet'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Mindoo Philip Park, Castries, St. Lucia'].index, 6] = 'Castries'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Pallekele International Cricket Stadium, Pallekele, Kandy'].index, 6] = 'Pallekele'
	toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Etihad Stadium, Melbourne'].index, 6] = 'Melbourne (Docklands)'
	toss_df['Ground'].replace(to_replace = {'(Jullundur)': 'Jullundur', '(Sind)': 'Hyderabad (Sind)', 'Grenada': 'St George\'s', 'Wells': 'Tunbridge Wells', '(Deccan)': 'Hyderabad (Deccan)', 'Hyderabad': 'Hyderabad (Deccan)', '(Baroda)': 'Vadodara', 'Jamaica': 'Kingston', '(Lyallpur)': 'Faisalabad', 'Barbados': 'Bridgetown', '(Dacca)': 'Dhaka', 'Kitts': 'Basseterre', 'Lumpur': 'Kuala Lumpur', 'Ontario': 'King City (NW)', 'Dominica': 'Roseau', 'Voorburg': 'The Hague', 'Maunganui': 'Mount Maunganui', 'Noida': 'Greater Noida', 'Vincent': 'Kingstown', 'Dun': 'Dehradun', 'Bangalore': 'Bengaluru', 'Gujwranwala': 'Gujranwala', 'Trinidad': 'Port of Spain', 'Chittagong': 'Chattogram', 'Plymouth': 'New Plymouth', 'Town': 'Cape Town', 'Elizabeth': 'Port Elizabeth', 'Dhabi': 'Abu Dhabi', 'Chester-Le-Street': 'Chester-le-Street'}, inplace = True)
	# remove all records that have 'Result:' in the 'Toss Won By' column
	idx = toss_df[toss_df['Toss Won By'] == 'Result'].index
	toss_df.drop(index = idx, inplace = True)
	# sort by date
	toss_df.sort_values(by = 'Match Date', ignore_index = True, inplace = True)

	# next we will merge our toss_df with final_result after dropping the redundant columns
	toss_df.drop(columns = ['Match Code', 'Ground_old', 'Country 1', 'Country 2'], inplace = True)
	final_data = pd.merge(final_result, toss_df, how = 'left', left_on = ['Match Date', 'Ground'], right_on = ['Match Date', 'Ground'])