Last active
October 2, 2020 17:25
-
-
Save finlytics-hub/ec719e6fb4690870bb66ab052127a17b to your computer and use it in GitHub Desktop.
ODI: Data Munging Part 2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# decompose ODI_matches into a long format | |
melted_data = pd.melt(ODI_matches, id_vars = ['Country', 'Opposition', 'Home/Away', 'Ground', 'Match Date', 'Result'], value_name = 'Players').sort_values(by = ['Match Date', 'Country']).reset_index(drop = True).drop(columns = ['variable']) | |
# WEB SCRAPING FOR RANKINGS | |
# instantiate an empty list to store the date-wise DFs | |
df_list = [] | |
for d in ODI_matches['Match Date'].unique(): | |
# convert each unique date to a timestamp | |
date = pd.Timestamp(d) | |
# extract day, month and year information to be used in the URL | |
day = date.strftime('%d') | |
month = date.strftime('%m') | |
year = date.strftime('%Y') | |
# an empty dictionary to store the date-wise scrapped batting rankings data | |
batting_data = {} | |
# base URL to scrape batting ranks | |
URL = f'http://www.relianceiccrankings.com/datespecific/odi/?stattype=batting&day={day}&month={month}&year={year}' | |
# request page URL | |
page = requests.get(URL) | |
# get page contents and parse them | |
soup = BeautifulSoup(page.content, 'html.parser') | |
# find tha rankings table | |
results = soup.find(id='top100battest') | |
# find ranks from the rankings table | |
rank_elems = results.find_all('td', class_ = 'top100id') | |
# find player names from the rankings table | |
name_elems = results.find_all('td', class_ = 'top100name') | |
# append each rank to our batting_data dictionary | |
for rank in rank_elems: | |
batting_data.setdefault('Batting Rank', []).append(rank.text) | |
# append each player name to our batting_data dictionary | |
for name in name_elems: | |
batting_data.setdefault('Players', []).append(name.text.strip()) | |
# convert dictionary to a DF | |
batting = pd.DataFrame(batting_data) | |
# add a date column | |
batting['Match Date'] = d | |
# convert ranks to numeric | |
batting['Batting Rank'] = pd.to_numeric(batting['Batting Rank']) | |
# remove '.' from names to match them with our ODI_matches DF | |
batting['Players'] = batting['Players'].str.replace('.', '') | |
# an empty dictionary to store the date-wise bowling rankings data | |
bowling_data = {} | |
# base URL to scrape the bowling ranks | |
URL = f'http://www.relianceiccrankings.com/datespecific/odi/?stattype=bowling&day={day}&month={month}&year={year}' | |
# request page URL | |
page = requests.get(URL) | |
# get page contents and parse them | |
soup = BeautifulSoup(page.content, 'html.parser') | |
# find the rankings table | |
results = soup.find(id='top100battest') | |
# find ranks from the ranking table | |
rank_elems = results.find_all('td', class_ = 'top100id') | |
# find player names from the ranking table | |
name_elems = results.find_all('td', class_ = 'top100name') | |
# append each rank to our bowling_data dictionary | |
for rank in rank_elems: | |
bowling_data.setdefault('Bowling Rank', []).append(rank.text) | |
# append each name to our bowling_data dictionary | |
for name in name_elems: | |
bowling_data.setdefault('Players', []).append(name.text.strip()) | |
# convert the dictionary to a DF | |
bowling = pd.DataFrame(bowling_data) | |
# add a date column | |
bowling['Match Date'] = d | |
# convert ranks to numeric | |
bowling['Bowling Rank'] = pd.to_numeric(bowling['Bowling Rank']) | |
# remove '.' from names to match them with our ODI_matches DF | |
bowling['Players'] = bowling['Players'].str.replace('.', '') | |
# get match(es) details played on that date | |
date_specific_data = melted_data[melted_data['Match Date'] == d] | |
# merge the match details with bowling and batting DFs | |
data_rankings = pd.merge(date_specific_data, bowling, how = 'left', left_on = ['Match Date','Players'], right_on = ['Match Date','Players']).merge(batting, how = 'left', left_on = ['Match Date','Players'], right_on = ['Match Date','Players']) | |
# create a pandas GroupBy object that will allow proper unstacking | |
grouped = data_rankings.groupby(by = ['Country', 'Opposition', 'Home/Away', 'Ground', 'Match Date', 'Result']) | |
# unstack names of the players into columns and save in a DF | |
players_df = grouped['Players'].apply(lambda x: pd.Series(x.values)).unstack() | |
# rename the 1st 12 column names to this format: Country_Player_x | |
players_df = players_df.rename(columns = {i: 'Country_Player_{}'.format(i + 1) for i in range(0, 12, 1)}) | |
# rename the next 12 column names to this format: Opposition_Player_x | |
players_df = players_df.rename(columns = {i: 'Opposition_Player_{}'.format(i - 11) for i in range(12, 24, 1)}) | |
# unstack bowling ranks into columns and save in a DF | |
bowling_rank_df = grouped['Bowling Rank'].apply(lambda x: pd.Series(x.values)).unstack() | |
# rename the 1st 12 column names to this format: Country_Bowling_Rank_x | |
bowling_rank_df = bowling_rank_df.rename(columns = {i: 'Country_Bowling Rank_{}'.format(i + 1) for i in range(0, 12, 1)}) | |
# rename the next 12 column names to this format: Opposition_Bowling_Rank_x | |
bowling_rank_df = bowling_rank_df.rename(columns = {i: 'Opposition_Bowling Rank_{}'.format(i - 11) for i in range(12, 24, 1)}) | |
# unstack batting ranks into columns and save in a DF | |
batting_rank_df = grouped['Batting Rank'].apply(lambda x: pd.Series(x.values)).unstack() | |
# rename the 1st 12 column names to this format: Country_Batting_Rank_x | |
batting_rank_df = batting_rank_df.rename(columns = {i: 'Country_Batting Rank_{}'.format(i + 1) for i in range(0, 12, 1)}) | |
# rename the next 12 column names to this format: Opposition_Batting_Rank_x | |
batting_rank_df = batting_rank_df.rename(columns = {i: 'Opposition_Batting Rank_{}'.format(i - 11) for i in range(12, 24, 1)}) | |
# concatenate the 3 DFs along columns and reset index | |
result = pd.concat([players_df, bowling_rank_df, batting_rank_df], axis = 1) | |
result = result.reset_index() | |
# append the final date-specific DF to our df_list | |
df_list.append(result) | |
# concatenate the list of DFs into a single DF and reset index | |
final_result = pd.concat(df_list).reset_index(drop = True) | |
# convert Match Date to datetime | |
final_result['Match Date'] = pd.to_datetime(final_result['Match Date']) | |
## toss_df operations | |
# convert Match Date column to a datetime format | |
toss_df['Match Date'] = pd.to_datetime(toss_df['Match Date'], infer_datetime_format = True) | |
# split 'Match' column on 'v' | |
toss_df[['Country 1', 'Country 2']] = toss_df['Match'].str.split(' v ', expand = True) | |
# remove any trailing and leading whitespaces | |
toss_df['Country 1'] = toss_df['Country 1'].str.strip() | |
toss_df['Country 2'] = toss_df['Country 2'].str.strip() | |
# replace some other special characters like line breaks | |
toss_df['Country 2'] = toss_df['Country 2'].replace('\r\n', '').replace('\t', '') | |
# drop the redundant 'Match' column | |
toss_df.drop(columns = ['Match'], inplace = True) | |
# if we look closely at the 'Match' column, our new 'Country 2' column will have certain extra information after the country name. So let's retain the country name only | |
toss_df['Country 2'] = toss_df['Country 2'].str.split().str[0].str.strip() | |
# redo a similar operation on 'Country 1' column. | |
toss_df['Country 1'] = toss_df['Country 1'].str.split().str[-1].str.strip() | |
# since we retained only the last word of the 'Country 1' column, we need to rename the countries with multi-word names | |
toss_df['Country 1'].replace(to_replace = {'Zealand': 'New Zealand', 'Lanka': 'Sri Lanka', 'Africa': 'South Africa', 'Indies': 'West Indies', 'Emirates': 'United Arab Emirates', 'Kong': 'Hong Kong', 'Guinea': 'Papua New Guinea', 'States': 'United States'}, inplace = True) | |
# strip Ground name from the 'Ground' column (the last word or 2 basically) after creating its copy | |
toss_df.rename(columns = {'Ground': 'Ground_old'}, inplace = True) | |
toss_df['Ground'] = toss_df['Ground_old'].str.split().str[-1] | |
# what follows now is a a result of some manual trial and error to ensure the Ground names in our toss_df is the same as those in final_result | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Lord\'s, London'].index, 6] = 'Lord\'s' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Kennington Oval, London'].index, 6] = 'The Oval' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Buffalo Park, East London'].index, 6] = 'East London' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Albion Sports Complex, Albion, Berbice, Guyana'].index, 6] = 'Albion' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Bourda, Georgetown, Guyana'].index, 6] = 'Georgetown' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Providence Stadium, Providence, Guyana'].index, 6] = 'Providence' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Antigua Recreation Ground, St. John\'s, Antigua'].index, 6] = 'St John\'s' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Sir Vivian Richards Stadium, North Sound, Antigua'].index, 6] = 'North Sound' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Sinhalese Sports Club Ground, Colombo'].index, 6] = 'Colombo (SSC)' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'P Sara Oval, Colombo'].index, 6] = 'Colombo (PSS)' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'R Premadasa Stadium, Colombo'].index, 6] = 'Colombo (RPS)' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Jawaharlal Nehru Stadium, New Delhi, Delhi'].index, 6] = 'New Delhi' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh'].index, 6] = 'Mohali' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Brabourne Stadium, Mumbai'].index, 6] = 'Mumbai (BS)' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'The Village, Dublin'].index, 6] = 'Dublin (Malahide)' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Dubai International Cricket Stadium, Dubai'].index, 6] = 'Dubai (DSC)' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'ICC Global Cricket Academy, Dubai'].index, 6] = 'ICCA Dubai' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Gymkhana Club Ground, Nairobi'].index, 6] = 'Nairobi (Gym)' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Nairobi Club Ground, Nairobi'].index, 6] = 'Nairobi (Club)' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Aga Khan Sports Club Ground, Nairobi'].index, 6] = 'Nairobi (Aga)' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Simba Union Ground, Nairobi'].index, 6] = 'Nairobi' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Jaffrey Sports Club Ground, Nairobi'].index, 6] = 'Nairobi (Jaff)' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Ruaraka Sports Club Ground, Nairobi'].index, 6] = 'Nairobi (Ruaraka)' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Darren Sammy National Cricket Stadium, Gros Islet, St Lucia'].index, 6] = 'Gros Islet' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Mindoo Philip Park, Castries, St. Lucia'].index, 6] = 'Castries' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Pallekele International Cricket Stadium, Pallekele, Kandy'].index, 6] = 'Pallekele' | |
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Etihad Stadium, Melbourne'].index, 6] = 'Melbourne (Docklands)' | |
toss_df['Ground'].replace(to_replace = {'(Jullundur)': 'Jullundur', '(Sind)': 'Hyderabad (Sind)', 'Grenada': 'St George\'s', 'Wells': 'Tunbridge Wells', '(Deccan)': 'Hyderabad (Deccan)', 'Hyderabad': 'Hyderabad (Deccan)', '(Baroda)': 'Vadodara', 'Jamaica': 'Kingston', '(Lyallpur)': 'Faisalabad', 'Barbados': 'Bridgetown', '(Dacca)': 'Dhaka', 'Kitts': 'Basseterre', 'Lumpur': 'Kuala Lumpur', 'Ontario': 'King City (NW)', 'Dominica': 'Roseau', 'Voorburg': 'The Hague', 'Maunganui': 'Mount Maunganui', 'Noida': 'Greater Noida', 'Vincent': 'Kingstown', 'Dun': 'Dehradun', 'Bangalore': 'Bengaluru', 'Gujwranwala': 'Gujranwala', 'Trinidad': 'Port of Spain', 'Chittagong': 'Chattogram', 'Plymouth': 'New Plymouth', 'Town': 'Cape Town', 'Elizabeth': 'Port Elizabeth', 'Dhabi': 'Abu Dhabi', 'Chester-Le-Street': 'Chester-le-Street'}, inplace = True) | |
# remove all records that have 'Result:' in the 'Toss Won By' column | |
idx = toss_df[toss_df['Toss Won By'] == 'Result'].index | |
toss_df.drop(index = idx, inplace = True) | |
# sort by date | |
toss_df.sort_values(by = 'Match Date', ignore_index = True, inplace = True) | |
# next we will merge our toss_df with final_result after dropping the redundant columns | |
toss_df.drop(columns = ['Match Code', 'Ground_old', 'Country 1', 'Country 2'], inplace = True) | |
final_data = pd.merge(final_result, toss_df, how = 'left', left_on = ['Match Date', 'Ground'], right_on = ['Match Date', 'Ground']) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment