Skip to content

Instantly share code, notes, and snippets.

@finlytics-hub
Last active October 2, 2020 17:25
Show Gist options
  • Save finlytics-hub/ec719e6fb4690870bb66ab052127a17b to your computer and use it in GitHub Desktop.
Save finlytics-hub/ec719e6fb4690870bb66ab052127a17b to your computer and use it in GitHub Desktop.
ODI: Data Munging Part 2
# decompose ODI_matches into a long format
melted_data = pd.melt(ODI_matches, id_vars = ['Country', 'Opposition', 'Home/Away', 'Ground', 'Match Date', 'Result'], value_name = 'Players').sort_values(by = ['Match Date', 'Country']).reset_index(drop = True).drop(columns = ['variable'])
# WEB SCRAPING FOR RANKINGS
# instantiate an empty list to store the date-wise DFs
df_list = []
for d in ODI_matches['Match Date'].unique():
# convert each unique date to a timestamp
date = pd.Timestamp(d)
# extract day, month and year information to be used in the URL
day = date.strftime('%d')
month = date.strftime('%m')
year = date.strftime('%Y')
# an empty dictionary to store the date-wise scrapped batting rankings data
batting_data = {}
# base URL to scrape batting ranks
URL = f'http://www.relianceiccrankings.com/datespecific/odi/?stattype=batting&day={day}&month={month}&year={year}'
# request page URL
page = requests.get(URL)
# get page contents and parse them
soup = BeautifulSoup(page.content, 'html.parser')
# find tha rankings table
results = soup.find(id='top100battest')
# find ranks from the rankings table
rank_elems = results.find_all('td', class_ = 'top100id')
# find player names from the rankings table
name_elems = results.find_all('td', class_ = 'top100name')
# append each rank to our batting_data dictionary
for rank in rank_elems:
batting_data.setdefault('Batting Rank', []).append(rank.text)
# append each player name to our batting_data dictionary
for name in name_elems:
batting_data.setdefault('Players', []).append(name.text.strip())
# convert dictionary to a DF
batting = pd.DataFrame(batting_data)
# add a date column
batting['Match Date'] = d
# convert ranks to numeric
batting['Batting Rank'] = pd.to_numeric(batting['Batting Rank'])
# remove '.' from names to match them with our ODI_matches DF
batting['Players'] = batting['Players'].str.replace('.', '')
# an empty dictionary to store the date-wise bowling rankings data
bowling_data = {}
# base URL to scrape the bowling ranks
URL = f'http://www.relianceiccrankings.com/datespecific/odi/?stattype=bowling&day={day}&month={month}&year={year}'
# request page URL
page = requests.get(URL)
# get page contents and parse them
soup = BeautifulSoup(page.content, 'html.parser')
# find the rankings table
results = soup.find(id='top100battest')
# find ranks from the ranking table
rank_elems = results.find_all('td', class_ = 'top100id')
# find player names from the ranking table
name_elems = results.find_all('td', class_ = 'top100name')
# append each rank to our bowling_data dictionary
for rank in rank_elems:
bowling_data.setdefault('Bowling Rank', []).append(rank.text)
# append each name to our bowling_data dictionary
for name in name_elems:
bowling_data.setdefault('Players', []).append(name.text.strip())
# convert the dictionary to a DF
bowling = pd.DataFrame(bowling_data)
# add a date column
bowling['Match Date'] = d
# convert ranks to numeric
bowling['Bowling Rank'] = pd.to_numeric(bowling['Bowling Rank'])
# remove '.' from names to match them with our ODI_matches DF
bowling['Players'] = bowling['Players'].str.replace('.', '')
# get match(es) details played on that date
date_specific_data = melted_data[melted_data['Match Date'] == d]
# merge the match details with bowling and batting DFs
data_rankings = pd.merge(date_specific_data, bowling, how = 'left', left_on = ['Match Date','Players'], right_on = ['Match Date','Players']).merge(batting, how = 'left', left_on = ['Match Date','Players'], right_on = ['Match Date','Players'])
# create a pandas GroupBy object that will allow proper unstacking
grouped = data_rankings.groupby(by = ['Country', 'Opposition', 'Home/Away', 'Ground', 'Match Date', 'Result'])
# unstack names of the players into columns and save in a DF
players_df = grouped['Players'].apply(lambda x: pd.Series(x.values)).unstack()
# rename the 1st 12 column names to this format: Country_Player_x
players_df = players_df.rename(columns = {i: 'Country_Player_{}'.format(i + 1) for i in range(0, 12, 1)})
# rename the next 12 column names to this format: Opposition_Player_x
players_df = players_df.rename(columns = {i: 'Opposition_Player_{}'.format(i - 11) for i in range(12, 24, 1)})
# unstack bowling ranks into columns and save in a DF
bowling_rank_df = grouped['Bowling Rank'].apply(lambda x: pd.Series(x.values)).unstack()
# rename the 1st 12 column names to this format: Country_Bowling_Rank_x
bowling_rank_df = bowling_rank_df.rename(columns = {i: 'Country_Bowling Rank_{}'.format(i + 1) for i in range(0, 12, 1)})
# rename the next 12 column names to this format: Opposition_Bowling_Rank_x
bowling_rank_df = bowling_rank_df.rename(columns = {i: 'Opposition_Bowling Rank_{}'.format(i - 11) for i in range(12, 24, 1)})
# unstack batting ranks into columns and save in a DF
batting_rank_df = grouped['Batting Rank'].apply(lambda x: pd.Series(x.values)).unstack()
# rename the 1st 12 column names to this format: Country_Batting_Rank_x
batting_rank_df = batting_rank_df.rename(columns = {i: 'Country_Batting Rank_{}'.format(i + 1) for i in range(0, 12, 1)})
# rename the next 12 column names to this format: Opposition_Batting_Rank_x
batting_rank_df = batting_rank_df.rename(columns = {i: 'Opposition_Batting Rank_{}'.format(i - 11) for i in range(12, 24, 1)})
# concatenate the 3 DFs along columns and reset index
result = pd.concat([players_df, bowling_rank_df, batting_rank_df], axis = 1)
result = result.reset_index()
# append the final date-specific DF to our df_list
df_list.append(result)
# concatenate the list of DFs into a single DF and reset index
final_result = pd.concat(df_list).reset_index(drop = True)
# convert Match Date to datetime
final_result['Match Date'] = pd.to_datetime(final_result['Match Date'])
## toss_df operations
# convert Match Date column to a datetime format
toss_df['Match Date'] = pd.to_datetime(toss_df['Match Date'], infer_datetime_format = True)
# split 'Match' column on 'v'
toss_df[['Country 1', 'Country 2']] = toss_df['Match'].str.split(' v ', expand = True)
# remove any trailing and leading whitespaces
toss_df['Country 1'] = toss_df['Country 1'].str.strip()
toss_df['Country 2'] = toss_df['Country 2'].str.strip()
# replace some other special characters like line breaks
toss_df['Country 2'] = toss_df['Country 2'].replace('\r\n', '').replace('\t', '')
# drop the redundant 'Match' column
toss_df.drop(columns = ['Match'], inplace = True)
# if we look closely at the 'Match' column, our new 'Country 2' column will have certain extra information after the country name. So let's retain the country name only
toss_df['Country 2'] = toss_df['Country 2'].str.split().str[0].str.strip()
# redo a similar operation on 'Country 1' column.
toss_df['Country 1'] = toss_df['Country 1'].str.split().str[-1].str.strip()
# since we retained only the last word of the 'Country 1' column, we need to rename the countries with multi-word names
toss_df['Country 1'].replace(to_replace = {'Zealand': 'New Zealand', 'Lanka': 'Sri Lanka', 'Africa': 'South Africa', 'Indies': 'West Indies', 'Emirates': 'United Arab Emirates', 'Kong': 'Hong Kong', 'Guinea': 'Papua New Guinea', 'States': 'United States'}, inplace = True)
# strip Ground name from the 'Ground' column (the last word or 2 basically) after creating its copy
toss_df.rename(columns = {'Ground': 'Ground_old'}, inplace = True)
toss_df['Ground'] = toss_df['Ground_old'].str.split().str[-1]
# what follows now is a a result of some manual trial and error to ensure the Ground names in our toss_df is the same as those in final_result
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Lord\'s, London'].index, 6] = 'Lord\'s'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Kennington Oval, London'].index, 6] = 'The Oval'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Buffalo Park, East London'].index, 6] = 'East London'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Albion Sports Complex, Albion, Berbice, Guyana'].index, 6] = 'Albion'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Bourda, Georgetown, Guyana'].index, 6] = 'Georgetown'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Providence Stadium, Providence, Guyana'].index, 6] = 'Providence'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Antigua Recreation Ground, St. John\'s, Antigua'].index, 6] = 'St John\'s'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Sir Vivian Richards Stadium, North Sound, Antigua'].index, 6] = 'North Sound'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Sinhalese Sports Club Ground, Colombo'].index, 6] = 'Colombo (SSC)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'P Sara Oval, Colombo'].index, 6] = 'Colombo (PSS)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'R Premadasa Stadium, Colombo'].index, 6] = 'Colombo (RPS)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Jawaharlal Nehru Stadium, New Delhi, Delhi'].index, 6] = 'New Delhi'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh'].index, 6] = 'Mohali'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Brabourne Stadium, Mumbai'].index, 6] = 'Mumbai (BS)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'The Village, Dublin'].index, 6] = 'Dublin (Malahide)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Dubai International Cricket Stadium, Dubai'].index, 6] = 'Dubai (DSC)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'ICC Global Cricket Academy, Dubai'].index, 6] = 'ICCA Dubai'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Gymkhana Club Ground, Nairobi'].index, 6] = 'Nairobi (Gym)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Nairobi Club Ground, Nairobi'].index, 6] = 'Nairobi (Club)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Aga Khan Sports Club Ground, Nairobi'].index, 6] = 'Nairobi (Aga)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Simba Union Ground, Nairobi'].index, 6] = 'Nairobi'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Jaffrey Sports Club Ground, Nairobi'].index, 6] = 'Nairobi (Jaff)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Ruaraka Sports Club Ground, Nairobi'].index, 6] = 'Nairobi (Ruaraka)'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Darren Sammy National Cricket Stadium, Gros Islet, St Lucia'].index, 6] = 'Gros Islet'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Mindoo Philip Park, Castries, St. Lucia'].index, 6] = 'Castries'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Pallekele International Cricket Stadium, Pallekele, Kandy'].index, 6] = 'Pallekele'
toss_df.iloc[toss_df[toss_df['Ground_old'] == 'Etihad Stadium, Melbourne'].index, 6] = 'Melbourne (Docklands)'
toss_df['Ground'].replace(to_replace = {'(Jullundur)': 'Jullundur', '(Sind)': 'Hyderabad (Sind)', 'Grenada': 'St George\'s', 'Wells': 'Tunbridge Wells', '(Deccan)': 'Hyderabad (Deccan)', 'Hyderabad': 'Hyderabad (Deccan)', '(Baroda)': 'Vadodara', 'Jamaica': 'Kingston', '(Lyallpur)': 'Faisalabad', 'Barbados': 'Bridgetown', '(Dacca)': 'Dhaka', 'Kitts': 'Basseterre', 'Lumpur': 'Kuala Lumpur', 'Ontario': 'King City (NW)', 'Dominica': 'Roseau', 'Voorburg': 'The Hague', 'Maunganui': 'Mount Maunganui', 'Noida': 'Greater Noida', 'Vincent': 'Kingstown', 'Dun': 'Dehradun', 'Bangalore': 'Bengaluru', 'Gujwranwala': 'Gujranwala', 'Trinidad': 'Port of Spain', 'Chittagong': 'Chattogram', 'Plymouth': 'New Plymouth', 'Town': 'Cape Town', 'Elizabeth': 'Port Elizabeth', 'Dhabi': 'Abu Dhabi', 'Chester-Le-Street': 'Chester-le-Street'}, inplace = True)
# remove all records that have 'Result:' in the 'Toss Won By' column
idx = toss_df[toss_df['Toss Won By'] == 'Result'].index
toss_df.drop(index = idx, inplace = True)
# sort by date
toss_df.sort_values(by = 'Match Date', ignore_index = True, inplace = True)
# next we will merge our toss_df with final_result after dropping the redundant columns
toss_df.drop(columns = ['Match Code', 'Ground_old', 'Country 1', 'Country 2'], inplace = True)
final_data = pd.merge(final_result, toss_df, how = 'left', left_on = ['Match Date', 'Ground'], right_on = ['Match Date', 'Ground'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment