Skip to content

Instantly share code, notes, and snippets.

@marcelcaraciolo
Last active July 25, 2023 10:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save marcelcaraciolo/c932a20ecc409cb5b0f7967a60cbe4cf to your computer and use it in GitHub Desktop.
Save marcelcaraciolo/c932a20ecc409cb5b0f7967a60cbe4cf to your computer and use it in GitHub Desktop.
Scraper for the World Athletics Marathon World Rankings for Men and Women (2019-2023)
import json
import re
import grequests
import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4.diagnose import profile
# Ranking-page URL templates. The two ``{}`` placeholders are filled, in
# order, with the page number and the ranking date (``rankDate``).
URL_MEN_MARATHON = "https://worldathletics.org/world-rankings/marathon/men?regionType=world&page={}&rankDate={}&limitByCountry=0"
URL_WOMEN_MARATHON = "https://worldathletics.org/world-rankings/marathon/women?regionType=world&page={}&rankDate={}&limitByCountry=0"
# Per-competitor score-calculation endpoint. The placeholder is filled with
# the ``data-id`` attribute read from a ranking-table row.
URL_DETAILS_COMPETITOR = 'https://worldathletics.org/WorldRanking/RankingScoreCalculation?competitorId={}'
#Uncomment to select the dates to be extracted
#RANK_DATES = [{"value":"2023-06-27","label":"27 JUN 2023"}, {"value":"2023-06-20","label":"20 JUN 2023"},]
#RANK_DATES = [{"value":"2023-06-13","label":"13 JUN 2023"},{"value":"2023-06-06","label":"06 JUN 2023"},{"value":"2023-05-30","label":"30 MAY 2023"},{"value":"2023-05-23","label":"23 MAY 2023"},{"value":"2023-05-16","label":"16 MAY 2023"}]
#RANK_DATES = [{"value":"2023-05-09","label":"09 MAY 2023"},{"value":"2023-05-02","label":"02 MAY 2023"},{"value":"2023-04-25","label":"25 APR 2023"},{"value":"2023-04-18","label":"18 APR 2023"},{"value":"2023-04-11","label":"11 APR 2023"},{"value":"2023-04-04","label":"04 APR 2023"},{"value":"2023-03-28","label":"28 MAR 2023"},{"value":"2023-03-21","label":"21 MAR 2023"},{"value":"2023-03-14","label":"14 MAR 2023"},{"value":"2023-03-07","label":"07 MAR 2023"},{"value":"2023-02-28","label":"28 FEB 2023"},{"value":"2023-02-19","label":"19 FEB 2023"},{"value":"2023-02-14","label":"14 FEB 2023"},{"value":"2023-02-07","label":"07 FEB 2023"},{"value":"2023-01-31","label":"31 JAN 2023"},{"value":"2023-01-24","label":"24 JAN 2023"},{"value":"2023-01-17","label":"17 JAN 2023"},{"value":"2023-01-10","label":"10 JAN 2023"},{"value":"2023-01-03","label":"03 JAN 2023"},{"value":"2022-12-27","label":"27 DEC 2022"},{"value":"2022-12-20","label":"20 DEC 2022"},{"value":"2022-12-13","label":"13 DEC 2022"},{"value":"2022-12-06","label":"06 DEC 2022"},{"value":"2022-11-29","label":"29 NOV 2022"},{"value":"2022-11-22","label":"22 NOV 2022"},{"value":"2022-11-15","label":"15 NOV 2022"},{"value":"2022-11-08","label":"08 NOV 2022"},{"value":"2022-11-01","label":"01 NOV 2022"},{"value":"2022-10-25","label":"25 OCT 2022"},{"value":"2022-10-18","label":"18 OCT 2022"},{"value":"2022-10-11","label":"11 OCT 2022"},{"value":"2022-10-04","label":"04 OCT 2022"},{"value":"2022-09-27","label":"27 SEP 2022"},{"value":"2022-09-20","label":"20 SEP 2022"},{"value":"2022-09-13","label":"13 SEP 2022"},{"value":"2022-09-06","label":"06 SEP 2022"},{"value":"2022-08-30","label":"30 AUG 2022"},{"value":"2022-08-23","label":"23 AUG 2022"},{"value":"2022-08-16","label":"16 AUG 2022"},{"value":"2022-08-09","label":"09 AUG 2022"},{"value":"2022-08-02","label":"02 AUG 2022"},{"value":"2022-07-26","label":"26 JUL 2022"},{"value":"2022-07-11","label":"11 JUL 2022"},{"value":"2022-07-05","label":"05 JUL 2022"},{"value":"2022-06-26","label":"26 JUN 2022"},{"value":"2022-06-21","label":"21 JUN 2022"},{"value":"2022-06-14","label":"14 JUN 2022"},{"value":"2022-06-07","label":"07 JUN 2022"},{"value":"2022-05-29","label":"29 MAY 2022"},{"value":"2022-05-24","label":"24 MAY 2022"},{"value":"2022-05-17","label":"17 MAY 2022"},{"value":"2022-05-10","label":"10 MAY 2022"},{"value":"2022-05-03","label":"03 MAY 2022"},{"value":"2022-04-26","label":"26 APR 2022"},{"value":"2022-04-19","label":"19 APR 2022"},{"value":"2022-04-12","label":"12 APR 2022"},{"value":"2022-04-05","label":"05 APR 2022"},{"value":"2022-03-29","label":"29 MAR 2022"},{"value":"2022-03-22","label":"22 MAR 2022"},{"value":"2022-03-15","label":"15 MAR 2022"},{"value":"2022-03-07","label":"07 MAR 2022"},{"value":"2022-03-01","label":"01 MAR 2022"},{"value":"2022-02-22","label":"22 FEB 2022"},{"value":"2022-02-15","label":"15 FEB 2022"},{"value":"2022-02-08","label":"08 FEB 2022"},{"value":"2022-02-01","label":"01 FEB 2022"},{"value":"2022-01-25","label":"25 JAN 2022"},{"value":"2022-01-18","label":"18 JAN 2022"},{"value":"2022-01-11","label":"11 JAN 2022"},{"value":"2022-01-04","label":"04 JAN 2022"},{"value":"2021-12-28","label":"28 DEC 2021"},{"value":"2021-12-21","label":"21 DEC 2021"},{"value":"2021-12-14","label":"14 DEC 2021"},{"value":"2021-12-07","label":"07 DEC 2021"},{"value":"2021-11-30","label":"30 NOV 2021"},{"value":"2021-11-23","label":"23 NOV 2021"},{"value":"2021-11-16","label":"16 NOV 2021"},{"value":"2021-11-09","label":"09 NOV 2021"},{"value":"2021-11-02","label":"02 NOV 2021"},{"value":"2021-10-26","label":"26 OCT 2021"},{"value":"2021-10-19","label":"19 OCT 2021"},{"value":"2021-10-12","label":"12 OCT 2021"},{"value":"2021-10-05","label":"05 OCT 2021"},{"value":"2021-09-28","label":"28 SEP 2021"},{"value":"2021-09-21","label":"21 SEP 2021"},{"value":"2021-09-14","label":"14 SEP 2021"},{"value":"2021-09-07","label":"07 SEP 2021"},{"value":"2021-08-31","label":"31 AUG 2021"},{"value":"2021-08-24","label":"24 AUG 2021"},{"value":"2021-08-17","label":"17 AUG 2021"},{"value":"2021-08-10","label":"10 AUG 2021"},{"value":"2021-07-20","label":"20 JUL 2021"},{"value":"2021-07-13","label":"13 JUL 2021"},{"value":"2021-07-06","label":"06 JUL 2021"},{"value":"2021-06-29","label":"29 JUN 2021"},{"value":"2021-06-22","label":"22 JUN 2021"},{"value":"2021-06-15","label":"15 JUN 2021"},{"value":"2021-06-08","label":"08 JUN 2021"},{"value":"2021-05-31","label":"31 MAY 2021"},{"value":"2021-05-25","label":"25 MAY 2021"},{"value":"2021-05-18","label":"18 MAY 2021"},{"value":"2021-05-11","label":"11 MAY 2021"},{"value":"2021-05-04","label":"04 MAY 2021"},{"value":"2021-04-27","label":"27 APR 2021"},{"value":"2021-04-20","label":"20 APR 2021"},{"value":"2021-04-13","label":"13 APR 2021"},{"value":"2021-04-06","label":"06 APR 2021"},{"value":"2021-03-30","label":"30 MAR 2021"},{"value":"2021-03-23","label":"23 MAR 2021"},{"value":"2021-03-16","label":"16 MAR 2021"},{"value":"2021-03-09","label":"09 MAR 2021"},{"value":"2021-03-02","label":"02 MAR 2021"},{"value":"2021-02-23","label":"23 FEB 2021"},{"value":"2021-02-16","label":"16 FEB 2021"},{"value":"2021-02-09","label":"09 FEB 2021"},{"value":"2021-02-02","label":"02 FEB 2021"},{"value":"2021-01-26","label":"26 JAN 2021"},{"value":"2021-01-19","label":"19 JAN 2021"},{"value":"2021-01-12","label":"12 JAN 2021"},{"value":"2021-01-05","label":"05 JAN 2021"},{"value":"2020-12-29","label":"29 DEC 2020"},{"value":"2020-12-22","label":"22 DEC 2020"},{"value":"2020-12-15","label":"15 DEC 2020"},{"value":"2020-12-08","label":"08 DEC 2020"},{"value":"2020-12-01","label":"01 DEC 2020"},{"value":"2020-03-31","label":"31 MAR 2020"},{"value":"2020-03-24","label":"24 MAR 2020"},{"value":"2020-03-17","label":"17 MAR 2020"},]
#RANK_DATES = [{"value":"2020-03-10","label":"10 MAR 2020"},{"value":"2020-03-03","label":"03 MAR 2020"},{"value":"2020-02-25","label":"25 FEB 2020"},{"value":"2020-02-18","label":"18 FEB 2020"},{"value":"2020-02-11","label":"11 FEB 2020"},{"value":"2020-02-04","label":"04 FEB 2020"},]
# Ranking dates scraped by the main script, newest first. Only the "value"
# field is used (in the request URL and in the output file names); "label" is
# the human-readable form of the same date.
RANK_DATES = [
    {"value": "2020-01-28", "label": "28 JAN 2020"},
    {"value": "2020-01-21", "label": "21 JAN 2020"},
    {"value": "2020-01-14", "label": "14 JAN 2020"},
    {"value": "2020-01-07", "label": "07 JAN 2020"},
    {"value": "2019-12-31", "label": "31 DEC 2019"},
    {"value": "2019-12-24", "label": "24 DEC 2019"},
    {"value": "2019-12-17", "label": "17 DEC 2019"},
    {"value": "2019-12-10", "label": "10 DEC 2019"},
    {"value": "2019-12-03", "label": "03 DEC 2019"},
    {"value": "2019-11-26", "label": "26 NOV 2019"},
    {"value": "2019-11-19", "label": "19 NOV 2019"},
    {"value": "2019-11-12", "label": "12 NOV 2019"},
    {"value": "2019-11-05", "label": "05 NOV 2019"},
    {"value": "2019-10-29", "label": "29 OCT 2019"},
    {"value": "2019-10-22", "label": "22 OCT 2019"},
    {"value": "2019-10-15", "label": "15 OCT 2019"},
    {"value": "2019-10-08", "label": "08 OCT 2019"},
    {"value": "2019-09-17", "label": "17 SEP 2019"},
    {"value": "2019-09-10", "label": "10 SEP 2019"},
    {"value": "2019-09-03", "label": "03 SEP 2019"},
    {"value": "2019-08-27", "label": "27 AUG 2019"},
    {"value": "2019-08-20", "label": "20 AUG 2019"},
    {"value": "2019-08-13", "label": "13 AUG 2019"},
    {"value": "2019-08-06", "label": "06 AUG 2019"},
    {"value": "2019-07-30", "label": "30 JUL 2019"},
    {"value": "2019-07-23", "label": "23 JUL 2019"},
    {"value": "2019-07-16", "label": "16 JUL 2019"},
    {"value": "2019-07-09", "label": "09 JUL 2019"},
    {"value": "2019-07-02", "label": "02 JUL 2019"},
    {"value": "2019-06-25", "label": "25 JUN 2019"},
    {"value": "2019-06-18", "label": "18 JUN 2019"},
    {"value": "2019-06-11", "label": "11 JUN 2019"},
    {"value": "2019-06-04", "label": "04 JUN 2019"},
    {"value": "2019-05-28", "label": "28 MAY 2019"},
    {"value": "2019-05-21", "label": "21 MAY 2019"},
    {"value": "2019-05-14", "label": "14 MAY 2019"},
    {"value": "2019-05-07", "label": "07 MAY 2019"},
    {"value": "2019-04-30", "label": "30 APR 2019"},
    {"value": "2019-04-23", "label": "23 APR 2019"},
    {"value": "2019-04-16", "label": "16 APR 2019"},
    {"value": "2019-04-09", "label": "09 APR 2019"},
    {"value": "2019-04-02", "label": "02 APR 2019"},
    {"value": "2019-03-26", "label": "26 MAR 2019"},
    {"value": "2019-03-19", "label": "19 MAR 2019"},
    {"value": "2019-03-12", "label": "12 MAR 2019"},
    {"value": "2019-03-05", "label": "05 MAR 2019"},
    {"value": "2019-02-26", "label": "26 FEB 2019"},
    {"value": "2019-02-19", "label": "19 FEB 2019"},
    {"value": "2019-02-12", "label": "12 FEB 2019"},
    {"value": "2019-02-05", "label": "05 FEB 2019"},
    {"value": "2019-01-29", "label": "29 JAN 2019"},
    {"value": "2019-01-22", "label": "22 JAN 2019"},
    {"value": "2019-01-15", "label": "15 JAN 2019"},
    {"value": "2019-01-08", "label": "08 JAN 2019"},
    {"value": "2019-01-01", "label": "01 JAN 2019"},
]
# In-memory cache of parsed race results, keyed by '<competitor_id>__<score>'.
# The main script fills it after each profile download and get_records() reads
# it, so a competitor whose score is unchanged is not downloaded again.
CACHET_BANK = {}
def get_pages(url, rankdate, timeout=30):
    """Return the page identifiers available for one ranking date.

    Downloads page 1 of the ranking and collects the ``data-page`` attribute
    of every pagination link found on it.

    Args:
        url: URL template with two ``{}`` placeholders, filled in order with
            the page number and the ranking date.
        rankdate: Ranking date substituted into the template.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``data-page`` values in document order. The list is not
        de-duplicated. When the page has no pagination links, ``['1']`` is
        returned so the page that was just downloaded still gets scraped.

    Raises:
        requests.RequestException: If the request times out, cannot be sent,
            or the server answers with an HTTP error status.
    """
    response = requests.get(url.format('1', rankdate), timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were a ranking.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a', {'class': re.compile('btn--pagination')})
    # Ignore pagination anchors that carry no page number.
    pages = [link['data-page'] for link in links if link.has_attr('data-page')]
    return pages or ['1']
def extract_row(result):
    """Convert one ranking-table row into a plain dict.

    Args:
        result: Row element exposing ``find('td', {...})``; each cell is
            located through its ``data-th`` attribute.

    Returns:
        A dict with the keys ``rank`` (int), ``competitor``, ``dob``, ``nat``,
        ``score`` and ``events`` (all whitespace-stripped strings).
    """
    def cell_text(label):
        # Stripped text of the cell whose data-th attribute equals `label`.
        return result.find('td', {'data-th': label}).text.strip()

    return {
        'rank': int(cell_text('Rank')),
        'competitor': cell_text('Competitor'),
        'dob': cell_text('DOB'),
        'nat': cell_text('Nat'),
        'score': cell_text('score'),
        'events': cell_text('EventList'),
    }
def get_records(content):
    """Parse one ranking page into rows, cached results and pending lookups.

    Args:
        content: HTML text of a ranking page.

    Returns:
        A 3-tuple ``(rows, cached_results, pending_lookups)``:

        * ``rows`` - one dict per athlete row (see ``extract_row``) with an
          added ``competitor_id`` key.
        * ``cached_results`` - race results already held in ``CACHET_BANK``
          for the athletes on this page.
        * ``pending_lookups`` - ``(cache_key, data_id)`` pairs for athletes
          with no cached results.
    """
    soup = BeautifulSoup(content, 'html.parser')
    athlete_rows = soup.find_all('tr', {'data-athlete-url': re.compile('athletes')})

    rows, cached_results, pending_lookups = [], [], []
    for tr in athlete_rows:
        row = extract_row(tr)
        row['competitor_id'] = tr['data-athlete-url'].replace('/athletes/', '')

        # Results are cached per competitor *and* score.
        cache_key = row['competitor_id'] + '__' + row['score']
        data_id = tr['data-id']

        # The data-id doubles as the "not cached" marker: only a list coming
        # back from the cache counts as a hit.
        cached = CACHET_BANK.get(cache_key, data_id)
        if not isinstance(cached, list):
            pending_lookups.append((cache_key, data_id))
        else:
            cached_results.extend(cached)

        rows.append(row)

    return rows, cached_results, pending_lookups
def extract_race_results(content):
    """Flatten a competitor's score-calculation payload into result dicts.

    Args:
        content: The payload, either as JSON text (``str``/``bytes``) or as an
            already-decoded mapping. It must provide ``athleteUrlSlug`` and a
            ``results`` list whose items have ``date``, ``place``,
            ``competition``, ``mark`` and ``disciplineCode``.

    Returns:
        One dict per entry in ``results`` with the keys ``athtlete_slug``,
        ``race_date``, ``place`` (dots removed), ``competition``, ``mark`` and
        ``disciplineCode``.
    """
    # Decode only when given JSON text, so a payload that has already been
    # parsed (e.g. by ``Response.json()``) is accepted as well.
    if isinstance(content, (str, bytes, bytearray)):
        content = json.loads(content)

    return [
        {
            # NOTE: the misspelled key is kept on purpose - it becomes a
            # column name in the exported CSV files.
            'athtlete_slug': content['athleteUrlSlug'],
            'race_date': result['date'],
            'place': result['place'].replace('.', ''),
            'competition': result['competition'],
            'mark': result['mark'],
            'disciplineCode': result['disciplineCode'],
        }
        for result in content['results']
    ]
def _report_failed_request(request, exception):
    """Print a warning for a request that grequests could not complete."""
    print('Request failed: {} ({})'.format(request.url, exception))


if __name__ == '__main__':
    # Seconds to wait on each HTTP request, so one stalled connection cannot
    # hang the whole run.
    REQUEST_TIMEOUT = 60

    # Rankings to scrape, as (sex code, URL template) pairs. Add
    # ('M', URL_MEN_MARATHON) to scrape the men's rankings as well.
    URLS = [('F', URL_WOMEN_MARATHON)]

    for sex, url in URLS:
        # Race results of every date; only needed by the optional combined
        # export at the end of this loop.
        race_frames = []

        for rankdate in RANK_DATES:
            rank_date = rankdate['value']
            result_rank_frames = []
            result_race_frames = []
            profile_set = []
            group_data_ids = []

            # 1) Download every ranking page of this date concurrently.
            page_requests = [
                grequests.get(url.format(page, rank_date), timeout=REQUEST_TIMEOUT)
                for page in set(get_pages(url, rank_date))
            ]
            for resp in grequests.imap(page_requests, size=10,
                                       exception_handler=_report_failed_request):
                result_page, profile_results, parsed_data_ids = get_records(resp.text)
                profile_set.extend(profile_results)
                group_data_ids.extend(parsed_data_ids)
                if not result_page:
                    # A page without athlete rows would only add an empty
                    # frame that lacks the 'rank' column.
                    continue
                df = pd.DataFrame(result_page)
                df['sex'] = sex
                df['rank_date'] = rank_date
                result_rank_frames.append(df)

            if not result_rank_frames:
                # pd.concat() raises on an empty list; nothing to export.
                print('No ranking rows found for {} ({}); skipping.'.format(rank_date, sex))
                continue

            # Race results that were already cached for athletes on these pages.
            result_race_frames.append(pd.DataFrame(profile_set))

            # 2) Download the race results of every athlete not yet cached.
            profile_requests = [
                grequests.get(URL_DETAILS_COMPETITOR.format(data_id), timeout=REQUEST_TIMEOUT)
                for _, data_id in group_data_ids
            ]
            for index, resp in grequests.imap_enumerated(
                    profile_requests, size=10,
                    exception_handler=_report_failed_request):
                cache_key, data_id = group_data_ids[index]
                # Unlike imap(), imap_enumerated() also yields failed requests
                # (as None); skip those and HTTP errors instead of crashing.
                if resp is None or not resp.ok:
                    print('No race results for competitor id {}; skipping.'.format(data_id))
                    continue
                profile_page = extract_race_results(resp.json())
                CACHET_BANK[cache_key] = profile_page
                result_race_frames.append(pd.DataFrame(profile_page))

            # 3) Write one rankings file and one race-results file per date.
            rankings = pd.concat(result_rank_frames).sort_values(by=['rank'], ascending=True)
            race_results = pd.concat(result_race_frames)
            race_frames.append(race_results)

            rankings.to_csv(
                '{}_{}_WORLDATHLETICS_MARATHON_RANKINGS.csv'.format(rank_date, sex),
                index=False)
            race_results.drop_duplicates().to_csv(
                '{}_{}_WORLDATHLETICS_MARATHON_RESULTS.csv'.format(rank_date, sex),
                index=False)

        # Uncomment to also write one combined race-results file per sex:
        #if race_frames:
        #    pd.concat(race_frames).drop_duplicates().to_csv('GENERAL_%s_WORLDATHLETICS_MARATHON_RESULTS.csv' % sex, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment