Last active
July 25, 2023 10:00
-
-
Save marcelcaraciolo/c932a20ecc409cb5b0f7967a60cbe4cf to your computer and use it in GitHub Desktop.
Scraper for the World Athletics Marathon World Rankings for Men and Women (2019-2023)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import re | |
import grequests | |
import pandas as pd | |
import requests | |
from bs4 import BeautifulSoup | |
from bs4.diagnose import profile | |
# URL templates. The two ranking templates take the page number and then the
# rank date; the competitor template takes a single identifier.
URL_MEN_MARATHON = (
    "https://worldathletics.org/world-rankings/marathon/men"
    "?regionType=world&page={}&rankDate={}&limitByCountry=0"
)
URL_WOMEN_MARATHON = (
    "https://worldathletics.org/world-rankings/marathon/women"
    "?regionType=world&page={}&rankDate={}&limitByCountry=0"
)
URL_DETAILS_COMPETITOR = (
    "https://worldathletics.org/WorldRanking/RankingScoreCalculation"
    "?competitorId={}"
)
# Uncomment one of the RANK_DATES lists below to select the ranking dates to extract.
#RANK_DATES = [{"value":"2023-06-27","label":"27 JUN 2023"}, {"value":"2023-06-20","label":"20 JUN 2023"},] | |
#RANK_DATES = [{"value":"2023-06-13","label":"13 JUN 2023"},{"value":"2023-06-06","label":"06 JUN 2023"},{"value":"2023-05-30","label":"30 MAY 2023"},{"value":"2023-05-23","label":"23 MAY 2023"},{"value":"2023-05-16","label":"16 MAY 2023"}] | |
#RANK_DATES = [{"value":"2023-05-09","label":"09 MAY 2023"},{"value":"2023-05-02","label":"02 MAY 2023"},{"value":"2023-04-25","label":"25 APR 2023"},{"value":"2023-04-18","label":"18 APR 2023"},{"value":"2023-04-11","label":"11 APR 2023"},{"value":"2023-04-04","label":"04 APR 2023"},{"value":"2023-03-28","label":"28 MAR 2023"},{"value":"2023-03-21","label":"21 MAR 2023"},{"value":"2023-03-14","label":"14 MAR 2023"},{"value":"2023-03-07","label":"07 MAR 2023"},{"value":"2023-02-28","label":"28 FEB 2023"},{"value":"2023-02-19","label":"19 FEB 2023"},{"value":"2023-02-14","label":"14 FEB 2023"},{"value":"2023-02-07","label":"07 FEB 2023"},{"value":"2023-01-31","label":"31 JAN 2023"},{"value":"2023-01-24","label":"24 JAN 2023"},{"value":"2023-01-17","label":"17 JAN 2023"},{"value":"2023-01-10","label":"10 JAN 2023"},{"value":"2023-01-03","label":"03 JAN 2023"},{"value":"2022-12-27","label":"27 DEC 2022"},{"value":"2022-12-20","label":"20 DEC 2022"},{"value":"2022-12-13","label":"13 DEC 2022"},{"value":"2022-12-06","label":"06 DEC 2022"},{"value":"2022-11-29","label":"29 NOV 2022"},{"value":"2022-11-22","label":"22 NOV 2022"},{"value":"2022-11-15","label":"15 NOV 2022"},{"value":"2022-11-08","label":"08 NOV 2022"},{"value":"2022-11-01","label":"01 NOV 2022"},{"value":"2022-10-25","label":"25 OCT 2022"},{"value":"2022-10-18","label":"18 OCT 2022"},{"value":"2022-10-11","label":"11 OCT 2022"},{"value":"2022-10-04","label":"04 OCT 2022"},{"value":"2022-09-27","label":"27 SEP 2022"},{"value":"2022-09-20","label":"20 SEP 2022"},{"value":"2022-09-13","label":"13 SEP 2022"},{"value":"2022-09-06","label":"06 SEP 2022"},{"value":"2022-08-30","label":"30 AUG 2022"},{"value":"2022-08-23","label":"23 AUG 2022"},{"value":"2022-08-16","label":"16 AUG 2022"},{"value":"2022-08-09","label":"09 AUG 2022"},{"value":"2022-08-02","label":"02 AUG 2022"},{"value":"2022-07-26","label":"26 JUL 2022"},{"value":"2022-07-11","label":"11 JUL 2022"},{"value":"2022-07-05","label":"05 JUL 
2022"},{"value":"2022-06-26","label":"26 JUN 2022"},{"value":"2022-06-21","label":"21 JUN 2022"},{"value":"2022-06-14","label":"14 JUN 2022"},{"value":"2022-06-07","label":"07 JUN 2022"},{"value":"2022-05-29","label":"29 MAY 2022"},{"value":"2022-05-24","label":"24 MAY 2022"},{"value":"2022-05-17","label":"17 MAY 2022"},{"value":"2022-05-10","label":"10 MAY 2022"},{"value":"2022-05-03","label":"03 MAY 2022"},{"value":"2022-04-26","label":"26 APR 2022"},{"value":"2022-04-19","label":"19 APR 2022"},{"value":"2022-04-12","label":"12 APR 2022"},{"value":"2022-04-05","label":"05 APR 2022"},{"value":"2022-03-29","label":"29 MAR 2022"},{"value":"2022-03-22","label":"22 MAR 2022"},{"value":"2022-03-15","label":"15 MAR 2022"},{"value":"2022-03-07","label":"07 MAR 2022"},{"value":"2022-03-01","label":"01 MAR 2022"},{"value":"2022-02-22","label":"22 FEB 2022"},{"value":"2022-02-15","label":"15 FEB 2022"},{"value":"2022-02-08","label":"08 FEB 2022"},{"value":"2022-02-01","label":"01 FEB 2022"},{"value":"2022-01-25","label":"25 JAN 2022"},{"value":"2022-01-18","label":"18 JAN 2022"},{"value":"2022-01-11","label":"11 JAN 2022"},{"value":"2022-01-04","label":"04 JAN 2022"},{"value":"2021-12-28","label":"28 DEC 2021"},{"value":"2021-12-21","label":"21 DEC 2021"},{"value":"2021-12-14","label":"14 DEC 2021"},{"value":"2021-12-07","label":"07 DEC 2021"},{"value":"2021-11-30","label":"30 NOV 2021"},{"value":"2021-11-23","label":"23 NOV 2021"},{"value":"2021-11-16","label":"16 NOV 2021"},{"value":"2021-11-09","label":"09 NOV 2021"},{"value":"2021-11-02","label":"02 NOV 2021"},{"value":"2021-10-26","label":"26 OCT 2021"},{"value":"2021-10-19","label":"19 OCT 2021"},{"value":"2021-10-12","label":"12 OCT 2021"},{"value":"2021-10-05","label":"05 OCT 2021"},{"value":"2021-09-28","label":"28 SEP 2021"},{"value":"2021-09-21","label":"21 SEP 2021"},{"value":"2021-09-14","label":"14 SEP 2021"},{"value":"2021-09-07","label":"07 SEP 2021"},{"value":"2021-08-31","label":"31 AUG 
2021"},{"value":"2021-08-24","label":"24 AUG 2021"},{"value":"2021-08-17","label":"17 AUG 2021"},{"value":"2021-08-10","label":"10 AUG 2021"},{"value":"2021-07-20","label":"20 JUL 2021"},{"value":"2021-07-13","label":"13 JUL 2021"},{"value":"2021-07-06","label":"06 JUL 2021"},{"value":"2021-06-29","label":"29 JUN 2021"},{"value":"2021-06-22","label":"22 JUN 2021"},{"value":"2021-06-15","label":"15 JUN 2021"},{"value":"2021-06-08","label":"08 JUN 2021"},{"value":"2021-05-31","label":"31 MAY 2021"},{"value":"2021-05-25","label":"25 MAY 2021"},{"value":"2021-05-18","label":"18 MAY 2021"},{"value":"2021-05-11","label":"11 MAY 2021"},{"value":"2021-05-04","label":"04 MAY 2021"},{"value":"2021-04-27","label":"27 APR 2021"},{"value":"2021-04-20","label":"20 APR 2021"},{"value":"2021-04-13","label":"13 APR 2021"},{"value":"2021-04-06","label":"06 APR 2021"},{"value":"2021-03-30","label":"30 MAR 2021"},{"value":"2021-03-23","label":"23 MAR 2021"},{"value":"2021-03-16","label":"16 MAR 2021"},{"value":"2021-03-09","label":"09 MAR 2021"},{"value":"2021-03-02","label":"02 MAR 2021"},{"value":"2021-02-23","label":"23 FEB 2021"},{"value":"2021-02-16","label":"16 FEB 2021"},{"value":"2021-02-09","label":"09 FEB 2021"},{"value":"2021-02-02","label":"02 FEB 2021"},{"value":"2021-01-26","label":"26 JAN 2021"},{"value":"2021-01-19","label":"19 JAN 2021"},{"value":"2021-01-12","label":"12 JAN 2021"},{"value":"2021-01-05","label":"05 JAN 2021"},{"value":"2020-12-29","label":"29 DEC 2020"},{"value":"2020-12-22","label":"22 DEC 2020"},{"value":"2020-12-15","label":"15 DEC 2020"},{"value":"2020-12-08","label":"08 DEC 2020"},{"value":"2020-12-01","label":"01 DEC 2020"},{"value":"2020-03-31","label":"31 MAR 2020"},{"value":"2020-03-24","label":"24 MAR 2020"},{"value":"2020-03-17","label":"17 MAR 2020"},] | |
#RANK_DATES = [{"value":"2020-03-10","label":"10 MAR 2020"},{"value":"2020-03-03","label":"03 MAR 2020"},{"value":"2020-02-25","label":"25 FEB 2020"},{"value":"2020-02-18","label":"18 FEB 2020"},{"value":"2020-02-11","label":"11 FEB 2020"},{"value":"2020-02-04","label":"04 FEB 2020"},] | |
# Ranking dates to scrape, newest first. Each entry keeps the two fields used
# by the site: "value" (ISO date placed in the URL) and "label" (display text).
RANK_DATES = [
    {"value": iso_date, "label": display_label}
    for iso_date, display_label in (
        ("2020-01-28", "28 JAN 2020"),
        ("2020-01-21", "21 JAN 2020"),
        ("2020-01-14", "14 JAN 2020"),
        ("2020-01-07", "07 JAN 2020"),
        ("2019-12-31", "31 DEC 2019"),
        ("2019-12-24", "24 DEC 2019"),
        ("2019-12-17", "17 DEC 2019"),
        ("2019-12-10", "10 DEC 2019"),
        ("2019-12-03", "03 DEC 2019"),
        ("2019-11-26", "26 NOV 2019"),
        ("2019-11-19", "19 NOV 2019"),
        ("2019-11-12", "12 NOV 2019"),
        ("2019-11-05", "05 NOV 2019"),
        ("2019-10-29", "29 OCT 2019"),
        ("2019-10-22", "22 OCT 2019"),
        ("2019-10-15", "15 OCT 2019"),
        ("2019-10-08", "08 OCT 2019"),
        ("2019-09-17", "17 SEP 2019"),
        ("2019-09-10", "10 SEP 2019"),
        ("2019-09-03", "03 SEP 2019"),
        ("2019-08-27", "27 AUG 2019"),
        ("2019-08-20", "20 AUG 2019"),
        ("2019-08-13", "13 AUG 2019"),
        ("2019-08-06", "06 AUG 2019"),
        ("2019-07-30", "30 JUL 2019"),
        ("2019-07-23", "23 JUL 2019"),
        ("2019-07-16", "16 JUL 2019"),
        ("2019-07-09", "09 JUL 2019"),
        ("2019-07-02", "02 JUL 2019"),
        ("2019-06-25", "25 JUN 2019"),
        ("2019-06-18", "18 JUN 2019"),
        ("2019-06-11", "11 JUN 2019"),
        ("2019-06-04", "04 JUN 2019"),
        ("2019-05-28", "28 MAY 2019"),
        ("2019-05-21", "21 MAY 2019"),
        ("2019-05-14", "14 MAY 2019"),
        ("2019-05-07", "07 MAY 2019"),
        ("2019-04-30", "30 APR 2019"),
        ("2019-04-23", "23 APR 2019"),
        ("2019-04-16", "16 APR 2019"),
        ("2019-04-09", "09 APR 2019"),
        ("2019-04-02", "02 APR 2019"),
        ("2019-03-26", "26 MAR 2019"),
        ("2019-03-19", "19 MAR 2019"),
        ("2019-03-12", "12 MAR 2019"),
        ("2019-03-05", "05 MAR 2019"),
        ("2019-02-26", "26 FEB 2019"),
        ("2019-02-19", "19 FEB 2019"),
        ("2019-02-12", "12 FEB 2019"),
        ("2019-02-05", "05 FEB 2019"),
        ("2019-01-29", "29 JAN 2019"),
        ("2019-01-22", "22 JAN 2019"),
        ("2019-01-15", "15 JAN 2019"),
        ("2019-01-08", "08 JAN 2019"),
        ("2019-01-01", "01 JAN 2019"),
    )
]
# In-memory cache of race results fetched so far. Keys have the form
# '<competitor_id>__<score>' and values are lists of race-result dicts.
CACHET_BANK = dict()
def get_pages(url, rankdate):
    """Return the page identifiers available for one ranking date.

    Downloads page 1 of the ranking and reads the ``data-page`` attribute of
    every anchor whose class matches ``btn--pagination``.

    Args:
        url: Ranking URL template with two ``{}`` placeholders, filled with
            the page number and then the rank date.
        rankdate: Rank date string inserted into the URL.

    Returns:
        A list of page identifiers (strings) without duplicates. ``'1'`` is
        always present and always first, so a ranking that shows no
        pagination links still yields its single page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url.format('1', rankdate), timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    pagination_links = soup.find_all('a', {'class': re.compile('btn--pagination')})

    # Anchors without a data-page attribute are skipped instead of raising.
    linked_pages = [link.get('data-page') for link in pagination_links]

    # Page 1 was just fetched successfully, so it is always a valid page;
    # dict.fromkeys removes duplicates while preserving order.
    return list(dict.fromkeys(['1'] + [page for page in linked_pages if page]))
def extract_row(result):
    """Convert one ranking table row into a plain dict.

    Args:
        result: Row element exposing ``find(name, attrs)``; each looked-up
            ``td`` cell is identified by its ``data-th`` attribute.

    Returns:
        Dict with keys ``rank`` (int), ``competitor``, ``dob``, ``nat``,
        ``score`` and ``events`` (whitespace-stripped strings).
    """
    def cell_text(header):
        # Stripped text of the cell whose data-th attribute equals `header`.
        return result.find('td', {'data-th': header}).text.strip()

    return {
        'rank': int(cell_text('Rank')),
        'competitor': cell_text('Competitor'),
        'dob': cell_text('DOB'),
        'nat': cell_text('Nat'),
        'score': cell_text('score'),
        'events': cell_text('EventList'),
    }
def get_records(content):
    """Parse one ranking page into rows, cached results and pending lookups.

    Args:
        content: HTML text of a ranking page.

    Returns:
        A 3-tuple ``(rows, cached_results, pending_ids)``:
        ``rows`` holds one dict per athlete row (``extract_row`` output plus
        ``competitor_id``); ``cached_results`` holds race results already
        stored in ``CACHET_BANK`` for those athletes; ``pending_ids`` holds
        ``(cache_key, data_id)`` pairs for athletes with no cached results.
    """
    soup = BeautifulSoup(content, 'html.parser')
    athlete_rows = soup.find_all('tr', {'data-athlete-url': re.compile('athletes')})

    rows = []
    cached_results = []
    pending_ids = []
    for tr in athlete_rows:
        row = extract_row(tr)
        row['competitor_id'] = tr['data-athlete-url'].replace('/athletes/', '')

        # Results are cached per competitor *and* score.
        cache_key = row['competitor_id'] + '__' + row['score']
        data_id = tr['data-id']

        # The data id doubles as the "not cached" default: cached entries are
        # lists, so anything that is not a list means a lookup is still needed.
        cached = CACHET_BANK.get(cache_key, data_id)
        if not isinstance(cached, list):
            pending_ids.append((cache_key, data_id))
        else:
            cached_results.extend(cached)

        rows.append(row)

    return rows, cached_results, pending_ids
def extract_race_results(content):
    """Flatten a competitor score-calculation payload into race-result dicts.

    Args:
        content: Either the JSON document as text (``str``/``bytes``) or the
            already-decoded mapping. It must provide ``athleteUrlSlug`` and a
            ``results`` list whose items carry ``date``, ``place``,
            ``competition``, ``mark`` and ``disciplineCode``.

    Returns:
        A list with one dict per race, keyed ``athtlete_slug``, ``race_date``,
        ``place``, ``competition``, ``mark`` and ``disciplineCode``.

    Raises:
        json.JSONDecodeError: If text input is not valid JSON.
        KeyError: If an expected field is missing from the payload.
    """
    # Decode only textual input, so a payload that has already been decoded
    # (for example by ``response.json()``) is accepted as well.
    if isinstance(content, (str, bytes, bytearray)):
        content = json.loads(content)

    athlete_profile = []
    for result in content['results']:
        place = result['place']
        # Drop the dots from string places ("1." -> "1"); values that are not
        # strings (e.g. None) are passed through unchanged instead of crashing.
        if isinstance(place, str):
            place = place.replace('.', '')
        athlete_profile.append({
            # NOTE: the misspelled key is kept on purpose; it is the column
            # name of the exported data.
            'athtlete_slug': content['athleteUrlSlug'],
            'race_date': result['date'],
            'place': place,
            'competition': result['competition'],
            'mark': result['mark'],
            'disciplineCode': result['disciplineCode'],
        })
    return athlete_profile
if __name__ == '__main__':
    # For every (sex, URL template) pair and every rank date, this script:
    #   1. downloads all ranking pages concurrently and writes the rankings
    #      sorted by rank to '<date>_<sex>_WORLDATHLETICS_MARATHON_RANKINGS.csv';
    #   2. downloads the race results of every athlete not yet cached and
    #      writes the de-duplicated results to
    #      '<date>_<sex>_WORLDATHLETICS_MARATHON_RESULTS.csv'.
    #
    # Rankings to scrape; use [('M', URL_MEN_MARATHON)] for men only, or
    # [('F', URL_WOMEN_MARATHON), ('M', URL_MEN_MARATHON)] for both.
    URLS = [('F', URL_WOMEN_MARATHON)]

    for sex, url in URLS:
        for rankdate in RANK_DATES:
            rank_date = rankdate['value']
            result_rank_frames = []
            result_race_frames = []
            profile_set = []
            group_data_ids = []

            # Fetch all ranking pages of this date, ten requests at a time.
            pages = get_pages(url, rank_date)
            reqs = [grequests.get(url.format(page, rank_date)) for page in set(pages)]
            for resp in grequests.imap(reqs, size=10):
                result_page, profile_results, parsed_data_ids = get_records(resp.text)
                if not result_page:
                    # A page without athlete rows contributes nothing.
                    continue
                df = pd.DataFrame(result_page)
                df['sex'] = sex
                df['rank_date'] = rank_date
                result_rank_frames.append(df)
                profile_set.extend(profile_results)
                group_data_ids.extend(parsed_data_ids)

            if not result_rank_frames:
                # pd.concat raises ValueError on an empty list, so skip this
                # date instead of aborting the whole run.
                print('No ranking rows found for {} ({}); skipping.'.format(rank_date, sex))
                continue

            # Race results that were already cached come first.
            result_race_frames.append(pd.DataFrame(profile_set))

            # Fetch the race results of athletes that are not cached yet.
            reqs_prof = [
                grequests.get(URL_DETAILS_COMPETITOR.format(data_id))
                for _, data_id in group_data_ids
            ]
            for index, resp in grequests.imap_enumerated(reqs_prof, size=10):
                if resp is None:
                    # imap_enumerated yields (index, None) for failed requests.
                    # Nothing is cached, so the athlete is requested again the
                    # next time the same competitor/score key shows up.
                    continue
                profile_page = extract_race_results(resp.json())
                CACHET_BANK[group_data_ids[index][0]] = profile_page
                result_race_frames.append(pd.DataFrame(profile_page))

            rankings = pd.concat(result_rank_frames).sort_values(by=['rank'], ascending=True)
            rankings.to_csv(
                '{}_{}_WORLDATHLETICS_MARATHON_RANKINGS.csv'.format(rank_date, sex),
                index=False,
            )

            unique_results = pd.concat(result_race_frames).drop_duplicates()
            unique_results.to_csv(
                '{}_{}_WORLDATHLETICS_MARATHON_RESULTS.csv'.format(rank_date, sex),
                index=False,
            )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment