import os
import time

import pandas as pd
from selenium import webdriver

def scroll(driver, timeout):
    """Scroll to the bottom of an infinite-scroll page until its height stops growing.

    `timeout` is the number of 1-second scroll steps per round, not a hard time limit.
    """
    # get initial scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        for i in range(timeout):
            # scroll down to the bottom
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # wait for page to load
            time.sleep(1)
        # get new scroll height and compare to last height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # heights match: no new content loaded, so exit the function
            break
        last_height = new_height

def scrapeleaders(leaderboard_url, leaderboard_name):
    """Scrape one AICrowd leaderboard page into a DataFrame (uses the module-level driver)."""
    driver.get(leaderboard_url)
    scroll(driver, 10)  # increase this when ready to run
    print("... Done scrolling!")
    time.sleep(3)
    if leaderboard_name == 'submission':
        submissions_l = driver.find_elements_by_xpath('//*[@id="submissions-div"]/tr/td[1]')
        individuals_l = driver.find_elements_by_xpath('//*[@id="submissions-div"]/tr/td[2]')
        metrics_l = driver.find_elements_by_xpath('//*[@id="submissions-div"]/tr/td[4]')
        dates_l = driver.find_elements_by_xpath('//*[@id="submissions-div"]/tr/td[7]')
        submissions = [i.text for i in submissions_l]
        individuals = [i.text for i in individuals_l]
        metrics = [i.text for i in metrics_l]
        dates = [i.text for i in dates_l]
        table_name = [leaderboard_name] * len(individuals)
        df = pd.DataFrame(list(zip(table_name, submissions, individuals, metrics, dates)),
                          columns=['Leaderboard', 'Submission', 'Individual', 'Metric', 'Date'])
    else:  # url is for a weekly profit or RMSE leaderboard
        ranking_l = driver.find_elements_by_xpath('//*[@id="leaderboards-div"]/tr/td[2]')
        participants_l = driver.find_elements_by_xpath('//*[@id="leaderboards-div"]/tr/td[3]')
        if leaderboard_name == 'rmse':
            metrics_l = driver.find_elements_by_xpath('//*[@id="leaderboards-div"]/tr/td[4]')
            submissions_l = driver.find_elements_by_xpath('//*[@id="leaderboards-div"]/tr/td[8]/a')
        else:
            metrics_l = driver.find_elements_by_xpath('//*[@id="leaderboards-div"]/tr/td[5]')
            submissions_l = driver.find_elements_by_xpath('//*[@id="leaderboards-div"]/tr/td[9]/a')
        ranking = [i.text for i in ranking_l]
        participants = [i.text for i in participants_l]
        metrics = [i.text for i in metrics_l]
        submissions = [i.get_attribute('href') for i in submissions_l]
        submissions = [sub[-6:] for sub in submissions]  # keep the trailing 6-digit submission ID
        table_name = [leaderboard_name] * len(participants)
        df = pd.DataFrame(list(zip(table_name, ranking, participants, metrics, submissions)),
                          columns=['Leaderboard', 'Rank', 'Participant', 'Metric', 'Submission'])
    return df
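
# NOTE: the find_elements_by_xpath helpers above were deprecated in Selenium 4
# and later removed; on current Selenium the equivalent call is, for example:
#   from selenium.webdriver.common.by import By
#   driver.find_elements(By.XPATH, '//*[@id="submissions-div"]/tr/td[1]')
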
# get the path of the ChromeDriver executable (assumed to sit next to this script)
dir = os.path.dirname(__file__)
chrome_driver_path = os.path.join(dir, "chromedriver.exe")

# create a new Chrome session
driver = webdriver.Chrome(executable_path=chrome_driver_path)
driver.implicitly_wait(30)
# driver.maximize_window()
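# NOTE: executable_path is deprecated in Selenium 4; the newer pattern is:
#   from selenium.webdriver.chrome.service import Service
#   driver = webdriver.Chrome(service=Service(chrome_driver_path))
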
df_submissions = scrapeleaders("https://www.aicrowd.com/challenges/insurance-pricing-game/submissions", "submission")
# df_rmse = scrapeleaders("https://www.aicrowd.com/challenges/insurance-pricing-game/leaderboards?challenge_leaderboard_extra_id=467&challenge_round_id=625", "rmse")
df_week01 = scrapeleaders("https://www.aicrowd.com/challenges/insurance-pricing-game/leaderboards?challenge_leaderboard_extra_id=489&challenge_round_id=625", "week01")
df_week02 = scrapeleaders("https://www.aicrowd.com/challenges/insurance-pricing-game/leaderboards?challenge_leaderboard_extra_id=504&challenge_round_id=625", "week02")
df_week03 = scrapeleaders("https://www.aicrowd.com/challenges/insurance-pricing-game/leaderboards?challenge_leaderboard_extra_id=521&challenge_round_id=625", "week03")
df_week04 = scrapeleaders("https://www.aicrowd.com/challenges/insurance-pricing-game/leaderboards?challenge_leaderboard_extra_id=430&challenge_round_id=625", "week04")
df_week05 = scrapeleaders("https://www.aicrowd.com/challenges/insurance-pricing-game/leaderboards?challenge_leaderboard_extra_id=522&challenge_round_id=625", "week05")
df_week06 = scrapeleaders("https://www.aicrowd.com/challenges/insurance-pricing-game/leaderboards?challenge_leaderboard_extra_id=524&challenge_round_id=625", "week06")
df_week07 = scrapeleaders("https://www.aicrowd.com/challenges/insurance-pricing-game/leaderboards?challenge_round_id=625", "week07")
print(len(df_submissions),
      # len(df_rmse),
      len(df_week01), len(df_week02), len(df_week03),
      len(df_week04), len(df_week05), len(df_week06),
      len(df_week07))
df_weeks = df_week01.append([df_week02, df_week03, df_week04, df_week05, df_week06, df_week07])
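# NOTE: DataFrame.append (above) was removed in pandas 2.0; on newer pandas the
# equivalent is:
#   df_weeks = pd.concat([df_week01, df_week02, df_week03, df_week04,
#                         df_week05, df_week06, df_week07])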
df_leaders = pd.merge(df_weeks,
                      df_submissions.drop(columns=['Leaderboard']),
                      on='Submission', how='left')
print(len(df_leaders))
df_leaders = df_leaders.rename(columns={"Metric_x": "Profit",
                                        "Metric_y": "RMSE",
                                        "Rank": "Profit_Rank"})
# clean up dates
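# assumption: each scraped Date cell holds two lines, a "Weekday, <date>" line
# and a submission time, which the splits below take apart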
df_leaders[['Wkday_DMY', 'Submit_Time']] = df_leaders.Date.str.split("\n", expand=True)
df_leaders[['Submit_Weekday', "Submit_Date"]] = df_leaders.Wkday_DMY.str.split(",", expand=True)
df_leaders = df_leaders.drop(columns=['Date', 'Wkday_DMY'])
df_leaders['Submit_Date'] = pd.to_datetime(df_leaders['Submit_Date']).dt.date
# create RMSE rank column
df_leaders["RMSE"] = pd.to_numeric(df_leaders["RMSE"])
df_leaders["RMSE_Rank"] = df_leaders.groupby("Leaderboard")["RMSE"].rank("average", ascending=True)
# rearrange columns
df_leaders = df_leaders[['Leaderboard', 'Participant',
                         'Individual', 'Submission',
                         'RMSE', 'RMSE_Rank', 'Profit', 'Profit_Rank',
                         'Submit_Date', 'Submit_Weekday', 'Submit_Time']]
print(df_leaders.head())
df_leaders.to_csv('leaderboard_20210207.csv', index=False)
# close the browser window
driver.quit()