Created
March 7, 2018 20:14
-
-
Save deargle/12654ac7734c418b75a1963e09dd11ea to your computer and use it in GitHub Desktop.
A script to pull kaggle scores for the titanic competition and select only the top three
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
This script downloads the public leaderboard data for a competition,
filters down to just the highest-scoring submissions for each
team with "Eargle" in the name, and saves the top-three scores
to a .csv in the current directory called `top-three <date-timestamp>.csv`.
If you want to see the highest submission for _each_ team, disable the step
that keeps only the first three rows (the `.head(...)` call).
To run this, get python if you don't already have it, and also `pip` if you don't already have it,
and install the needed packages if the script complains when you try to run it, e.g., `pip install pandas`.
'''
import datetime
import zipfile

import pandas as pd
import requests
# --- configuration ----------------------------------------------------------
# NOTE: avoid committing real credentials; fill these in locally only.
username = ''  # set your kaggle username here
password = ''  # set your kaggle password here
competition_id = '3136'  # the code for titanic
read_me = 'titanic-publicleaderboard.csv'  # csv expected inside the downloaded zip
is_higher_score_better = True  # For Titanic, higher scores are better
path_to_zip_file = 'the-zip.zip'  # where the downloaded archive is saved
top_n = 3  # how many teams to keep; set to None to keep every team


def download_leaderboard_zip(username, password, competition_id, destination):
    """Log in and save the competition's public-leaderboard zip to *destination*.

    The login URL is requested once with GET so the session receives the
    `__RequestVerificationToken` cookie, then the credentials and that token
    are POSTed to the same URL. The body of the POST response is written to
    *destination* as raw bytes.

    Raises:
        requests.HTTPError: if the POST response has an error status code, so
            an error page is not silently saved as if it were the archive.
    """
    get_me = ('https://www.kaggle.com/account/login?ReturnUrl=%2fc%2f'
              + competition_id + '%2fpublicleaderboarddata.zip')
    with requests.Session() as session:
        session.get(get_me)  # call it once to get the request verification cookie
        payload = {
            'username': username,
            'password': password,
            '__RequestVerificationToken': session.cookies.get('__RequestVerificationToken'),
        }
        r = session.post(get_me, data=payload)
        r.raise_for_status()
        with open(destination, 'wb') as f:
            f.write(r.content)


def extract_leaderboard(zip_path, target_dir='.'):
    """Extract every member of *zip_path* into *target_dir*.

    The archive is opened in a `with` block so the handle is closed even if
    extraction fails.
    """
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(target_dir)


def select_top_teams(df, higher_is_better=True, top_n=3):
    """Return the best submission of each class team, best teams first.

    Args:
        df: leaderboard DataFrame with at least the columns `TeamName`,
            `SubmissionDate` and `Score`.
        higher_is_better: when True a team's best score is its maximum and
            results are sorted by descending score; when False the minimum is
            used and results are sorted by ascending score.
        top_n: number of rows to keep after sorting; None keeps all rows.

    Returns:
        A DataFrame with the columns `TeamName`, `SubmissionDate`, `Score`.
        Rows whose team name is missing are ignored. If a team reached its
        best score more than once, each of those rows is kept.
    """
    columns = ['TeamName', 'SubmissionDate', 'Score']

    # select only those from my class (na=False: a missing name is "no match")
    in_class = df['TeamName'].str.contains('Eargle', case=False, na=False)
    candidates = df[in_class]
    # remove my own submissions
    candidates = candidates[~candidates['TeamName'].str.contains('999', na=False)]
    if candidates.empty:
        return candidates[columns]

    # Compare every row with its own team's best score. `transform` keeps the
    # original row alignment, so no grouping column has to survive an `apply`.
    best_per_team = candidates.groupby('TeamName')['Score'].transform(
        'max' if higher_is_better else 'min')
    best_rows = candidates[candidates['Score'] == best_per_team]

    # Best score first, earlier submission wins a tie, team name makes the
    # order deterministic when score and date are both equal.
    ordered = best_rows.sort_values(
        by=['Score', 'SubmissionDate', 'TeamName'],
        ascending=[not higher_is_better, True, True],
    )
    if top_n is not None:
        ordered = ordered.head(top_n)
    return ordered[columns]


def main():
    """Download the leaderboard, pick the top teams and write them to a csv."""
    # download the zip
    download_leaderboard_zip(username, password, competition_id, path_to_zip_file)

    # extract zip file (into the current directory)
    extract_leaderboard(path_to_zip_file, '.')

    # filter data
    df = pd.read_csv(read_me)
    for_csv = select_top_teams(
        df, higher_is_better=is_higher_score_better, top_n=top_n)

    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')
    for_csv.to_csv('top-three %s.csv' % timestamp, index=False)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment