Created
September 24, 2014 15:42
-
-
Save nonsleepr/146e247f40ffb0ce3281 to your computer and use it in GitHub Desktop.
Set of functions to download datasets from Kaggle
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Set of functions to download datasets from Kaggle
# Usage:
#
# > train <- kaggle_get_file("train.csv",
# +                          competition.name = "titanic-gettingStarted",
# +                          username = "username",
# +                          password = "password")
#
# > head(train, n = 2)
#
#   PassengerId Survived Pclass                                                Name    Sex Age SibSp Parch
# 1           1        0      3                             Braund, Mr. Owen Harris   male  22     1     0
# 2           2        1      1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
#      Ticket    Fare Cabin Embarked
# 1 A/5 21171  7.2500              S
# 2  PC 17599 71.2833   C85        C
#
# > cat(attr(train,"desc"))
#
# VARIABLE DESCRIPTIONS:
# survival        Survival
#                 (0 = No; 1 = Yes)
# pclass          Passenger Class
#                 (1 = 1st; 2 = 2nd; 3 = 3rd)
# name            Name
# ...
#
# Attach the packages the functions below rely on. library() is used rather
# than require() so that a missing package stops the script right here instead
# of producing a warning and failing later at the first call into the package.
library("rvest")     # html_session, html_form, set_values, submit_form, ...
library("magrittr")  # %>%
library("plyr")      # ldply

# Root of every URL built below (login page, competition data page, and the
# prefix added to the relative download links).
BASE.URL <- "https://www.kaggle.com"
# Open a browsing session on the Kaggle login page and submit the login form.
#
# Args:
#   username: account user name, written to the form's `UserName` field.
#   password: account password, written to the form's `Password` field.
#
# Returns:
#   The session object created by html_session() for the login page.
#
# Stops with an error when either credential is NULL, NA, empty, or not a
# single value, instead of posting a placeholder such as "NA" to the site.
kaggle_login <- function(username, password) {
  credential.missing <- function(x) {
    is.null(x) || length(x) != 1L || is.na(x) || !nzchar(x)
  }
  if (credential.missing(username) || credential.missing(password)) {
    stop("Both username and password must be supplied to log in to Kaggle.",
         call. = FALSE)
  }

  login.url <- paste0(BASE.URL, "/account/login")
  session <- html_session(login.url)

  # The login form is taken to be the second form on the page.
  frm <- html_form(session)[[2]] %>%
    set_values(UserName = username, Password = password)
  # The form's action is prefixed with the site root before submitting.
  frm$url <- paste0(BASE.URL, frm$url)

  # NOTE(review): the value returned by submit_form() is discarded and the
  # session created above is returned, as in the original code. Presumably the
  # login cookies live on the connection handle shared by both objects --
  # TODO confirm against the installed rvest version.
  submit_form(session, frm)
  session
}
# List the downloadable files of a competition.
#
# Args:
#   competition.name: competition identifier used in the URL
#     "<BASE.URL>/c/<competition.name>/data".
#   username, password: credentials handed to kaggle_login(); only used when
#     no session is supplied.
#   session: an existing session to reuse; NA (the default) or NULL means
#     "log in first".
#
# Returns:
#   A data frame with one row per link found inside the page's "#data-files"
#   element (one column per link attribute, `href` prefixed with BASE.URL),
#   carrying two attributes: `desc`, the text of the ".cms-page > pre" nodes,
#   and `session`, the session that was used.
kaggle_data <- function(competition.name, username = NA, password = NA, session = NA) {
  data.url <- paste0(BASE.URL, "/c/", competition.name, "/data")

  # A real session is a multi-element object, so a bare is.na(session) is not
  # a valid scalar `if` condition; only treat the NA/NULL sentinel as "absent".
  session.missing <- is.null(session) ||
    (is.atomic(session) && length(session) == 1L && is.na(session))
  if (session.missing) {
    session <- kaggle_login(username, password)
  }

  df.page <- jump_to(session, data.url)

  data.files <- df.page %>%
    html_nodes("#data-files") %>%
    html_nodes("a") %>%
    html_attrs() %>%
    ldply()

  # Without any links there is no `href` column to rewrite; fail with a
  # message that says so instead of a replacement-length error.
  if (nrow(data.files) == 0L || is.null(data.files[["href"]])) {
    stop("No data files found at ", data.url,
         " (login may have failed or the page layout may have changed).",
         call. = FALSE)
  }
  data.files$href <- paste0(BASE.URL, data.files$href)

  desc <- df.page %>%
    html_nodes(".cms-page > pre") %>%
    html_text()

  structure(data.files,
            desc = desc,
            session = session)
}
# Download one file of a competition and return its parsed content.
#
# Args:
#   file.name: value to look up in the `name` column of the file listing.
#   competition.name, username, password, session: forwarded to kaggle_data()
#     when no listing is supplied.
#   kaggle.data: a listing previously returned by kaggle_data(); NA (the
#     default) or NULL means "fetch the listing first".
#
# Returns:
#   Whatever httr::content() produces for the downloaded response, with two
#   attributes attached: `session` (taken from the listing) and `desc` (the
#   listing's description text).
#
# Stops with an error when `file.name` does not appear in the listing.
kaggle_get_file <- function(file.name = "train.csv",
                            competition.name = NA,
                            username = NA,
                            password = NA,
                            session = NA,
                            kaggle.data = NA) {
  # A supplied listing is a data frame, for which is.na() yields a matrix and
  # cannot drive a scalar `if`; only the NA/NULL sentinel means "absent".
  listing.missing <- is.null(kaggle.data) ||
    (is.atomic(kaggle.data) && length(kaggle.data) == 1L && is.na(kaggle.data))
  if (listing.missing) {
    kaggle.data <- kaggle_data(competition.name, username, password, session)
  }

  # The session stored on the listing is the one used for the download.
  session <- attr(kaggle.data, "session")

  matches <- which(kaggle.data[["name"]] == file.name)
  if (length(matches) == 0L) {
    stop("File '", file.name, "' is not listed among the competition's data files.",
         call. = FALSE)
  }
  # First match wins, as before.
  file.url <- as.character(kaggle.data[["href"]][matches[1L]])

  df <- httr::content(jump_to(session, file.url)$response)
  structure(df,
            session = session,
            desc = attr(kaggle.data, "desc"))
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment