Skip to content

Instantly share code, notes, and snippets.

@nonsleepr
Created September 24, 2014 15:42
Show Gist options
  • Save nonsleepr/146e247f40ffb0ce3281 to your computer and use it in GitHub Desktop.
Save nonsleepr/146e247f40ffb0ce3281 to your computer and use it in GitHub Desktop.
Set of functions to download datasets from Kaggle
### Set of functions to download datasets from Kaggle
# Usage:
#
# > train <- kaggle_get_file("train.csv",
# + competition.name = "titanic-gettingStarted",
# + username = "username",
# + password = "password")
#
# > head(train, n = 2)
#
# PassengerId Survived Pclass Name Sex Age SibSp Parch
# 1 1 0 3 Braund, Mr. Owen Harris male 22 1 0
# 2 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
# Ticket Fare Cabin Embarked
# 1 A/5 21171 7.2500 S
# 2 PC 17599 71.2833 C85 C
#
# > cat(attr(train,"desc"))
#
# VARIABLE DESCRIPTIONS:
# survival Survival
# (0 = No; 1 = Yes)
# pclass Passenger Class
# (1 = 1st; 2 = 2nd; 3 = 3rd)
# name Name
# ...
#
require("rvest")
require("magrittr")
require("plyr")
# Root URL of the Kaggle site. The functions in this file prepend it to the
# login path, the competition data path and the scraped file links.
BASE.URL <- "https://www.kaggle.com"
# Log in to Kaggle and return an rvest session for subsequent requests.
#
# Args:
#   username: Kaggle user name; written to the login form's `UserName` field.
#   password: Kaggle password; written to the login form's `Password` field.
#
# Returns:
#   The session returned by `submit_form()`, i.e. the session state after the
#   login request has been sent.
#
# Raises:
#   An error if `username` or `password` is missing (NULL, NA or not length 1).
kaggle_login <- function(username, password) {
  # Callers in this file default both credentials to NA; fail early with a
  # clear message instead of attempting a login without credentials.
  if (length(username) != 1L || is.na(username) ||
      length(password) != 1L || is.na(password)) {
    stop("Both `username` and `password` must be supplied to log in.",
         call. = FALSE)
  }

  login.url <- paste0(BASE.URL, "/account/login")
  session <- html_session(login.url)

  # The code relies on the login form being the second form on the page.
  frm <- html_form(session)[[2]] %>%
    set_values(UserName = username, Password = password)

  # Prefix the site root only when the form action is not already an absolute
  # URL; blindly pasting would otherwise produce a malformed address.
  if (!isTRUE(grepl("^https?://", frm$url))) {
    frm$url <- paste0(BASE.URL, frm$url)
  }

  # Return the session produced by the submission rather than the pre-login
  # one, so the caller gets the state that reflects the login request.
  submit_form(session, frm)
}
# List the data files published on a competition's data page.
#
# Args:
#   competition.name: Competition slug used to build
#     "<BASE.URL>/c/<competition.name>/data".
#   username, password: Credentials passed to `kaggle_login()` when no
#     session is supplied.
#   session: An existing session to reuse. When left as NA (or NULL) a new
#     session is created with `kaggle_login(username, password)`.
#
# Returns:
#   A data frame with one row per link found inside `#data-files` (columns
#   are the links' HTML attributes, with `href` prefixed by BASE.URL). The
#   text of `.cms-page > pre` is attached as attribute "desc" and the session
#   used as attribute "session".
#
# Raises:
#   An error if no file links are found on the data page.
kaggle_data <- function(competition.name,
                        username = NA,
                        password = NA,
                        session = NA) {
  data.url <- paste0(BASE.URL, "/c/", competition.name, "/data")

  # `is.na()` on a session object returns one value per element, which is not
  # a valid `if` condition (an error since R 4.2). Treat only NULL or a single
  # atomic NA as "no session supplied".
  session.missing <- is.null(session) ||
    (is.atomic(session) && length(session) == 1L && is.na(session))
  if (session.missing) {
    session <- kaggle_login(username, password)
  }

  df.page <- jump_to(session, data.url)

  # One row per <a> inside #data-files; columns are the anchors' attributes.
  data.files <- df.page %>%
    html_nodes("#data-files") %>%
    html_nodes("a") %>%
    html_attrs() %>%
    ldply()

  # Without this guard an empty listing fails below with an obscure
  # "replacement has 1 row, data has 0" error.
  if (nrow(data.files) == 0L || is.null(data.files[["href"]])) {
    stop("No data file links found at ", data.url,
         " (login may have failed or the page layout may have changed).",
         call. = FALSE)
  }

  # The scraped hrefs are prefixed with the site root to form download URLs.
  data.files$href <- paste0(BASE.URL, data.files$href)

  desc <- df.page %>%
    html_nodes(".cms-page > pre") %>%
    html_text()

  structure(data.files, desc = desc, session = session)
}
# Download one data file of a competition and return its parsed content.
#
# Args:
#   file.name: Value matched against the `name` column of the file listing.
#   competition.name, username, password, session: Passed to `kaggle_data()`
#     when no listing is supplied.
#   kaggle.data: A listing previously returned by `kaggle_data()`. When left
#     as NA (or NULL) the listing is fetched first.
#
# Returns:
#   The result of `httr::content()` on the download response, with the
#   session attached as attribute "session" and the listing's description as
#   attribute "desc".
#
# Raises:
#   An error if `file.name` does not appear in the listing.
kaggle_get_file <- function(file.name = "train.csv",
                            competition.name = NA,
                            username = NA,
                            password = NA,
                            session = NA,
                            kaggle.data = NA) {
  # `is.na()` on a data frame yields a logical matrix, which is not a valid
  # `if` condition (an error since R 4.2). Treat only NULL or a single atomic
  # NA as "no listing supplied".
  listing.missing <- is.null(kaggle.data) ||
    (is.atomic(kaggle.data) && length(kaggle.data) == 1L &&
       is.na(kaggle.data))
  if (listing.missing) {
    kaggle.data <- kaggle_data(competition.name, username, password, session)
  }

  # The session stored on the listing is the one used for the download.
  session <- attr(kaggle.data, "session")

  # Explicit column access instead of subset(): no non-standard evaluation,
  # and which() ignores NA names just like subset() does.
  matches <- which(kaggle.data[["name"]] == file.name)
  if (length(matches) == 0L) {
    stop("File '", file.name, "' was not found in the competition data ",
         "listing.", call. = FALSE)
  }
  # As before, the first matching entry wins.
  train.url <- kaggle.data[["href"]][matches[1L]]

  df <- httr::content(jump_to(session, train.url)$response)

  structure(df, session = session, desc = attr(kaggle.data, "desc"))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment