Skip to content

Instantly share code, notes, and snippets.

@tts
Last active August 29, 2015 14:06
Show Gist options
  • Save tts/38909f047185e4c306aa to your computer and use it in GitHub Desktop.
Save tts/38909f047185e4c306aa to your computer and use it in GitHub Desktop.
Kilometrikisa results data
####################################
#
# Example of how to follow a link.
# The location (community) info of
# the teams is on a separate page.
#
# TODO: collect info of all teams
#
#####################################
# Follow the link of one team from the team listing and read the team's
# community ("Kunta") from the team page.
# rvest supplies html_session(), follow_link(), html_nodes(), html_text()
# and the %>% pipe, so load it here to keep the example runnable on its own.
# NOTE(review): newer rvest releases appear to rename html_session() and
# follow_link() to session() / session_follow_link() -- confirm against the
# installed version before switching.
library(rvest)

url <- "http://www.kilometrikisa.fi/teams"
s <- html_session(url)

community <- s %>%
  follow_link("11. kerroksen polkijat (EK)") %>%
  # The community text sits in the parent element of the "home" icon
  html_nodes(xpath = "//i[@class='icon-home']/..") %>%
  html_text(trim = TRUE) %>%
  # Drop the "Kunta:" label together with whatever whitespace follows it,
  # instead of relying on one exact newline/tab sequence in the page markup
  gsub("Kunta:[[:space:]]*", "", .)
###################################################################
#
# Data from Kilometrikisa results
# http://www.kilometrikisa.fi/
#
# 24.9.2014
# Tuija Sonkkila
#
# If you don't need the location/community info of the teams,
# the rest of the data is here:
# https://www.dropbox.com/s/ljxeypkihza8wt6/kmkisadata.csv?dl=0
#
####################################################################
library(rvest)
library(dplyr)
# Scrape the paginated team results table and store one data frame per
# results page in `resultslist`.

# Number of result pages to fetch (49 pages)
n_pages <- 49L
baseurl <- "http://www.kilometrikisa.fi/teams/?page="

# Text of the first node matching `xpath` inside each table row.
# `rows` is a node set of <tr> elements; `trim` is passed on to html_text().
cell_text <- function(rows, xpath, trim = FALSE) {
  rows %>%
    html_node(xpath = xpath) %>%
    html_text(trim = trim)
}

# Convert strings written with a decimal comma to numeric.
parse_decimal <- function(x) {
  as.numeric(gsub(",", ".", x, fixed = TRUE))
}

# http://adv-r.had.co.nz/Profiling.html#avoid-copies
# So: storing in a list (one slot per page), and rbinding only at the end
resultslist <- vector("list", n_pages)

for (i in seq_len(n_pages)) {
  rows <- paste0(baseurl, i) %>%
    read_html(encoding = "UTF-8") %>%
    html_node("table") %>%
    # Leading "." keeps the search inside the selected table; a bare "//"
    # would search the whole document again
    html_nodes(xpath = ".//tbody//tr")

  pos <- as.integer(cell_text(rows, "td[1]", trim = TRUE))
  team <- cell_text(rows, "td[2]/a")
  # The span text is wrapped in parentheses; strip them before converting
  persons <- as.integer(
    gsub("[()]", "", cell_text(rows, "td[2]/span", trim = TRUE))
  )
  kmpp <- parse_decimal(cell_text(rows, "td[3]"))
  kmsum <- parse_decimal(cell_text(rows, "td[4]"))
  daysapp <- parse_decimal(cell_text(rows, "td[5]", trim = TRUE))
  gas <- as.integer(cell_text(rows, "td[6]"))
  co2 <- as.integer(cell_text(rows, "td[7]"))
  donation <- cell_text(rows, "td[8]", trim = TRUE)

  resultslist[[i]] <- data.frame(
    # rep() instead of a bare scalar so a page without rows yields an
    # empty data frame rather than a "differing number of rows" error
    page = rep(i, length(pos)),
    pos = pos,
    team = team,
    persons = persons,
    kmpp = kmpp,
    kmsum = kmsum,
    daysapp = daysapp,
    gas = gas,
    co2 = co2,
    donation = donation,
    stringsAsFactors = FALSE
  )
}
# Combine the data frames stored in `resultslist` into one data frame.
# bind_rows() replaces rbind_all(), which is no longer usable in dplyr.
resultsdf <- dplyr::bind_rows(resultslist)

# Convert the team names from UTF-8 to ISO-8859-1 before writing
resultsdf$team <- iconv(resultsdf$team, "UTF-8", "ISO-8859-1")

# Write the result as a semicolon-separated file without row names
write.table(
  resultsdf,
  file = "kmkisadata.csv",
  sep = ";",
  row.names = FALSE
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment