Skip to content

Instantly share code, notes, and snippets.

@patperu
Last active August 29, 2015 14:18
Show Gist options
  • Select an option

  • Save patperu/60067bc8a391af09932b to your computer and use it in GitHub Desktop.

Select an option

Save patperu/60067bc8a391af09932b to your computer and use it in GitHub Desktop.
library('rvest') # >= rvest 0.2.0.9000
library('dplyr')
library('stringr')
library('pbapply')
options(stringsAsFactors = FALSE)
###########################################################################################
Get_Provincia_pagine <- function() {
  # Build the province lookup table used to drive the scrape.
  #
  # Reads the search form on the listing page, takes the options of its
  # `prov` field as province ids, and asks the site how many result pages
  # each province has.
  #
  # Returns:
  #   A data.frame with one row per province and the columns
  #     prov_id     - option value of the `prov` form field ("0" is dropped)
  #     provincia   - row name attached to that option (presumably the
  #                   province label shown in the form; verify on the site)
  #     prov_pagine - number of result pages, NA when it could not be parsed

  count_pages <- function(url) {
    # Return the number of pages from the table footer (div.num_pagine).
    # Original author's example: "Pagina 1 di 59 Inizio Prec. 1 2 ... Succ. Fine"
    # should give 59.
    # NOTE(review): the pattern keeps the text between the first "in" and the
    # word "pagine" (or the end of the string) - confirm it still matches the
    # live footer text.
    footer <- html_text(html_node(read_html(url), "div.num_pagine"))
    # http://unix.stackexchange.com/a/92543
    hit <- regmatches(footer, regexpr("in\\K.*?(?=pagine|$)", footer, perl = TRUE))
    # regmatches() drops non-matches, so a missing footer yields character(0).
    # Return an explicit NA so the caller keeps one value per province.
    if (length(hit) == 0L) {
      return(NA_real_)
    }
    as.numeric(str_trim(hit))
  }

  num_pages <- function(prov_id) {
    url_01 <- "http://www.agenziadoganemonopoli.gov.it/wps/wcm/connect/Internet/ed/Monopoli/Giochi/Apparecchi_intr/Elenco_soggetti_Ries/?pagina=1&id_pagina=&prov="
    url_02 <- "&anno=0&tipo_app=&el=2&CACHE=NONE"
    # vapply() enforces exactly one numeric per province; a shorter vector
    # could otherwise be recycled silently when assigned to the data.frame
    # and misalign page counts with provinces.
    vapply(
      prov_id,
      function(id) count_pages(paste0(url_01, id, url_02)),
      numeric(1),
      USE.NAMES = FALSE
    )
  }

  data_url <- "http://www.agenziadoganemonopoli.gov.it/wps/wcm/connect/Internet/ed/Monopoli/Giochi/Apparecchi_intr/Elenco_soggetti_Ries/?el=2&CACHE=NONE"
  forms <- html_form(html_session(data_url))
  # The second form on the page carries the `prov` field.
  x <- data.frame(prov_id = forms[[2]]$fields$prov$options, stringsAsFactors = FALSE)
  x$provincia <- rownames(x)
  # Drop the option with value 0 (presumably the "all provinces" placeholder).
  x <- x[x$prov_id != 0, ]
  x <- x[order(x$prov_id), ]
  rownames(x) <- NULL
  x$prov_pagine <- num_pages(x$prov_id)
  x
}
conv_numeric <- function(x) {
  # Parse numbers written with a decimal comma (e.g. "12,5") into numerics.
  # Every comma is replaced by a dot before the conversion; values that are
  # still not numeric afterwards become NA (with the usual coercion warning).
  with_dots <- gsub(",", ".", x, fixed = TRUE)
  as.numeric(with_dots)
}
conv_colnames <- function(x) {
  # Return `x` with every space in its names replaced by an underscore,
  # so the columns can be referenced without backticks.
  setNames(x, gsub(" ", "_", names(x)))
}
get_data <- function(prov_id, prov_pagine) {
  # Download every result page of one province.
  #
  # Args:
  #   prov_id:     province id; sent to the site as the query part "prov=<id>".
  #   prov_pagine: number of result pages to fetch for this province.
  #
  # Returns:
  #   A list with one element per page: the table found under
  #   "table.tabella_d.reduce70", with an extra PROV_ID column holding prov_id.
  #
  # Relies on the script-level URL fragments url01, url02 and url03.
  message("Provincia: ", prov_id, " // Pagine: ", prov_pagine)
  str_prov <- paste0("prov=", prov_id)
  # seq_len() gives an empty sequence for zero pages, whereas 1:0 would
  # request the non-existent pages 1 and 0.
  pblapply(seq_len(prov_pagine), function(page) {
    # read_html() is what the page counter in this file already uses;
    # html() is its deprecated predecessor in rvest.
    page_doc <- read_html(paste0(url01, page, url02, str_prov, url03))
    mm <- html_table(html_node(page_doc, "table.tabella_d.reduce70"))
    mm$PROV_ID <- prov_id
    mm
  })
}
###########################################################################################
# URL fragments read by get_data(); a page URL is assembled as
# url01 <page number> url02 "prov=<id>" url03.
url01 <- "http://www.agenziadoganemonopoli.gov.it/wps/wcm/connect/Internet/ed/Monopoli/Giochi/Apparecchi_intr/Elenco_soggetti_Ries/?pagina="
url02 <- "&id_pagina=&"
url03 <- "&anno=0&tipo_app=&el=2&CACHE=NONE"

start.time <- Sys.time()

# Province table: prov_id, provincia, prov_pagine.
x <- Get_Provincia_pagine()

# Fetch all pages of all provinces. SIMPLIFY = FALSE keeps the result a plain
# list (one list of page tables per province); with the default, mapply()
# collapses it into a matrix / flattened list whenever every province happens
# to have the same number of pages, which breaks the binding step below.
fin <- mapply(get_data, x$prov_id, x$prov_pagine, SIMPLIFY = FALSE)

# Stack the pages of each province, then stack the provinces.
# do.call("rbind", ...) binds in one pass instead of copying the growing
# result once per page as Reduce("rbind", ...) does.
fin <- do.call("rbind", lapply(fin, function(pages) do.call("rbind", pages)))

# Tidy up: underscore column names, numeric surface area, fixed year tag.
fin <- tbl_df(fin) %>%
  conv_colnames() %>%
  mutate(SUPERFICIE_DEL_LOCALE_IN_MQ = conv_numeric(SUPERFICIE_DEL_LOCALE_IN_MQ),
         ANNO = 2014)

end.time <- Sys.time()
time.taken <- end.time - start.time

# Save the province table, the scraped data and the run time in a file
# stamped with the date the run started.
outfile <- paste0("Elenco_soggetti_per_esercizi_",
                  format(start.time, "%Y_%m_%d"), ".rdata")
save(x, fin, time.taken, file = outfile)
#
# END
#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment