Skip to content

Instantly share code, notes, and snippets.

@franvillamil
Last active September 26, 2017 15:28
Show Gist options
  • Save franvillamil/8a6c85a0539108f0b0d8db27875a40ba to your computer and use it in GitHub Desktop.
Save franvillamil/8a6c85a0539108f0b0d8db27875a40ba to your computer and use it in GitHub Desktop.
R code to scrape the INE website and get census data
# Work from the home directory: the relative "prov_files/" output path used
# further down resolves against it.
setwd("~")
# rvest supplies the session/form/table scraping helpers; stringr supplies the
# str_locate()/str_sub() calls used to cut the municipality name.
library(rvest)
library(stringr)
# ---------------
# FUNCTIONS
# Translate province codes into lowercase, unaccented province names.
#
# x: vector of positions (province codes 1-50) into the lookup table.
#
# Returns the names found at those positions; standard vector indexing
# applies, so a position past the end of the table yields NA.
prov_code_to_name <- function(x) {
  province_names <- c(
    # 1-10
    "alava", "albacete", "alicante", "almeria", "avila",
    "badajoz", "baleares", "barcelona", "burgos", "caceres",
    # 11-20
    "cadiz", "castellon", "ciudad real", "cordoba", "a coruna",
    "cuenca", "girona", "granada", "guadalajara", "gipuzkoa",
    # 21-30
    "huelva", "huesca", "jaen", "leon", "lleida",
    "la rioja", "lugo", "madrid", "malaga", "murcia",
    # 31-40
    "navarra", "ourense", "asturias", "palencia", "las palmas",
    "pontevedra", "salamanca", "santa cruz de tenerife", "cantabria", "segovia",
    # 41-50
    "sevilla", "soria", "tarragona", "teruel", "toledo",
    "valencia", "valladolid", "bizkaia", "zamora", "zaragoza"
  )
  province_names[x]
}
# Replace accented Latin letters with their plain ASCII base letter.
#
# x: character vector (anything gsub() can coerce to character).
#
# Returns x with every listed accented character swapped for its unaccented
# counterpart; characters not in the table are left untouched.
unicode <- function(x) {
  # Parallel vectors: accented[k] is replaced by plain[k].
  accented <- c(
    "\u00d1", "\u00c7",                      # Ñ Ç
    "\u00c0", "\u00c1", "\u00c4",            # À Á Ä
    "\u00c8", "\u00c9",                      # È É
    "\u00cc", "\u00cd", "\u00cf",            # Ì Í Ï
    "\u00d2", "\u00d3", "\u00d6",            # Ò Ó Ö
    "\u00d9", "\u00da", "\u00db", "\u00dc",  # Ù Ú Û Ü
    "\u00f1", "\u00e7",                      # ñ ç
    "\u00e0", "\u00e1", "\u00e4",            # à á ä
    "\u00e8", "\u00e9",                      # è é
    "\u00ec", "\u00ed", "\u00ef",            # ì í ï
    "\u00f2", "\u00f3", "\u00f6",            # ò ó ö
    "\u00f9", "\u00fa", "\u00fb", "\u00fc"   # ù ú û ü
  )
  plain <- c(
    "N", "C",
    "A", "A", "A",
    "E", "E",
    "I", "I", "I",
    "O", "O", "O",
    "U", "U", "U", "U",
    "n", "c",
    "a", "a", "a",
    "e", "e",
    "i", "i", "i",
    "o", "o", "o",
    "u", "u", "u", "u"
  )
  for (k in seq_along(accented)) {
    x <- gsub(accented[k], plain[k], x)
  }
  x
}
# ---------------
# Entry page of the INE intercensal application. It is fetched again for every
# municipality because the query form is read from a fresh session each time.
url <- "http://www.ine.es/intercensal/inicio.do?regIni=51&regFin=100&L=1"

# Create the output folder before scraping. Without it, write.csv() would only
# fail after all 999 requests of the first province had already been made.
dir.create("prov_files", showWarnings = FALSE)

# Fetch the census series of one municipality through the INE query form.
#
# url:       address of the page holding the query form.
# prov_code: province code, submitted as 'codigoProvincia'.
# muni_code: municipality number within the province, submitted as
#            'codigoMunicipio'.
#
# Returns a one-row data.frame (prov_code, prov_name, muni_code, muni_name and
# one "c<year>" column per census column found), or NULL when the response
# contains no ".TITULOH3" heading, in which case the code pair is skipped.
#
# NOTE(review): html_session(), set_values() and submit_form() are the
# pre-1.0 rvest names (later renamed session(), html_form_set() and
# session_submit()); they are kept so the script still matches the rvest
# version it was written against.
scrape_municipality <- function(url, prov_code, muni_code) {
  pgsession <- html_session(url)
  # The third form on the page is the one carrying the two code fields.
  pgform <- html_form(pgsession)[[3]]
  pgform <- set_values(
    pgform,
    "codigoProvincia" = prov_code,
    "codigoMunicipio" = muni_code
  )
  resp <- submit_form(pgsession, pgform)
  page <- httr::content(resp$response)

  title <- html_text(html_nodes(page, ".TITULOH3"))
  if (length(title) == 0) {
    return(NULL)
  }

  # Five-character municipality id: 2-digit province + 3-digit municipality.
  muni_id <- sprintf("%02d%03d", prov_code, muni_code)

  # Strip line breaks and non-breaking spaces, collapse runs of whitespace,
  # then keep only what follows the municipality id inside the heading.
  muni_name <- gsub("\r|\n|(\u00a0)|", "", title)
  muni_name <- gsub("\\s+", " ", muni_name)
  muni_name <- gsub(" $", "", muni_name)
  muni_name <- str_sub(
    muni_name,
    str_locate(muni_name, muni_id)[, 2] + 1,
    -1L
  )
  muni_name <- unicode(muni_name)
  prov_name <- prov_code_to_name(prov_code)

  # The census figures are read from the second table of the response.
  muni_data <- html_table(html_nodes(page, "table"), fill = TRUE)[[2]]
  # Clean every cell column by column. lapply() avoids reusing the outer loop
  # index and does not depend on how `[, i]` behaves for the table class.
  muni_data[] <- lapply(muni_data, function(column) {
    gsub("\r|\n|(\u00a0)| ", "", column)
  })

  # Row 1 is used as the census-year labels and row 3 as the population
  # values; the first column is skipped in both.
  pop_data <- as.numeric(unlist(muni_data[3, -1], use.names = FALSE))
  year_data <- as.character(unlist(muni_data[1, -1], use.names = FALSE))
  # Remove single-character tags in brackets or parentheses from the labels
  # (presumably footnote markers -- confirm against the live page).
  year_data <- gsub("\\[.\\]|\\(.\\)", "", year_data)

  # 18 columns is the expected layout. Anything else is reported, and repeated
  # year labels are reduced to their first occurrence.
  if (length(year_data) != 18) {
    print(paste0("Warning: ", length(year_data), " columns"))
    keep <- !duplicated(year_data)
    pop_data <- pop_data[keep]
    year_data <- year_data[keep]
  }

  census <- as.data.frame(rbind(pop_data))
  names(census) <- paste0("c", year_data)
  rownames(census) <- muni_id
  data.frame(
    prov_code = prov_code,
    prov_name = prov_name,
    muni_code = muni_id,
    muni_name = muni_name,
    census,
    check.names = FALSE
  )
}

# One CSV per province: try every municipality number from 1 to 999 and keep
# the ones for which the site returns a result.
for (prov_code in 1:50) {
  muni_rows <- list()
  for (muni_code in 1:999) {
    print(muni_code) # progress indicator
    muni_row <- scrape_municipality(url, prov_code, muni_code)
    if (!is.null(muni_row)) {
      muni_rows[[length(muni_rows) + 1L]] <- muni_row
    }
  }
  # Bind once per province instead of growing a data.frame on every hit.
  census_data <- if (length(muni_rows) > 0) {
    do.call(rbind, muni_rows)
  } else {
    data.frame()
  }
  out_file <- paste0("prov_files/", prov_code_to_name(prov_code), ".csv")
  write.csv(census_data, out_file, row.names = FALSE)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment