Skip to content

Instantly share code, notes, and snippets.

@franvillamil
Last active April 28, 2021 11:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save franvillamil/d0e81d059f8bfd1b87fe76ede9b47f34 to your computer and use it in GitHub Desktop.
Save franvillamil/d0e81d059f8bfd1b87fe76ede9b47f34 to your computer and use it in GitHub Desktop.
Scrap apellidos from INE & aggregate
setwd("...")
library(stringr)
library(RSelenium)
library(XML)
library(dplyr)
library(muniSpain)
options(stringsAsFactors = FALSE)
# Encoding problems, using Spanish enconding here
Sys.setlocale("LC_ALL", "ES_ES.UTF-8")
# APELLIDOS (NOTE: EXAMPLE)
apellidos = c("villamil", "garcia", "robledo")
### ==================================================================
### 1. SCRAPING (RSelenium)
url = "https://www.ine.es/dyngs/INEbase/es/operacion.htm?c=Estadistica_C&cid=1254736177009&menu=resultados&secc=1254736195497&idp=1254734710990#!tabs-1254736195497"
# Setting up RSelenium server etc
rd = rsDriver(port = 4444L, browser = "firefox")
remo_dr = rd$client
remo_dr$setTimeout(type = "page load", milliseconds = 100000)
# Create data.frame to fill
ape_df = data.frame(prov = NULL, primero = NULL,
segundo = NULL, ambos = NULL, apellido = NULL)
### FOR LOOP FOR EACH APELLIDO
for(i in 1:length(apellidos)){
# Loop status
print(paste0(i, " ----- ", round((i/length(apellidos))*100, 0), "%"))
# Go to website
remo_dr$navigate(url)
# Select text box, clean, click, and type in
text_box = remo_dr$findElement('xpath', '//*[contains(@value,"Escriba un apellido")]')
text_box$clearElement()
text_box$clickElement()
text_box$sendKeysToElement(list(apellidos[i]))
# Locate and click button (PROV NACIMIENTO)
button = remo_dr$findElement('xpath', '//*[@id="tapellidos1"]/p[2]/button[2]')
button$clickElement()
# Switch to new window and find table
pages = remo_dr$getWindowHandles()
remo_dr$switchToWindow(pages[[2]])
# Get page source
source = remo_dr$getPageSource()[[1]]
# Check if we have results; if not, fill with blank
if(grepl("No existen habitantes con el apellido consultado", source)){
# Fill blank
ape_df = rbind(ape_df, data.frame(prov = NA, primero = NA,
segundo = NA, ambos = NA, apellido = apellidos[i]))
# Switch to main window again and restat loop
remo_dr$closeWindow()
remo_dr$switchToWindow(pages[[1]])
next
}
# Find table in source
str = str_locate(source, '<table summary=\"resultados\"')[[1]][1]
end = str_locate_all(source, '</table>')[[1]][,2]
tmp = end - str
end = end[which(tmp == min(tmp[tmp>0]))]
source = str_sub(source, str, end)
# Parse and
source = htmlParse(source)
table = adapt(readHTMLTable(source)[[1]])
# Cleaning up table
if(table[1,1] == "Provincia" &
str_sub(table[2,6], 1, 7) == "Por mil"){
table = table[3:nrow(table),c(1,2,4,6)]
for(j in 2:ncol(table)){
table[,j] = as.integer(gsub("[^0-9]", "", table[,j]))
}
} else {
print("wtf?")
}
# Assigning column names, appending apellido
table$apellido = apellidos[i]
names(table)[names(table) == "V1"] = "prov"
names(table)[names(table) == "V2"] = "primero"
names(table)[names(table) == "V4"] = "segundo"
names(table)[names(table) == "V6"] = "ambos"
# Adapting province name
table$prov = adapt(table$prov, tolower = TRUE)
ape_df = rbind(ape_df, table)
# Wait a little bit (0.5-2 sec)
Sys.sleep(runif(1, min = 0.2, max = 0.8))
# Switch to main window again
remo_dr$closeWindow()
remo_dr$switchToWindow(pages[[1]])
}
# Adapting province name
ape_df$prov[ape_df$prov == "valencia/valaa\250ncia"] = "valencia"
ape_df$prov[ape_df$prov == "alicante/alacant"] = "alicante"
ape_df$prov[ape_df$prov == "almeraa\255a"] = "almeria"
ape_df$prov[ape_df$prov == "araba/aa\201lava"] = "alava"
ape_df$prov[ape_df$prov == "araba/a\u0081lava"] = "alava"
ape_df$prov[ape_df$prov == "castellaa\263n/castellaa\263"] = "castellon"
ape_df$prov[ape_df$prov == "caa\263rdoba"] = "cordoba"
ape_df$prov[ape_df$prov == "coruaa\261a, a"] = "a coruna"
ape_df$prov[ape_df$prov == "jaaa\251n"] = "jaen"
ape_df$prov[ape_df$prov == "leaa\263n"] = "leon"
ape_df$prov[ape_df$prov == "aa\201vila"] = "avila"
ape_df$prov[ape_df$prov == "a\u0081vila"] = "avila"
ape_df$prov[ape_df$prov == "balears, illes"] = "baleares"
ape_df$prov[ape_df$prov == "rioja, la"] = "la rioja"
ape_df$prov[ape_df$prov == "palmas, las"] = "las palmas"
ape_df$prov[ape_df$prov == "ca³rdoba"] = "cordoba"
ape_df$prov[ape_df$prov == "araba/a\u0081lava"] = "alava"
ape_df$prov[ape_df$prov == "a\u0081vila"] = "avila"
ape_df$prov[ape_df$prov == "castella³n/castella³"] = "castellon"
ape_df$prov[ape_df$prov == "corua±a, a"] = "a coruna"
ape_df$prov[ape_df$prov == "jaa©n"] = "jaen"
ape_df$prov[ape_df$prov == "lea³n"] = "leon"
ape_df$prov[ape_df$prov == "valencia/vala¨ncia"] = "valencia"
ape_df$prov[ape_df$prov == "almera­a"] = "almeria"
ape_df$prov[ape_df$prov == "almera­a "] = "almeria"
ape_df$prov[ape_df$prov == "castella³n/castella³"] = "castellon"
ape_df$prov[ape_df$prov == "ca³rdoba"] = "cordoba"
ape_df$prov[ape_df$prov == "corua±a, a"] = "a coruna"
ape_df$prov[ape_df$prov == "jaa©n"] = "jaen"
ape_df$prov[ape_df$prov == "lea³n"] = "leon"
ape_df$prov[ape_df$prov == "valencia/vala¨ncia"] = "valencia"
# Column with total hits
ape_df$total = rowSums(ape_df[,c("primero","segundo","ambos")], na.rm = TRUE)
# Saving raw data
write.csv(ape_df, "apellidos.csv", row.names = FALSE)
### ==================================================================
### 2. AGGREGATING: TOTAL & CAT/EUK
# Define Cat & Euk provinces
cat_p = c("barcelona", "girona", "lleida", "tarragona")
euk_p = c("gipuzkoa", "alava", "bizkaia", "navarra")
ape_agr = ape_df %>%
# Aggregate (either 1st, 2nd or both apellidos)
group_by(apellido) %>%
summarize(
cat = sum(total[provincia %in% cat_p], na.rm = T),
euk = sum(total[provincia %in% euk_p], na.rm = T),
total = sum(total[provincia == "total"])) %>%
# Turning to %
mutate(cat = cat/total,
euk = euk/total)
# Save
write.csv(ape_agr, "apellidos_aggregated_cat_euk.csv", row.names = FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment