# kguidonimartins/raspagem_site_342.R

## raspagem_site_342.R
# AUTOR: KARLO GUIDONI MARTINS
# CONTRIBUIÇÃO: GUSTAVO HENRIQUE DE CARVALHO
# DATA: 07 DE AGOSTO DE 2017

# RASPAGEM NOMES, DECISÕES, ESTADOS E PARTIDOS DOS DEPUTADOS FEDERAIS.
# DADOS DISPONÍVEIS NO SÍTIO ELETRÔNICO: "https://342agora.org.br/"

#' Install (when missing) and attach a set of packages.
#'
#' Adapted from https://gist.github.com/stevenworthington/3178163
#'
#' @param pkg Character vector of package names.
#' @return A logical vector named after `pkg`, holding the value returned by
#'   `require()` for each package (`TRUE` when it was attached).
ipak <- function(pkg) {
    # Requested packages that are not in the installed library yet.
    new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
    if (length(new.pkg) > 0) {
        install.packages(new.pkg, dependencies = TRUE)
    }
    # vapply() pins the result to a logical vector; sapply() returned an
    # empty list when `pkg` was empty.
    vapply(pkg, require, logical(1), character.only = TRUE)
}

# Install (when missing) and attach the packages used below.
ipak(c("rvest", "stringr", "dplyr"))

# The page elements were identified by following the tutorial at:
# https://blog.rstudio.com/2014/11/24/rvest-easy-web-scraping-with-r/

# On "https://342agora.org.br/", clicking one of the deputy lists redirects
# to a different URL. That URL contains one of the following words:
condicao <- c("contra", "a-favor", "ausente")

# With that in mind, a for loop can scrape the data from each page.

# Accumulators for the scraped fields; the loop stores one element per
# entry of `condicao` in each of them.
nome <- decisao <- estado <- partido <- emails <- list()

# INÍCIO DA RASPAGEM
# Scrape one listing page per entry of `condicao` and store the fields of
# that page at position i of each accumulator list.
for (i in seq_along(condicao)) {
    # Each decision has its own listing page: <site>/<decision>/
    url <- paste0("https://342agora.org.br/", condicao[i], "/")

    webpage <- read_html(url)

    # Deputy names: text of every '.makepressure_title' node.
    nome[[i]] <- webpage %>%
        html_nodes('.makepressure_title') %>%
        html_text()

    # The decision label, repeated once per name scraped from this page.
    decisao[[i]] <- rep(x = condicao[i], times = length(nome[[i]]))

    # Text of the '.makepressure_upper' nodes with newlines and tabs removed.
    # Fetched once and reused for both the state and the party.
    upper <- webpage %>%
        html_nodes('.makepressure_upper') %>%
        html_text() %>%
        gsub("[\n\t]", "", .)

    # State the deputy was elected for, upper-cased and trimmed.
    # NOTE(review): this relies on how stringr::word() treats an empty
    # separator to keep only the leading characters of the text; confirm
    # against the installed stringr version.
    estado[[i]] <- upper %>%
        word(start = 1,
             end = 3,
             sep = "") %>%
        toupper() %>%
        stringr::str_trim(side = "both")

    # Party: the second "/"-separated field, upper-cased and trimmed.
    partido[[i]] <- upper %>%
        word(start = 2, sep = fixed("/")) %>%
        toupper() %>%
        stringr::str_trim(side = "both")

    # E-mail addresses (contribution: Gustavo Henrique de Carvalho)
    # https://gist.github.com/gustavobio/883393808d3aed586d41a281248eaa6d
    full <- webpage %>%
        html_nodes(".makepressure_gmail") %>%
        html_attr("href")

    # The address is whatever sits between "to=" and "&su" in the link.
    hit <- regexpr("(?<=to=).*(?=&su)", full, perl = TRUE)
    found <- !is.na(hit) & hit > -1L

    # regmatches() drops links that do not match, which would shift every
    # later address onto the wrong row. Keep one slot per link and leave NA
    # where nothing matched.
    # NOTE(review): a deputy with no '.makepressure_gmail' node at all would
    # still leave this vector shorter than `nome[[i]]` - verify on the site.
    endereco <- rep(NA_character_, length(full))
    endereco[found] <- regmatches(full, hit)
    emails[[i]] <- endereco
}
# FIM DA RASPAGEM

# data.frame with the results: each per-page list is flattened into a single
# vector and the columns are named at construction time.
deputados <- data.frame(nome    = unlist(nome),
                        decisao = unlist(decisao),
                        estado  = unlist(estado),
                        partido = unlist(partido),
                        email   = unlist(emails))

# Structure of the data.frame.
str(deputados)

# First and last ten rows of the data.frame.
head(deputados, 10)
tail(deputados, 10)

# Example queries. Conditions passed to filter() as separate arguments are
# combined with a logical AND.
filter(deputados, decisao == "contra", estado == "MG")

filter(deputados, decisao == "a-favor", estado == "ES")

filter(deputados, decisao == "ausente", estado == "SP")

filter(deputados, decisao == "a-favor", estado == "SP", partido == "PSDB")

filter(deputados, decisao == "contra", partido == "PMDB")

filter(deputados, decisao == "a-favor", partido == "PMDB")
	# AUTOR: KARLO GUIDONI MARTINS
	# CONTRIBUIÇÃO: GUSTAVO HENRIQUE DE CARVALHO
	# DATA: 07 DE AGOSTO DE 2017

	# RASPAGEM NOMES, DECISÕES, ESTADOS E PARTIDOS DOS DEPUTADOS FEDERAIS.
	# DADOS DISPONÍVEIS NO SÍTIO ELETRÔNICO: "https://342agora.org.br/"

	#' Install (when missing) and attach a set of packages.
	#'
	#' Adapted from https://gist.github.com/stevenworthington/3178163
	#'
	#' @param pkg Character vector of package names.
	#' @return A logical vector named after `pkg`, holding the value returned by
	#'   `require()` for each package (`TRUE` when it was attached).
	ipak <- function(pkg) {
	    # Requested packages that are not in the installed library yet.
	    new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
	    if (length(new.pkg) > 0) {
	        install.packages(new.pkg, dependencies = TRUE)
	    }
	    # vapply() pins the result to a logical vector; sapply() returned an
	    # empty list when `pkg` was empty.
	    vapply(pkg, require, logical(1), character.only = TRUE)
	}

	# Install (when missing) and attach the packages used below.
	ipak(c("rvest", "stringr", "dplyr"))

	# The page elements were identified by following the tutorial at:
	# https://blog.rstudio.com/2014/11/24/rvest-easy-web-scraping-with-r/

	# On "https://342agora.org.br/", clicking one of the deputy lists redirects
	# to a different URL. That URL contains one of the following words:
	condicao <- c("contra", "a-favor", "ausente")

	# With that in mind, a for loop can scrape the data from each page.

	# Accumulators for the scraped fields; the loop stores one element per
	# entry of `condicao` in each of them.
	nome <- decisao <- estado <- partido <- emails <- list()

	# INÍCIO DA RASPAGEM
	# Scrape one listing page per entry of `condicao` and store the fields of
	# that page at position i of each accumulator list.
	for (i in seq_along(condicao)) {
	    # Each decision has its own listing page: <site>/<decision>/
	    url <- paste0("https://342agora.org.br/", condicao[i], "/")

	    webpage <- read_html(url)

	    # Deputy names: text of every '.makepressure_title' node.
	    nome[[i]] <- webpage %>%
	        html_nodes('.makepressure_title') %>%
	        html_text()

	    # The decision label, repeated once per name scraped from this page.
	    decisao[[i]] <- rep(x = condicao[i], times = length(nome[[i]]))

	    # Text of the '.makepressure_upper' nodes with newlines and tabs removed.
	    # Fetched once and reused for both the state and the party.
	    upper <- webpage %>%
	        html_nodes('.makepressure_upper') %>%
	        html_text() %>%
	        gsub("[\n\t]", "", .)

	    # State the deputy was elected for, upper-cased and trimmed.
	    # NOTE(review): this relies on how stringr::word() treats an empty
	    # separator to keep only the leading characters of the text; confirm
	    # against the installed stringr version.
	    estado[[i]] <- upper %>%
	        word(start = 1,
	             end = 3,
	             sep = "") %>%
	        toupper() %>%
	        stringr::str_trim(side = "both")

	    # Party: the second "/"-separated field, upper-cased and trimmed.
	    partido[[i]] <- upper %>%
	        word(start = 2, sep = fixed("/")) %>%
	        toupper() %>%
	        stringr::str_trim(side = "both")

	    # E-mail addresses (contribution: Gustavo Henrique de Carvalho)
	    # https://gist.github.com/gustavobio/883393808d3aed586d41a281248eaa6d
	    full <- webpage %>%
	        html_nodes(".makepressure_gmail") %>%
	        html_attr("href")

	    # The address is whatever sits between "to=" and "&su" in the link.
	    hit <- regexpr("(?<=to=).*(?=&su)", full, perl = TRUE)
	    found <- !is.na(hit) & hit > -1L

	    # regmatches() drops links that do not match, which would shift every
	    # later address onto the wrong row. Keep one slot per link and leave NA
	    # where nothing matched.
	    # NOTE(review): a deputy with no '.makepressure_gmail' node at all would
	    # still leave this vector shorter than `nome[[i]]` - verify on the site.
	    endereco <- rep(NA_character_, length(full))
	    endereco[found] <- regmatches(full, hit)
	    emails[[i]] <- endereco
	}
	# FIM DA RASPAGEM

	# data.frame with the results: each per-page list is flattened into a single
	# vector and the columns are named at construction time.
	deputados <- data.frame(nome    = unlist(nome),
	                        decisao = unlist(decisao),
	                        estado  = unlist(estado),
	                        partido = unlist(partido),
	                        email   = unlist(emails))

	# Structure of the data.frame.
	str(deputados)

	# First and last ten rows of the data.frame.
	head(deputados, 10)
	tail(deputados, 10)

	# Example queries. Conditions passed to filter() as separate arguments are
	# combined with a logical AND.
	filter(deputados, decisao == "contra", estado == "MG")

	filter(deputados, decisao == "a-favor", estado == "ES")

	filter(deputados, decisao == "ausente", estado == "SP")

	filter(deputados, decisao == "a-favor", estado == "SP", partido == "PSDB")

	filter(deputados, decisao == "contra", partido == "PMDB")

	filter(deputados, decisao == "a-favor", partido == "PMDB")