Skip to content

Instantly share code, notes, and snippets.

@leobarone
Last active June 26, 2017 20:02
Show Gist options
  • Save leobarone/14662767ca2cc685c345961e55f9b553 to your computer and use it in GitHub Desktop.
Save leobarone/14662767ca2cc685c345961e55f9b553 to your computer and use it in GitHub Desktop.
library(dplyr)
library(rvest)
library(stringr)
# Search-results page of the Chamber of Deputies speech database. The query
# string restricts the search to 17/04/2016 (dtInicio / dtFim) and asks for up
# to 5000 rows per page (PageSize).
url_tabela_discursos <- "http://www.camara.leg.br/internet/sitaqweb/resultadoPesquisaDiscursos.asp?txOrador=&txPartido=&txUF=&dtInicio=17%2F04%2F2016&dtFim=17%2F04%2F2016&txTexto=&txSumario=&basePesq=plenario&CampoOrdenacao=dtSessao&PageSize=5000&TipoOrdenacao=DESC&btnPesq=Pesquisar"

# Prefix prepended to the hrefs extracted from the results table.
url_base_discursos <- "http://www.camara.leg.br/internet/sitaqweb/"

# Pull the href of every link found inside a cell of the results table.
url_discursos <- url_tabela_discursos %>%
  read_html() %>%
  html_nodes(xpath = "//table[@class ='table table-bordered variasColunas']//td/a") %>%
  html_attr(name = "href")

# Remove every space, carriage return and line feed embedded in the hrefs
# (presumably the links are wrapped across lines in the page source), then
# turn them into absolute URLs.
url_discursos <- str_replace_all(url_discursos, "[ \r\n]", "")
url_discursos <- str_c(url_base_discursos, url_discursos)

# Keep only links to the speech-text page. The pattern is wrapped in fixed()
# so that "." and "?" are matched literally instead of being interpreted as
# regex metacharacters.
url_discursos <- str_subset(
  url_discursos,
  fixed(str_c(url_base_discursos, "TextoHTML.asp?"))
)
# Download every speech page and collect its text.
# For each URL, the text of every <p> inside the element with id "content" is
# extracted, so one page can contribute several elements: `discursos` ends up
# holding one entry per paragraph, not one entry per speech.
paragrafos_por_discurso <- lapply(url_discursos, function(url_discurso) {
  paragrafos <- url_discurso %>%
    read_html() %>%
    html_nodes(xpath = "//div[@id = 'content']//p") %>%
    html_text()
  # Pause between requests to avoid hammering the server.
  Sys.sleep(0.5)
  paragrafos
})

# Flatten the per-page results into a single character vector. Collecting in
# a list first avoids re-copying the whole vector on every iteration, and
# as.character() guarantees character(0) (rather than NULL) when there was
# nothing to download.
discursos <- as.character(unlist(paragrafos_por_discurso, use.names = FALSE))
library(tidytext)
library(ggplot2)
library(tidyr)
library(tm)
# One row per element of `discursos`, tagged with a sequential id.
# seq_along() is used instead of 1:length(), which would yield c(1, 0) for an
# empty vector; tibble() replaces the deprecated data_frame().
discursos_df <- tibble(
  id_discurso = seq_along(discursos),
  text = discursos
)
# Tokenise each text into overlapping two-word sequences (bigrams), one row
# per bigram.
# Texts with fewer than two words produce an NA bigram in recent tidytext
# versions; those rows are dropped so they do not show up later as "NA"
# words in the counts and in the graph.
discurso_bigrams <- discursos_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))

# Quick look at the most frequent raw bigrams (result is printed, not stored).
discurso_bigrams %>%
  count(bigram, sort = TRUE)
# Split the "bigram" column on the space into its two component words so each
# word can be inspected on its own.
bigrams_separated <- separate(
  discurso_bigrams,
  col = bigram,
  into = c("word1", "word2"),
  sep = " "
)
# Portuguese stopword list from tm, with "é" appended by hand.
stopwords_pt <- c(stopwords("pt"), "é")

# Single-column lookup table used by the anti-joins below.
# stringsAsFactors = FALSE keeps `word` as character on R < 4.0 as well, so
# the join keys have the same type on both sides.
stopwords_pt_df <- data.frame(word = stopwords_pt, stringsAsFactors = FALSE)

# Drop every bigram in which either word is a stopword.
bigrams_filtered <- bigrams_separated %>%
  anti_join(stopwords_pt_df, by = c("word1" = "word")) %>%
  anti_join(stopwords_pt_df, by = c("word2" = "word"))
# Frequency of each remaining (word1, word2) pair, most frequent first; the
# count is stored in column `n`.
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)
library(igraph)
library(ggraph)
# Build an igraph object from the pairs that occur more than 20 times: the
# first two columns (word1, word2) define the edges, and the remaining column
# (n) is carried along as an edge attribute.
bigram_graph <- graph_from_data_frame(
  d = filter(bigram_counts, n > 20)
)
# Draw the bigram network: edges as straight links, words as points, each
# labelled with its vertex name.
# NOTE(review): no seed is set before the "fr" layout is computed, so node
# positions may differ between runs - confirm whether reproducibility matters.
ggraph(graph = bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(mapping = aes(label = name), hjust = 1, vjust = 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment