Skip to content

Instantly share code, notes, and snippets.

@sillasgonzaga
Last active November 21, 2017 21:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sillasgonzaga/2d626f33ee1b635a0aa3beeda31ae720 to your computer and use it in GitHub Desktop.
Save sillasgonzaga/2d626f33ee1b635a0aa3beeda31ae720 to your computer and use it in GitHub Desktop.
#' Plot the highest tf-idf terms of each document as faceted bar charts
#'
#' Tokenises the `corpo_noticia` column into n-grams (keeping the original
#' letter case), optionally filters the tokens, computes tf-idf per
#' `documento`, and plots the 20 highest-scoring terms of every document.
#'
#' @param data A data frame with at least the columns `corpo_noticia` (text to
#'   tokenise) and `documento` (grouping used as the tf-idf document). The
#'   column `url` is also required when `agregar_por_noticia` is `TRUE`.
#' @param n_grams Number of words per token, forwarded to `unnest_tokens()`.
#' @param remover_nomes_proprios If `TRUE`, keep only tokens that are entirely
#'   lower case or entirely upper case, which drops capitalised tokens such as
#'   proper nouns.
#' @param agregar_por_noticia If `TRUE`, count each token at most once per
#'   `url`, so a term repeated inside a single news item is not over-weighted.
#' @param remover_stop_words If `TRUE`, drop tokens found in the Portuguese
#'   stop-word list of the tm package (matched case-insensitively).
#' @return A ggplot object with one facet per `documento`; each bar is
#'   labelled with the raw count `n` of the term.
grafico_tfidf <- function(data, n_grams = 1, remover_nomes_proprios,
                          agregar_por_noticia, remover_stop_words = TRUE) {
  # Case is preserved on purpose: the proper-noun filter below relies on it.
  tokens <- data %>%
    unnest_tokens(palavra, corpo_noticia, token = "ngrams", n = n_grams,
                  to_lower = FALSE)

  if (agregar_por_noticia) {
    # Remove duplicated tokens within the same news item.
    tokens <- tokens %>%
      distinct(url, palavra, .keep_all = TRUE)
  }

  if (remover_nomes_proprios) {
    # Keep only tokens with no mixed case (all lower case or all upper case).
    tokens <- tokens %>%
      filter(palavra == str_to_lower(palavra) |
               palavra == str_to_upper(palavra))
  }

  if (remover_stop_words) {
    # The tokens still carry their original case, so compare the lowercased
    # form; otherwise "De" or "QUE" would slip past the lowercase stop list.
    # NOTE(review): for n_grams > 1 each token is a multi-word string, so this
    # single-word comparison presumably removes nothing -- confirm intent.
    stop_words_pt <- tm::stopwords("pt")
    tokens <- tokens %>%
      filter(!str_to_lower(palavra) %in% stop_words_pt)
  }

  tokens %>%
    count(documento, palavra) %>%
    bind_tf_idf(term = palavra, document = documento, n = n) %>%
    # Top 20 terms per document: sort by score, then keep the first 20 rows
    # of each group.
    group_by(documento) %>%
    arrange(desc(tf_idf)) %>%
    filter(row_number() <= 20) %>%
    ungroup() %>%
    arrange(desc(tf_idf)) %>%
    # Reverse the level order so the highest score ends up at the top of the
    # flipped axis.
    mutate(palavra = factor(palavra, levels = rev(unique(palavra)))) %>%
    ggplot(aes(palavra, tf_idf, fill = documento)) +
    geom_col(show.legend = FALSE) +
    geom_text(aes(label = n), hjust = 1.3) +
    labs(x = NULL, y = "tf-idf") +
    facet_wrap(~documento, ncol = 2, scales = "free") +
    coord_flip()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment