Skip to content

Instantly share code, notes, and snippets.

@sergiospagnuolo
Created April 28, 2023 17:28
Show Gist options
  • Save sergiospagnuolo/2908577317ac15033aca28bf433d5b10 to your computer and use it in GitHub Desktop.
Save sergiospagnuolo/2908577317ac15033aca28bf433d5b10 to your computer and use it in GitHub Desktop.
Análise de termos do PL das Fake News
suppressMessages(library(tidyverse))
suppressMessages(library(tidytext))
suppressMessages(library(wordcloud2))
suppressMessages(library(lubridate))
library(ngram)
# Load the source text, one row per line, into the single column `V1`.
# readLines() is used rather than read.csv(sep = "&") so that ampersands and
# double quotes inside the text are kept verbatim instead of being interpreted
# as CSV separators / quoting (which would silently drop or merge text).
d <- data.frame(
  V1 = readLines("texto_pl_2630.txt", warn = FALSE),
  stringsAsFactors = FALSE
)
# STOPWORDS ----
# Both lists are downloaded from published Google Sheets CSV exports.
# `stopwords_pt` is kept as a data frame; the token filters in this script
# read its `word` column.
stopwords_pt <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vRutQtQqbFVYYP8uwytSyewxtxn19smtWWxsoNai9G6uEg6ytF7Z4IVhYZ5rXx4bgN-IYkSnsF8bSAe/pub?gid=1009958428&single=true&output=csv", header = TRUE)
# `bigram_stop` is the plain vector held in the sheet's `bigram` column.
# `[[` is used instead of `$` so the column name must match exactly
# (no partial matching on the data frame).
bigram_stop <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vRutQtQqbFVYYP8uwytSyewxtxn19smtWWxsoNai9G6uEg6ytF7Z4IVhYZ5rXx4bgN-IYkSnsF8bSAe/pub?gid=58185156&single=true&output=csv", header = TRUE)[["bigram"]]
# GENERATE THE BIGRAMS ----
# Tokenise `d$V1` into two-word n-grams, discard every pair in which either
# word is a stopword or looks like noise, and keep the pairs that occur at
# least twice. The result has two columns: `words` (upper-cased bigram) and
# `freq` (its count), ordered from most to least frequent.
bigrama <- d %>%
  unnest_tokens(bigram, V1, token = "ngrams", n = 2) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  # Neither half of the pair may be a stopword.
  filter(!(word1 %in% stopwords_pt$word | word2 %in% stopwords_pt$word)) %>%
  # Drop words containing numeric digits.
  filter(
    str_detect(word1, "[[:digit:]]", negate = TRUE),
    str_detect(word2, "[[:digit:]]", negate = TRUE)
  ) %>%
  # Drop words that still carry punctuation.
  filter(
    str_detect(word1, "[[:punct:]]", negate = TRUE),
    str_detect(word2, "[[:punct:]]", negate = TRUE)
  ) %>%
  # Drop words in which one character is repeated three or more times in a row.
  filter(
    str_detect(word1, "(.)\\1{2,}", negate = TRUE),
    str_detect(word2, "(.)\\1{2,}", negate = TRUE)
  ) %>%
  # Drop single-character words.
  filter(
    str_detect(word1, "\\b(.)\\b", negate = TRUE),
    str_detect(word2, "\\b(.)\\b", negate = TRUE)
  ) %>%
  # Re-join the two words and count each distinct pair.
  count(bigram = paste(word1, word2)) %>%
  filter(n >= 2) %>%
  # Optional steps, currently disabled:
  #slice_max(n, n = 100) %>%
  #filter(!bigram %in% bigram_stop) %>%
  transmute(words = str_to_upper(bigram), freq = n) %>%
  arrange(desc(freq))
# GENERATE THE UNIGRAMS ----
# Tokenise `d$V1` into single words, discard stopwords and noisy tokens, and
# keep the words that occur at least four times. The result has two columns:
# `words` (upper-cased word) and `freq` (its count), ordered from most to
# least frequent.
unigrama <- d %>%
  unnest_tokens(unigram, V1, token = "ngrams", n = 1) %>%
  filter(
    !unigram %in% stopwords_pt$word, # remove stopwords
    !str_detect(unigram, pattern = "[[:digit:]]"), # removes any words with numeric digits
    !str_detect(unigram, pattern = "[[:punct:]]"), # removes any remaining punctuation
    !str_detect(unigram, pattern = "(.)\\1{2,}"), # removes words with a character repeated 3+ times in a row
    !str_detect(unigram, pattern = "\\b(.)\\b") # removes any remaining single letter words
  ) %>%
  count(unigram) %>%
  filter(n >= 4) %>%
  # Optional step, currently disabled:
  #slice_max(n, n = 100) %>%
  # No second stopword pass is needed here: stopwords are already removed in
  # the filter above. (A former extra filter compared against the whole
  # `stopwords_pt` data frame instead of its `word` column.)
  mutate(unigram = str_to_upper(unigram)) %>%
  arrange(desc(n)) %>%
  rename(freq = n, words = unigram)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment