# sergiospagnuolo/analise_termos.R Secret

## analise_termos.R
suppressMessages(library(tidyverse))
suppressMessages(library(tidytext))
suppressMessages(library(wordcloud2))
suppressMessages(library(lubridate))
library(ngram)

# Load the input text file. "&" is passed as the field separator and no header
# row is read, so the text ends up in the auto-named column `V1`, which the
# tokenisation steps below consume.
# NOTE(review): a literal "&" inside the text would split that line across
# several columns and only `V1` is tokenised -- confirm the input has none.
d <- read.csv(
  "texto_pl_2630.txt",
  header = FALSE,           # spelled out: `F` is an ordinary, reassignable variable
  sep = "&",
  stringsAsFactors = FALSE  # keep V1 as character on R < 4.0 as well
)

######################################################################################################################################################################################### STOPWORDS

# Stopword table downloaded from a published Google Sheets CSV export.
# The token filters below read its `word` column.
stopwords_pt <- read.csv(
  "https://docs.google.com/spreadsheets/d/e/2PACX-1vRutQtQqbFVYYP8uwytSyewxtxn19smtWWxsoNai9G6uEg6ytF7Z4IVhYZ5rXx4bgN-IYkSnsF8bSAe/pub?gid=1009958428&single=true&output=csv",
  header = TRUE  # spelled out: `T` is an ordinary, reassignable variable
)

# Vector of bigrams to exclude: the `bigram` column of another sheet (different
# `gid`) of the same published spreadsheet.
# NOTE(review): in the visible code this is only referenced by a commented-out
# filter in the bigram pipeline.
bigram_stop <- read.csv(
  "https://docs.google.com/spreadsheets/d/e/2PACX-1vRutQtQqbFVYYP8uwytSyewxtxn19smtWWxsoNai9G6uEg6ytF7Z4IVhYZ5rXx4bgN-IYkSnsF8bSAe/pub?gid=58185156&single=true&output=csv",
  header = TRUE  # spelled out: `T` is an ordinary, reassignable variable
)$bigram

#################################
############ GERA OS BIGRAMAS
#################################
# Bigram frequency table built from `d$V1`.
# A pair is kept only when neither of its words is a stopword, contains a
# digit or punctuation, has a character repeated three or more times in a
# row, or is a single character. The result has one row per bigram that
# occurs at least twice, upper-cased, most frequent first, in the columns
# `words` and `freq`.
bigrama <- d %>%
  unnest_tokens(output = bigram, input = V1, token = "ngrams", n = 2) %>%
  separate(col = bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(
    # first word of the pair
    !(word1 %in% stopwords_pt$word),
    !str_detect(word1, pattern = "[[:digit:]]"),
    !str_detect(word1, pattern = "[[:punct:]]"),
    !str_detect(word1, pattern = "(.)\\1{2,}"),  # same character 3+ times in a row
    !str_detect(word1, pattern = "\\b(.)\\b"),   # lone single character
    # second word of the pair, same rules
    !(word2 %in% stopwords_pt$word),
    !str_detect(word2, pattern = "[[:digit:]]"),
    !str_detect(word2, pattern = "[[:punct:]]"),
    !str_detect(word2, pattern = "(.)\\1{2,}"),
    !str_detect(word2, pattern = "\\b(.)\\b")
  ) %>%
  unite(col = "bigram", word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE) %>%
  filter(n >= 2) %>%
  # Disabled steps, kept for reference:
  # slice_max(n, n = 100) %>%
  # filter(!bigram %in% bigram_stop) %>%
  mutate(bigram = str_to_upper(bigram)) %>%
  select(words = bigram, freq = n)


# Single-word frequency table built from `d$V1`.
# A word is kept only when it is not a stopword, contains no digit or
# punctuation, has no character repeated three or more times in a row, and is
# not a single character. The result has one row per word that occurs at
# least four times, upper-cased, most frequent first, in the columns `words`
# and `freq`.
unigrama <- d %>%
  unnest_tokens(unigram, V1, token = "ngrams", n = 1) %>%
  filter(
    # Stopwords are dropped here, against the `word` column. Matching against
    # the whole `stopwords_pt` data frame (as a later step used to do) compares
    # each word with the table's columns, not its entries, so it never filters.
    !unigram %in% stopwords_pt$word,
    !str_detect(unigram, pattern = "[[:digit:]]"), # words containing a digit
    !str_detect(unigram, pattern = "[[:punct:]]"), # words containing punctuation
    !str_detect(unigram, pattern = "(.)\\1{2,}"),  # same character 3+ times in a row
    !str_detect(unigram, pattern = "\\b(.)\\b")    # lone single character
  ) %>%
  count(unigram) %>%
  filter(n >= 4) %>%
  # Disabled step, kept for reference:
  # slice_max(n, n = 100) %>%
  mutate(unigram = str_to_upper(unigram)) %>%
  arrange(desc(n)) %>%
  rename(freq = n, words = unigram)
	suppressMessages(library(tidyverse))
	suppressMessages(library(tidytext))
	suppressMessages(library(wordcloud2))
	suppressMessages(library(lubridate))
	library(ngram)

	# NOTE(review): the tab-indented statements in this part of the file repeat
	# the earlier top-level statements (same inputs, same object names) --
	# confirm whether the duplicate is intentional or a paste artifact.
	# Load the input text file. "&" is passed as the field separator and no
	# header row is read, so the text ends up in the auto-named column `V1`.
	d <- read.csv(
	  "texto_pl_2630.txt",
	  header = FALSE,           # spelled out: `F` is an ordinary, reassignable variable
	  sep = "&",
	  stringsAsFactors = FALSE  # keep V1 as character on R < 4.0 as well
	)

	######################################################################################################################################################################################### STOPWORDS

	# Stopword table downloaded from a published Google Sheets CSV export.
	# The token filters below read its `word` column.
	stopwords_pt <- read.csv(
	  "https://docs.google.com/spreadsheets/d/e/2PACX-1vRutQtQqbFVYYP8uwytSyewxtxn19smtWWxsoNai9G6uEg6ytF7Z4IVhYZ5rXx4bgN-IYkSnsF8bSAe/pub?gid=1009958428&single=true&output=csv",
	  header = TRUE  # spelled out: `T` is an ordinary, reassignable variable
	)

	# Vector of bigrams to exclude: the `bigram` column of another sheet
	# (different `gid`) of the same published spreadsheet.
	# NOTE(review): in the visible code this is only referenced by a
	# commented-out filter in the bigram pipeline.
	bigram_stop <- read.csv(
	  "https://docs.google.com/spreadsheets/d/e/2PACX-1vRutQtQqbFVYYP8uwytSyewxtxn19smtWWxsoNai9G6uEg6ytF7Z4IVhYZ5rXx4bgN-IYkSnsF8bSAe/pub?gid=58185156&single=true&output=csv",
	  header = TRUE  # spelled out: `T` is an ordinary, reassignable variable
	)$bigram

	#################################
	############ GERA OS BIGRAMAS
	#################################
	# Bigram frequency table built from `d$V1`.
	# A pair is kept only when neither of its words is a stopword, contains a
	# digit or punctuation, has a character repeated three or more times in a
	# row, or is a single character. The result has one row per bigram that
	# occurs at least twice, upper-cased, most frequent first, in the columns
	# `words` and `freq`.
	bigrama <- d %>%
	  unnest_tokens(output = bigram, input = V1, token = "ngrams", n = 2) %>%
	  separate(col = bigram, into = c("word1", "word2"), sep = " ") %>%
	  filter(
	    # first word of the pair
	    !(word1 %in% stopwords_pt$word),
	    !str_detect(word1, pattern = "[[:digit:]]"),
	    !str_detect(word1, pattern = "[[:punct:]]"),
	    !str_detect(word1, pattern = "(.)\\1{2,}"),  # same character 3+ times in a row
	    !str_detect(word1, pattern = "\\b(.)\\b"),   # lone single character
	    # second word of the pair, same rules
	    !(word2 %in% stopwords_pt$word),
	    !str_detect(word2, pattern = "[[:digit:]]"),
	    !str_detect(word2, pattern = "[[:punct:]]"),
	    !str_detect(word2, pattern = "(.)\\1{2,}"),
	    !str_detect(word2, pattern = "\\b(.)\\b")
	  ) %>%
	  unite(col = "bigram", word1, word2, sep = " ") %>%
	  count(bigram, sort = TRUE) %>%
	  filter(n >= 2) %>%
	  # Disabled steps, kept for reference:
	  # slice_max(n, n = 100) %>%
	  # filter(!bigram %in% bigram_stop) %>%
	  mutate(bigram = str_to_upper(bigram)) %>%
	  select(words = bigram, freq = n)


	# Single-word frequency table built from `d$V1`.
	# A word is kept only when it is not a stopword, contains no digit or
	# punctuation, has no character repeated three or more times in a row, and
	# is not a single character. The result has one row per word that occurs at
	# least four times, upper-cased, most frequent first, in the columns
	# `words` and `freq`.
	unigrama <- d %>%
	  unnest_tokens(unigram, V1, token = "ngrams", n = 1) %>%
	  filter(
	    # Stopwords are dropped here, against the `word` column. Matching
	    # against the whole `stopwords_pt` data frame (as a later step used to
	    # do) compares each word with the table's columns, not its entries, so
	    # it never filters.
	    !unigram %in% stopwords_pt$word,
	    !str_detect(unigram, pattern = "[[:digit:]]"), # words containing a digit
	    !str_detect(unigram, pattern = "[[:punct:]]"), # words containing punctuation
	    !str_detect(unigram, pattern = "(.)\\1{2,}"),  # same character 3+ times in a row
	    !str_detect(unigram, pattern = "\\b(.)\\b")    # lone single character
	  ) %>%
	  count(unigram) %>%
	  filter(n >= 4) %>%
	  # Disabled step, kept for reference:
	  # slice_max(n, n = 100) %>%
	  mutate(unigram = str_to_upper(unigram)) %>%
	  arrange(desc(n)) %>%
	  rename(freq = n, words = unigram)