Skip to content

Instantly share code, notes, and snippets.

@sergiospagnuolo
Last active November 23, 2022 21:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sergiospagnuolo/a706ecc55f9cc519066f574700c51e8b to your computer and use it in GitHub Desktop.
Save sergiospagnuolo/a706ecc55f9cc519066f574700c51e8b to your computer and use it in GitHub Desktop.
suppressMessages(library(odbc))
suppressMessages(library(DBI))
suppressMessages(library(RPostgreSQL))
suppressMessages(library(tidyverse))
suppressMessages(library(tidytext))
suppressMessages(library(wordcloud2))
suppressMessages(library(lubridate))
library(config)
library(ngram)
print(paste("Batch Wordcloud iniciado - ", Sys.time() - hours(3)))
db <- dbConnect(RPostgres::Postgres(),
dbname = "postgres",
host = "",
port = 5432,
user = "",
password = "")
d <- dbGetQuery(db, 'SELECT "id", "title", "date", "authorityId"
FROM "new_agenda-transparente"."scraped_schedule"
WHERE "date" > NOW()::date - INTERVAL \'720 days\'
ORDER BY "date" DESC')
######################################################################################################################################################################################### STOPWORDS
stopwords_pt <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vRutQtQqbFVYYP8uwytSyewxtxn19smtWWxsoNai9G6uEg6ytF7Z4IVhYZ5rXx4bgN-IYkSnsF8bSAe/pub?gid=1009958428&single=true&output=csv", header = T)
bigram_stop <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vRutQtQqbFVYYP8uwytSyewxtxn19smtWWxsoNai9G6uEg6ytF7Z4IVhYZ5rXx4bgN-IYkSnsF8bSAe/pub?gid=58185156&single=true&output=csv", header = T) %>% .$bigram
#################################
############ GERA OS BIGRAMAS
#################################
# bigrama <- function(d) {
# c <- d %>%
# unnest_tokens(bigram, title, token = "ngrams", n = 2) %>%
# separate(bigram, c("word1", "word2"), sep = " ") %>%
# filter(
# !word1 %in% stopwords_pt$word, # remove stopwords from both words in bi-gram
# !word2 %in% stopwords_pt$word,
# !str_detect(word1, pattern = "[[:digit:]]"), # removes any words with numeric digits
# !str_detect(word2, pattern = "[[:digit:]]"),
# !str_detect(word1, pattern = "[[:punct:]]"), # removes any remaining punctuations
# !str_detect(word2, pattern = "[[:punct:]]"),
# !str_detect(word1, pattern = "(.)\\1{2,}"), # removes any words with 3 or more repeated letters
# !str_detect(word2, pattern = "(.)\\1{2,}"),
# !str_detect(word1, pattern = "\\b(.)\\b"), # removes any remaining single letter words
# !str_detect(word2, pattern = "\\b(.)\\b")
# ) %>%
# unite("bigram", c(word1, word2), sep = " ") %>%
# count(bigram, authorityId) %>%
# filter(n >= 2) %>%
# #slice_max(n, n = 100) %>%
# #filter(!bigram %in% bigram_stop) %>%
# mutate(bigram = str_to_upper(bigram)) %>%
# arrange(desc(n)) %>%
# rename(freq = n, words = bigram)
# }
unigrama <- function(d) {
d %>%
unnest_tokens(unigram, title, token = "ngrams", n = 1) %>%
filter(
!unigram %in% stopwords_pt$word, # remove stopwords from both words in bi-gram
!str_detect(unigram, pattern = "[[:digit:]]"), # removes any words with numeric digits
!str_detect(unigram, pattern = "[[:punct:]]"), # removes any remaining punctuations
!str_detect(unigram, pattern = "(.)\\1{2,}"), # removes any words with 3 or more repeated letters
!str_detect(unigram, pattern = "\\b(.)\\b"), # removes any remaining single letter words
) %>%
count(unigram, authorityId) %>%
filter(n >= 6) %>%
#slice_max(n, n = 100) %>%
filter(!unigram %in% stopwords_pt) %>%
mutate(unigram = str_to_upper(unigram)) %>%
arrange(desc(n)) %>%
rename(freq = n, words = unigram)
}
#################################
############ GERA O DATA FRAME
#################################
a <- d %>%
group_split(authorityId) %>%
purrr::map_df(unigrama)
#purrr::map_df(bigrama)
# a <- a %>% filter(freq > 15)
# set.seed(seed = 14412)
# a$id <- as.vector(sample(1000000000, size = nrow(a), replace = TRUE))
# a <- a %>% select(id, words, authorityId, freq)
#####################################
############ GRAVA NO BANCO DE DADOS
#####################################
# DELETE TEMPORARY TABLE (before writing it)
dbExecute(db, 'DROP TABLE IF EXISTS wordcloud_agendas_temp CASCADE')
# WRITE TABLE WITH TEMPORARY DATA FRESH FROM THE DB
dbWriteTable(db, 'wordcloud_agendas_temp', a, row.names=F)
# INSERE NOVOS DADOS NA TABELA PRINCIPAL
dbExecute(db, '
DELETE FROM public.wordcloud_agendas;
')
dbExecute(db, '
INSERT INTO public.wordcloud_agendas (words,"authorityId",freq)
SELECT "words", "authorityId", "freq"::integer
FROM public.wordcloud_agendas_temp;
')
# DELETE TEMPORARY TABLE (after writing it)
dbExecute(db, 'DROP TABLE IF EXISTS wordcloud_agendas_temp CASCADE')
#################################
############ FECHA DB
#################################
dbDisconnect(db)
rm(list = ls())
print(paste("Batch Wordcloud encerrado - ", Sys.time() - hours(3)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment