sergiospagnuolo/wordcloud_gen_agenda.R

## wordcloud_gen_agenda.R
suppressMessages(library(odbc))
suppressMessages(library(DBI))
suppressMessages(library(RPostgreSQL))
suppressMessages(library(tidyverse))
suppressMessages(library(tidytext))
suppressMessages(library(wordcloud2))
suppressMessages(library(lubridate))
library(config)
library(ngram)

print(paste("Batch Wordcloud iniciado - ", Sys.time() - hours(3)))

db <- dbConnect(RPostgres::Postgres(),
                dbname = "postgres",
                host = "",
                port = 5432,
                user = "",
                password = "")

d <- dbGetQuery(db, 'SELECT "id", "title", "date", "authorityId"
                     FROM "new_agenda-transparente"."scraped_schedule"
                     WHERE "date" > NOW()::date - INTERVAL \'720 days\'
                     ORDER BY "date" DESC')

######################################################################################################################################################################################### STOPWORDS

stopwords_pt <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vRutQtQqbFVYYP8uwytSyewxtxn19smtWWxsoNai9G6uEg6ytF7Z4IVhYZ5rXx4bgN-IYkSnsF8bSAe/pub?gid=1009958428&single=true&output=csv", header = T)

bigram_stop <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vRutQtQqbFVYYP8uwytSyewxtxn19smtWWxsoNai9G6uEg6ytF7Z4IVhYZ5rXx4bgN-IYkSnsF8bSAe/pub?gid=58185156&single=true&output=csv", header = T) %>% .$bigram

#################################
############ GERA OS BIGRAMAS
#################################
# bigrama <- function(d) {
#   c <- d %>%
#     unnest_tokens(bigram, title, token = "ngrams", n = 2) %>%
#     separate(bigram, c("word1", "word2"), sep = " ") %>%
#     filter(
#       !word1 %in% stopwords_pt$word,         # remove stopwords from both words in bi-gram
#       !word2 %in% stopwords_pt$word,
#       !str_detect(word1, pattern = "[[:digit:]]"), # removes any words with numeric digits
#       !str_detect(word2, pattern = "[[:digit:]]"),
#       !str_detect(word1, pattern = "[[:punct:]]"), # removes any remaining punctuations
#       !str_detect(word2, pattern = "[[:punct:]]"),
#       !str_detect(word1, pattern = "(.)\\1{2,}"),  # removes any words with 3 or more repeated letters
#       !str_detect(word2, pattern = "(.)\\1{2,}"),
#       !str_detect(word1, pattern = "\\b(.)\\b"),   # removes any remaining single letter words
#       !str_detect(word2, pattern = "\\b(.)\\b")
#     ) %>%
#     unite("bigram", c(word1, word2), sep = " ") %>%
#     count(bigram, authorityId) %>%
#     filter(n >= 2) %>%
#     #slice_max(n, n = 100) %>%
#     #filter(!bigram %in% bigram_stop) %>%
#     mutate(bigram = str_to_upper(bigram)) %>%
#     arrange(desc(n)) %>%
#     rename(freq = n, words = bigram)
# }


unigrama <- function(d) {
  d %>%
    unnest_tokens(unigram, title, token = "ngrams", n = 1) %>%
    filter(
      !unigram %in% stopwords_pt$word,         # remove stopwords from both words in bi-gram
      !str_detect(unigram, pattern = "[[:digit:]]"), # removes any words with numeric digits
      !str_detect(unigram, pattern = "[[:punct:]]"), # removes any remaining punctuations
      !str_detect(unigram, pattern = "(.)\\1{2,}"),  # removes any words with 3 or more repeated letters
      !str_detect(unigram, pattern = "\\b(.)\\b"),   # removes any remaining single letter words
    ) %>%
    count(unigram, authorityId) %>%
    filter(n >= 6) %>%
    #slice_max(n, n = 100) %>%
    filter(!unigram %in% stopwords_pt) %>%
    mutate(unigram = str_to_upper(unigram)) %>%
    arrange(desc(n)) %>%
    rename(freq = n, words = unigram)
}

#################################
############ GERA O DATA FRAME
#################################
a <- d %>%
  group_split(authorityId) %>%
  purrr::map_df(unigrama)
  #purrr::map_df(bigrama)

# a <- a %>% filter(freq > 15)
# set.seed(seed = 14412)
# a$id <- as.vector(sample(1000000000, size = nrow(a), replace = TRUE))
# a <- a %>% select(id, words, authorityId, freq)

#####################################
############ GRAVA NO BANCO DE DADOS
#####################################

# DELETE TEMPORARY TABLE (before writing it)
dbExecute(db, 'DROP TABLE IF EXISTS wordcloud_agendas_temp CASCADE')

# WRITE TABLE WITH TEMPORARY DATA FRESH FROM THE DB
dbWriteTable(db, 'wordcloud_agendas_temp', a, row.names=F)

# INSERE NOVOS DADOS NA TABELA PRINCIPAL
dbExecute(db, '
          DELETE FROM public.wordcloud_agendas;
          ')

dbExecute(db, '
         INSERT INTO public.wordcloud_agendas (words,"authorityId",freq)
 SELECT "words", "authorityId", "freq"::integer
            FROM public.wordcloud_agendas_temp;
          ')

# DELETE TEMPORARY TABLE (after writing it)
dbExecute(db, 'DROP TABLE IF EXISTS wordcloud_agendas_temp CASCADE')


#################################
############ FECHA DB
#################################

dbDisconnect(db)

rm(list = ls())

print(paste("Batch Wordcloud encerrado - ", Sys.time() - hours(3)))
	suppressMessages(library(odbc))
	suppressMessages(library(DBI))
	suppressMessages(library(RPostgreSQL))
	suppressMessages(library(tidyverse))
	suppressMessages(library(tidytext))
	suppressMessages(library(wordcloud2))
	suppressMessages(library(lubridate))
	library(config)
	library(ngram)

	print(paste("Batch Wordcloud iniciado - ", Sys.time() - hours(3)))

	db <- dbConnect(RPostgres::Postgres(),
	dbname = "postgres",
	host = "",
	port = 5432,
	user = "",
	password = "")

	d <- dbGetQuery(db, 'SELECT "id", "title", "date", "authorityId"
	FROM "new_agenda-transparente"."scraped_schedule"
	WHERE "date" > NOW()::date - INTERVAL \'720 days\'
	ORDER BY "date" DESC')

	######################################################################################################################################################################################### STOPWORDS

	stopwords_pt <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vRutQtQqbFVYYP8uwytSyewxtxn19smtWWxsoNai9G6uEg6ytF7Z4IVhYZ5rXx4bgN-IYkSnsF8bSAe/pub?gid=1009958428&single=true&output=csv", header = T)

	bigram_stop <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vRutQtQqbFVYYP8uwytSyewxtxn19smtWWxsoNai9G6uEg6ytF7Z4IVhYZ5rXx4bgN-IYkSnsF8bSAe/pub?gid=58185156&single=true&output=csv", header = T) %>% .$bigram

	#################################
	############ GERA OS BIGRAMAS
	#################################
	# bigrama <- function(d) {
	# c <- d %>%
	# unnest_tokens(bigram, title, token = "ngrams", n = 2) %>%
	# separate(bigram, c("word1", "word2"), sep = " ") %>%
	# filter(
	# !word1 %in% stopwords_pt$word, # remove stopwords from both words in bi-gram
	# !word2 %in% stopwords_pt$word,
	# !str_detect(word1, pattern = "[[:digit:]]"), # removes any words with numeric digits
	# !str_detect(word2, pattern = "[[:digit:]]"),
	# !str_detect(word1, pattern = "[[:punct:]]"), # removes any remaining punctuations
	# !str_detect(word2, pattern = "[[:punct:]]"),
	# !str_detect(word1, pattern = "(.)\\1{2,}"), # removes any words with 3 or more repeated letters
	# !str_detect(word2, pattern = "(.)\\1{2,}"),
	# !str_detect(word1, pattern = "\\b(.)\\b"), # removes any remaining single letter words
	# !str_detect(word2, pattern = "\\b(.)\\b")
	# ) %>%
	# unite("bigram", c(word1, word2), sep = " ") %>%
	# count(bigram, authorityId) %>%
	# filter(n >= 2) %>%
	# #slice_max(n, n = 100) %>%
	# #filter(!bigram %in% bigram_stop) %>%
	# mutate(bigram = str_to_upper(bigram)) %>%
	# arrange(desc(n)) %>%
	# rename(freq = n, words = bigram)
	# }


	unigrama <- function(d) {
	d %>%
	unnest_tokens(unigram, title, token = "ngrams", n = 1) %>%
	filter(
	!unigram %in% stopwords_pt$word, # remove stopwords from both words in bi-gram
	!str_detect(unigram, pattern = "[[:digit:]]"), # removes any words with numeric digits
	!str_detect(unigram, pattern = "[[:punct:]]"), # removes any remaining punctuations
	!str_detect(unigram, pattern = "(.)\\1{2,}"), # removes any words with 3 or more repeated letters
	!str_detect(unigram, pattern = "\\b(.)\\b"), # removes any remaining single letter words
	) %>%
	count(unigram, authorityId) %>%
	filter(n >= 6) %>%
	#slice_max(n, n = 100) %>%
	filter(!unigram %in% stopwords_pt) %>%
	mutate(unigram = str_to_upper(unigram)) %>%
	arrange(desc(n)) %>%
	rename(freq = n, words = unigram)
	}

	#################################
	############ GERA O DATA FRAME
	#################################
	a <- d %>%
	group_split(authorityId) %>%
	purrr::map_df(unigrama)
	#purrr::map_df(bigrama)

	# a <- a %>% filter(freq > 15)
	# set.seed(seed = 14412)
	# a$id <- as.vector(sample(1000000000, size = nrow(a), replace = TRUE))
	# a <- a %>% select(id, words, authorityId, freq)

	#####################################
	############ GRAVA NO BANCO DE DADOS
	#####################################

	# DELETE TEMPORARY TABLE (before writing it)
	dbExecute(db, 'DROP TABLE IF EXISTS wordcloud_agendas_temp CASCADE')

	# WRITE TABLE WITH TEMPORARY DATA FRESH FROM THE DB
	dbWriteTable(db, 'wordcloud_agendas_temp', a, row.names=F)

	# INSERE NOVOS DADOS NA TABELA PRINCIPAL
	dbExecute(db, '
	DELETE FROM public.wordcloud_agendas;
	')

	dbExecute(db, '
	INSERT INTO public.wordcloud_agendas (words,"authorityId",freq)
	SELECT "words", "authorityId", "freq"::integer
	FROM public.wordcloud_agendas_temp;
	')

	# DELETE TEMPORARY TABLE (after writing it)
	dbExecute(db, 'DROP TABLE IF EXISTS wordcloud_agendas_temp CASCADE')


	#################################
	############ FECHA DB
	#################################

	dbDisconnect(db)

	rm(list = ls())

	print(paste("Batch Wordcloud encerrado - ", Sys.time() - hours(3)))