# sungiven/wordcloud.r

## wordcloud.r
library(readxl)
library(dplyr)
library(lubridate)
library(ggplot2)
library(sentimentr)
library(cowplot)
library(RColorBrewer)
library(wordcloud)
library(wordcloud2)
library(tidytext)
library(webshot)

# webshot() drives the PhantomJS headless browser. Only run the installer
# when PhantomJS is not already available, so that sourcing this script does
# not attempt an installation on every run.
if (!webshot::is_phantomjs_installed()) {
  webshot::install_phantomjs()
}

#' Render two word-cloud images from a spreadsheet of tweets
#'
#' Reads `./<file_name>.xlsx`, cleans its `tweetText` column, counts the
#' words that are not in the tidytext stop-word list and writes
#' `./<file_name>_wordcloud.png` (static plot drawn by `wordcloud()`) and
#' `./<file_name>_wordcloud2.png` (screenshot of a `wordcloud2()` widget).
#'
#' @param file_name Base name (without extension) of the workbook in the
#'   working directory; also the prefix of both output files.
#' @param tmp_html Path of the intermediate HTML page that is rendered and
#'   then screenshotted. Its directory is created when it does not exist.
#' @return The value of the final `webshot()` call.
get_wordclouds <- function(file_name,
                           tmp_html = "./github/data-analysis/outputs/tmp.html") {
  input_file <- paste0("./", file_name, ".xlsx")
  output_file <- paste0("./", file_name, "_wordcloud.png")
  output_file2 <- paste0("./", file_name, "_wordcloud2.png")

  MIN_FREQ <- 1
  WC_SEED <- 1234
  #FILTER_WORD <- 'word to filter'

  # Fail early with a readable message instead of a reader/tokeniser error.
  if (!file.exists(input_file)) {
    stop("Input workbook not found: ", input_file, call. = FALSE)
  }

  # Read XLSX file
  tweets <- read_excel(input_file)
  if (!"tweetText" %in% names(tweets)) {
    stop("Column 'tweetText' is missing from ", input_file, call. = FALSE)
  }

  # Text cleaning, applied exactly once before tokenising.
  clean_text <- function(x) {
    # Links: covers http as well as https.
    x <- gsub("https?\\S*", "", x)
    # @mentions
    x <- gsub("@\\S*", "", x)
    # Drop the HTML entity itself; removing the bare letters "amp" would
    # also eat them out of ordinary words such as "example" or "campaign".
    x <- gsub("&amp;", " ", x, fixed = TRUE)
    # Line breaks become a space so the words on either side stay separate.
    x <- gsub("[\r\n]+", " ", x)
    x <- gsub("[[:punct:]]", "", x)
    #x <- gsub(FILTER_WORD, "", x, ignore.case = TRUE)
    x
  }

  # Data frame with each word in col 1 (`word`) and its frequency in col 2 (`n`)
  words <- tweets |>
    filter(!is.na(tweetText)) |>
    mutate(tweetText = clean_text(tweetText)) |>
    unnest_tokens(word, tweetText) |>
    anti_join(tidytext::stop_words, by = "word") |>
    count(word, sort = TRUE) |>
    filter(n >= MIN_FREQ)

  if (nrow(words) == 0) {
    stop("No words left to plot after cleaning ", input_file, call. = FALSE)
  }

  # generate word-cloud
  set.seed(WC_SEED)

  png(output_file, width = 500, height = 500)
  # `finally` closes the PNG device even when plotting fails, so a failed run
  # does not leave a graphics device open.
  tryCatch(
    wordcloud(
      words = words$word,
      freq = words$n,
      # wordcloud() otherwise falls back to its own default threshold, which
      # would override the MIN_FREQ filter applied above.
      min.freq = MIN_FREQ,
      max.words = 500,
      random.order = FALSE,
      rot.per = 0.35,
      colors = brewer.pal(8, "Dark2"),
      scale = c(4, 0.4)
    ),
    finally = dev.off()
  )

  # Gives a proposed palette
  wc_graph <- wordcloud2(words, size = 4.6, color = "random-dark")

  # A vector of colors. The vector must have the same length as the input data
  #wordcloud2(words, size=1.6, color=rep_len( c("green","blue"), nrow(demoFreq) ) )

  # Change the background color
  #wc_graph <- wordcloud2(words, size=1.6, color='random-light', backgroundColor="black")

  # Word orientation
  #wc_graph <- wordcloud2(words, size = 2.3, minRotation = -pi/6, maxRotation = -pi/6, rotateRatio = 1)

  # Letter or text as shape
  #wc_graph <- letterCloud( words, word = "W", color='random-light' , backgroundColor="black")
  #letterCloud( words, word = "PEACE", color="white", backgroundColor="pink")

  # NOTE(review): OPENSSL_CONF is presumably overridden so PhantomJS ignores
  # the system OpenSSL configuration - confirm. The previous value is put
  # back on exit so the override does not leak into the rest of the session.
  old_conf <- Sys.getenv("OPENSSL_CONF", unset = NA)
  Sys.setenv(OPENSSL_CONF = "/dev/null")
  on.exit(
    if (is.na(old_conf)) {
      Sys.unsetenv("OPENSSL_CONF")
    } else {
      Sys.setenv(OPENSSL_CONF = old_conf)
    },
    add = TRUE
  )

  # saveWidget() cannot write into a directory that does not exist yet.
  dir.create(dirname(tmp_html), recursive = TRUE, showWarnings = FALSE)
  htmlwidgets::saveWidget(wc_graph, tmp_html, selfcontained = FALSE)

  # `delay` is the number of seconds webshot waits before taking the shot.
  webshot(tmp_html, output_file2, delay = 60, vwidth = 1600, vheight = 1200)
}

# Build both word-cloud images for the "iam_Uchenna" workbook.
get_wordclouds(file_name = "iam_Uchenna")
	library(readxl)
	library(dplyr)
	library(lubridate)
	library(ggplot2)
	library(sentimentr)
	library(cowplot)
	library(RColorBrewer)
	library(wordcloud)
	library(wordcloud2)
	library(tidytext)
	library(webshot)

	# webshot() drives the PhantomJS headless browser. Only run the installer
	# when PhantomJS is not already available, so that sourcing this script does
	# not attempt an installation on every run.
	if (!webshot::is_phantomjs_installed()) {
	  webshot::install_phantomjs()
	}

	#' Render two word-cloud images from a spreadsheet of tweets
	#'
	#' Reads `./<file_name>.xlsx`, cleans its `tweetText` column, counts the
	#' words that are not in the tidytext stop-word list and writes
	#' `./<file_name>_wordcloud.png` (static plot drawn by `wordcloud()`) and
	#' `./<file_name>_wordcloud2.png` (screenshot of a `wordcloud2()` widget).
	#'
	#' @param file_name Base name (without extension) of the workbook in the
	#'   working directory; also the prefix of both output files.
	#' @param tmp_html Path of the intermediate HTML page that is rendered and
	#'   then screenshotted. Its directory is created when it does not exist.
	#' @return The value of the final `webshot()` call.
	get_wordclouds <- function(file_name,
	                           tmp_html = "./github/data-analysis/outputs/tmp.html") {
	  input_file <- paste0("./", file_name, ".xlsx")
	  output_file <- paste0("./", file_name, "_wordcloud.png")
	  output_file2 <- paste0("./", file_name, "_wordcloud2.png")

	  MIN_FREQ <- 1
	  WC_SEED <- 1234
	  #FILTER_WORD <- 'word to filter'

	  # Fail early with a readable message instead of a reader/tokeniser error.
	  if (!file.exists(input_file)) {
	    stop("Input workbook not found: ", input_file, call. = FALSE)
	  }

	  # Read XLSX file
	  tweets <- read_excel(input_file)
	  if (!"tweetText" %in% names(tweets)) {
	    stop("Column 'tweetText' is missing from ", input_file, call. = FALSE)
	  }

	  # Text cleaning, applied exactly once before tokenising.
	  clean_text <- function(x) {
	    # Links: covers http as well as https.
	    x <- gsub("https?\\S*", "", x)
	    # @mentions
	    x <- gsub("@\\S*", "", x)
	    # Drop the HTML entity itself; removing the bare letters "amp" would
	    # also eat them out of ordinary words such as "example" or "campaign".
	    x <- gsub("&amp;", " ", x, fixed = TRUE)
	    # Line breaks become a space so the words on either side stay separate.
	    x <- gsub("[\r\n]+", " ", x)
	    x <- gsub("[[:punct:]]", "", x)
	    #x <- gsub(FILTER_WORD, "", x, ignore.case = TRUE)
	    x
	  }

	  # Data frame with each word in col 1 (`word`) and its frequency in col 2 (`n`).
	  # Uses the native pipe `|>`; the escaped form `\|>` is not valid R.
	  words <- tweets |>
	    filter(!is.na(tweetText)) |>
	    mutate(tweetText = clean_text(tweetText)) |>
	    unnest_tokens(word, tweetText) |>
	    anti_join(tidytext::stop_words, by = "word") |>
	    count(word, sort = TRUE) |>
	    filter(n >= MIN_FREQ)

	  if (nrow(words) == 0) {
	    stop("No words left to plot after cleaning ", input_file, call. = FALSE)
	  }

	  # generate word-cloud
	  set.seed(WC_SEED)

	  png(output_file, width = 500, height = 500)
	  # `finally` closes the PNG device even when plotting fails, so a failed run
	  # does not leave a graphics device open.
	  tryCatch(
	    wordcloud(
	      words = words$word,
	      freq = words$n,
	      # wordcloud() otherwise falls back to its own default threshold, which
	      # would override the MIN_FREQ filter applied above.
	      min.freq = MIN_FREQ,
	      max.words = 500,
	      random.order = FALSE,
	      rot.per = 0.35,
	      colors = brewer.pal(8, "Dark2"),
	      scale = c(4, 0.4)
	    ),
	    finally = dev.off()
	  )

	  # Gives a proposed palette
	  wc_graph <- wordcloud2(words, size = 4.6, color = "random-dark")

	  # A vector of colors. The vector must have the same length as the input data
	  #wordcloud2(words, size=1.6, color=rep_len( c("green","blue"), nrow(demoFreq) ) )

	  # Change the background color
	  #wc_graph <- wordcloud2(words, size=1.6, color='random-light', backgroundColor="black")

	  # Word orientation
	  #wc_graph <- wordcloud2(words, size = 2.3, minRotation = -pi/6, maxRotation = -pi/6, rotateRatio = 1)

	  # Letter or text as shape
	  #wc_graph <- letterCloud( words, word = "W", color='random-light' , backgroundColor="black")
	  #letterCloud( words, word = "PEACE", color="white", backgroundColor="pink")

	  # NOTE(review): OPENSSL_CONF is presumably overridden so PhantomJS ignores
	  # the system OpenSSL configuration - confirm. The previous value is put
	  # back on exit so the override does not leak into the rest of the session.
	  old_conf <- Sys.getenv("OPENSSL_CONF", unset = NA)
	  Sys.setenv(OPENSSL_CONF = "/dev/null")
	  on.exit(
	    if (is.na(old_conf)) {
	      Sys.unsetenv("OPENSSL_CONF")
	    } else {
	      Sys.setenv(OPENSSL_CONF = old_conf)
	    },
	    add = TRUE
	  )

	  # saveWidget() cannot write into a directory that does not exist yet.
	  dir.create(dirname(tmp_html), recursive = TRUE, showWarnings = FALSE)
	  htmlwidgets::saveWidget(wc_graph, tmp_html, selfcontained = FALSE)

	  # `delay` is the number of seconds webshot waits before taking the shot.
	  webshot(tmp_html, output_file2, delay = 60, vwidth = 1600, vheight = 1200)
	}

	# Build both word-cloud images for the "iam_Uchenna" workbook.
	get_wordclouds(file_name = "iam_Uchenna")