# monogenea/10-poissonGoT.R

## 10-poissonGoT.R
# Wordcloud
# Builds one word cloud per episode from the tweets posted in the window
# following each air time. Expects `rtStats` and `allTweets` to exist already,
# and the pipe plus the text-mining / date helpers used below to be attached
# (presumably quanteda and lubridate -- confirm against the script's setup).

# Remove potential bots w/ > 100 tweets in the dataset
bots <- rownames(rtStats)[which(rtStats$num_tweets > 100)]
reducedTweet <- allTweets[!allTweets$screen_name %in% bots, ]

# Drop every non-ASCII character, then blank out leftover angle-bracket tags
# made of uppercase letters, digits and "+" (e.g. "<U+1F600>")
reducedTweet$text <- texts(reducedTweet$text) %>%
  iconv(from = "UTF-8", to = "ASCII", sub = "") %>%
  gsub(pattern = "<[A-Z+0-9]+>", replacement = " ")

# Tokenize words
tkn <- tokens(reducedTweet$text,
              remove_twitter = TRUE,
              remove_separators = TRUE,
              remove_symbols = TRUE,
              remove_punct = TRUE,
              remove_url = TRUE,
              remove_hyphens = TRUE,
              remove_numbers = TRUE)

# Remove stopwords and stem words
gotDfm <- dfm(tkn,
              tolower = TRUE,
              remove = stopwords("en"),
              stem = TRUE)

# Remove irrelevant terms incl. single-character words
# (entries are stemmed forms, since they are matched against stemmed features)
badWords <- c("game", "throne", "gameofthron", "got",
              "watch", "episod", "season", "show",
              "just", "like")
keepTerm <- nchar(colnames(gotDfm)) > 1 & !colnames(gotDfm) %in% badWords
gotDfm <- gotDfm[, keepTerm]

# Air times of the six episodes, one week apart.
# NOTE(review): tz = "EST" is a fixed offset; confirm it matches the intended
# local air time for April-May dates.
epAirTime <- ymd_hms("2019-04-14 21:00:00", tz = "EST") + dweeks(0:5)

# For each episode keep the tweets created between 2 hours and 4 days after
# air time. The rows of gotDfm come from reducedTweet$text, so the row mask
# has to be computed on reducedTweet.
wcLists <- lapply(seq_along(epAirTime), function(x) {
  idx <- reducedTweet$created_at > epAirTime[x] + dhours(2) &
    reducedTweet$created_at < epAirTime[x] + ddays(4)
  gotDfm[idx, ]
})

# Draw the clouds without margins; reseed before each plot so that every
# cloud's layout is reproducible
par(mar = rep(0, 4))
for (i in seq_along(wcLists)) {
  set.seed(100)
  textplot_wordcloud(wcLists[[i]],
                     max_words = 100)
}
	# Wordcloud
	# NOTE(review): this section repeats the word cloud pipeline that already
	# appears earlier in this file; confirm whether running it twice is intended.
	# Builds one word cloud per episode from the tweets posted in the window
	# following each air time. Expects `rtStats` and `allTweets` to exist already.

	# Remove potential bots w/ > 100 tweets in the dataset
	bots <- rownames(rtStats)[which(rtStats$num_tweets > 100)]
	reducedTweet <- allTweets[!allTweets$screen_name %in% bots, ]

	# Drop every non-ASCII character, then blank out leftover angle-bracket tags
	# made of uppercase letters, digits and "+" (e.g. "<U+1F600>")
	reducedTweet$text <- texts(reducedTweet$text) %>%
	  iconv(from = "UTF-8", to = "ASCII", sub = "") %>%
	  gsub(pattern = "<[A-Z+0-9]+>", replacement = " ")

	# Tokenize words
	tkn <- tokens(reducedTweet$text,
	              remove_twitter = TRUE,
	              remove_separators = TRUE,
	              remove_symbols = TRUE,
	              remove_punct = TRUE,
	              remove_url = TRUE,
	              remove_hyphens = TRUE,
	              remove_numbers = TRUE)

	# Remove stopwords and stem words
	gotDfm <- dfm(tkn,
	              tolower = TRUE,
	              remove = stopwords("en"),
	              stem = TRUE)

	# Remove irrelevant terms incl. single-character words
	# (entries are stemmed forms, since they are matched against stemmed features)
	badWords <- c("game", "throne", "gameofthron", "got",
	              "watch", "episod", "season", "show",
	              "just", "like")
	keepTerm <- nchar(colnames(gotDfm)) > 1 & !colnames(gotDfm) %in% badWords
	gotDfm <- gotDfm[, keepTerm]

	# Air times of the six episodes, one week apart.
	# NOTE(review): tz = "EST" is a fixed offset; confirm it matches the intended
	# local air time for April-May dates.
	epAirTime <- ymd_hms("2019-04-14 21:00:00", tz = "EST") + dweeks(0:5)

	# For each episode keep the tweets created between 2 hours and 4 days after
	# air time. The rows of gotDfm come from reducedTweet$text, so the row mask
	# has to be computed on reducedTweet.
	wcLists <- lapply(seq_along(epAirTime), function(x) {
	  idx <- reducedTweet$created_at > epAirTime[x] + dhours(2) &
	    reducedTweet$created_at < epAirTime[x] + ddays(4)
	  gotDfm[idx, ]
	})

	# Draw the clouds without margins; reseed before each plot so that every
	# cloud's layout is reproducible
	par(mar = rep(0, 4))
	for (i in seq_along(wcLists)) {
	  set.seed(100)
	  textplot_wordcloud(wcLists[[i]],
	                     max_words = 100)
	}