Francisco Lima (monogenea)

#!/Library/Frameworks/R.framework/Resources/Rscript
# Mon Apr 15 18:41:47 2019 ------------------------------
library(rtweet)
# Twitter API
create_token(app = "INSERT_HERE",
             consumer_key = "INSERT_HERE",
             consumer_secret = "INSERT_HERE",
             access_token = "INSERT_HERE",
             access_secret = "INSERT_HERE")
# Google Maps API https://developers.google.com/maps/documentation/javascript/get-api-key
apiKey <- "INSERT_HERE"
# Read GOT tweets from US
newTweets <- search_tweets(q = "game of thrones",
                           retryonratelimit = TRUE, lang = "en",
                           geocode = lookup_coords("usa", apikey = apiKey),
                           include_rts = FALSE, n = 1e5) # 1st day 3e5, to go back ~1 week
# Specify dir
dirPath <- "~/Documents/INSERT_PATH"
# Create dir for storage, if it does not exist yet
if (!dir.exists(dirPath)) dir.create(dirPath)
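# Save this batch (assumption: each crawl is stored as a timestamped .rds file
# inside dirPath, to be merged and de-duplicated by status_id later on)
saveRDS(newTweets, file.path(dirPath,
                             paste0("tweets_", format(Sys.time(), "%Y%m%d%H%M%S"), ".rds")))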
# Wed May 8 21:22:45 2019 ------------------------------
# Use status_id to identify and exclude duplicates
library(rtweet)
# List all files
allFiles <- paste0("tweets/", list.files("tweets/"))
# Write function to merge tweets
mergeTweets <- function(recipient, donor){
  idx <- !donor$status_id %in% recipient$status_id
  rbind(recipient, donor[idx, ]) # append only the new tweets (assumed completion)
}
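# Usage sketch (assumption: every file under tweets/ is one of the .rds batches
# saved by the crawler), merging all batches into a single de-duplicated set
allTweets <- Reduce(mergeTweets, lapply(allFiles, readRDS))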
# Load libraries
library(tidyverse)
library(magrittr) # provides the %<>% assignment pipe used below
library(reshape2)
library(ggplot2)
library(ggridges)
library(lubridate)
library(rtweet)
library(maps)
library(quanteda)
# Convert UTC to EDT
allTweets %<>% dplyr::mutate(created_at = as_datetime(created_at, tz = "UTC")) %>%
  dplyr::mutate(created_at = with_tz(created_at, tzone = "America/New_York"))
# Produce lat and lng coordinates
allTweets <- lat_lng(allTweets)
# Plot
par(mar = rep(12, 4))
map("state", lwd = .25)
# Plot lat and lng points onto the state map (point styling below is illustrative)
points(allTweets$lng, allTweets$lat, pch = 16, cex = .25, col = rgb(0, 0, 1, .25))
# Tokenize words (note: remove_twitter and remove_hyphens are quanteda 1.x arguments)
tkn <- tokens(allTweets$text,
              remove_twitter = TRUE,
              remove_separators = TRUE,
              remove_symbols = TRUE,
              remove_punct = TRUE,
              remove_url = TRUE,
              remove_hyphens = TRUE,
              remove_numbers = TRUE) %>%
  tokens_ngrams(n = 1:2)
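# `gotChars` is not shown in this excerpt; an illustrative stand-in (assumption,
# replace with the actual character list; note the %in% match below is case-sensitive)
gotChars <- c("Daenerys", "Jon", "Tyrion", "Cersei", "Arya", "Sansa", "Bran")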
# Identify tweets containing any of the character names (0/1)
popularity <- as.data.frame(lapply(gotChars, function(x){
  as.integer(sapply(tkn, function(k){ any(k %in% x) }))
}))
# Write colnames
colnames(popularity) <- gotChars
# Add column with the corresponding Eastern (America/New_York) time
popularity$created_at <- allTweets$created_at
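# One possible view of these data with the loaded reshape2/ggridges packages:
# melt the 0/1 matrix and draw per-character ridgelines over time
# (a sketch only; variable names follow the objects built above)
popLong <- reshape2::melt(popularity, id.vars = "created_at",
                          variable.name = "character", value.name = "mention")
ggplot(dplyr::filter(popLong, mention == 1), aes(x = created_at, y = character)) +
  geom_density_ridges() +
  theme_minimal()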
# Sat Oct 5 10:06:01 2019 ------------------------------
# Bonus - remove bots, time-dependent wordclouds & sentiment analysis
rtStats <- do.call("rbind", by(allTweets, INDICES = allTweets$screen_name, function(x){
  return(data.frame(num_tweets = nrow(x),
                    mean_followers = mean(x$followers_count),
                    median_rt = median(x$retweet_count)))
}))
# Plot log10(num_tweets) vs. log10(median_rt)
with(log10(rtStats + 1), plot(num_tweets, median_rt,
                              xlab = "log10(no. tweets + 1)",
                              ylab = "log10(median retweets + 1)")) # labels are illustrative
# Wordcloud
# Remove potential bots w/ > 100 tweets in the dataset
bots <- rownames(rtStats)[which(rtStats$num_tweets > 100)]
reducedTweet <- allTweets[!allTweets$screen_name %in% bots,]
reducedTweet$text <- texts(reducedTweet$text) %>%
  iconv(from = "UTF-8", to = "ASCII", sub = "") %>%
  gsub(pattern = "<[A-Z+0-9]+>", replacement = " ")
# Tokenize words
tkn <- tokens(reducedTweet$text, # options below assumed to mirror the earlier tokens() call
              remove_twitter = TRUE,
              remove_separators = TRUE,
              remove_symbols = TRUE,
              remove_punct = TRUE,
              remove_url = TRUE,
              remove_hyphens = TRUE,
              remove_numbers = TRUE) %>%
  tokens_ngrams(n = 1:2)
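# Wordcloud sketch (assumptions: build a document-feature matrix from the bot-free
# tokens, drop English stopwords, cap the display at 200 words; in quanteda >= 3,
# textplot_wordcloud() lives in the quanteda.textplots package)
dfmat <- dfm(tkn) %>%
  dfm_remove(stopwords("english"))
textplot_wordcloud(dfmat, max_words = 200)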