Instantly share code, notes, and snippets.

Embed
What would you like to do?
Extract the full twitter history of all current members of the European Parliament with R (rtweet) #rstats
if (!require("pacman")) install.packages("pacman") # for taking care of package installation/loading
pacman::p_load("tidyverse")
pacman::p_load("ROAuth")
pacman::p_load("rtweet")
# this assumes authentication has been taken care of as explained here: http://rtweet.info/articles/auth.html
# it should still work, but without access tokens the rate limits for API requests are much stricter (it would take *a lot* longer to get all tweets)
# it is expected that this script will need to be run more than once
# interim files are automatically stored and recovered if the script is re-run
# make sure the output folders exist: the project folder first, then its
# "data" subfolder (warnings about already-existing folders are silenced)
invisible(lapply(
  list(file.path("TwitterMEP"), file.path("TwitterMEP", "data")),
  function(folder) dir.create(path = folder, showWarnings = FALSE)
))
## load the table of MEPs with their Twitter accounts
## (downloaded on the first run, then re-read from the local copy)
meps_path <- file.path("TwitterMEP", "data", "MEPsDF.rds")
if (file.exists(meps_path)) {
  # a previous run already stored the table (together with its progress flags)
  MEPsDF <- read_rds(meps_path)
} else {
  meps_url <- "https://raw.githubusercontent.com/eliflab/European-Parliament-Open-Data/master/meps_full_list_with_twitter_accounts.csv"
  MEPsDF <- read_csv(file = meps_url)
  # strip the "@" from the account names
  MEPsDF <- mutate(
    MEPsDF,
    SCREEN_NAME = stringr::str_replace_all(SCREEN_NAME, stringr::fixed("@"), "")
  )
  # control column: set to TRUE once the oldest tweet of an MEP has been
  # reached, so that later runs can skip that MEP
  MEPsDF[["OldestReached"]] <- FALSE
  saveRDS(object = MEPsDF, file = meps_path)
}
# resume from the tweets stored by an earlier run when they exist; otherwise
# start from a list with one empty (NULL) slot per row of MEPsDF, named after
# the corresponding screen name
tweets_path <- file.path("TwitterMEP", "data", "allMEPtweets.rds")
allMEPtweets <- if (file.exists(tweets_path)) {
  readRDS(tweets_path)
} else {
  setNames(vector("list", length(MEPsDF$SCREEN_NAME)), MEPsDF$SCREEN_NAME)
}
# ---- collect the timeline of every MEP, paging backwards in time ----

# Return the id of the oldest tweet in `tweets`, or NA_character_ when there is
# no usable id (NULL/empty result, or a data frame of NAs such as the one
# returned for suspended accounts).
# Ids are compared by length first and then digit by digit: a plain min() on
# strings is lexicographic and would rank "1000" before "999".
# NOTE(review): assumes `status_id` holds ids as strings of digits - confirm
# for the rtweet version in use.
oldest_status_id <- function(tweets) {
  ids <- as.character(tweets$status_id)
  ids <- ids[!is.na(ids)]
  if (length(ids) == 0) {
    return(NA_character_)
  }
  ids[order(nchar(ids), ids, method = "radix")][1]
}

# Request up to 3200 tweets for `screen_name`; extra arguments (e.g. `max_id`)
# are forwarded to get_timeline(). Returns NULL when the request fails, after
# reporting the reason instead of silently discarding it.
fetch_timeline <- function(screen_name, ...) {
  tryCatch(
    get_timeline(user = screen_name, n = 3200, ...),
    error = function(e) {
      message("Could not retrieve tweets for ", screen_name, ": ",
              conditionMessage(e))
      NULL
    }
  )
}

# MEPs without a Twitter account are excluded; the others are visited in random
# order. Indexing with sample.int() avoids the sample() pitfall where a single
# index n would be expanded to a permutation of 1:n.
on_twitter <- which(!is.na(MEPsDF$SCREEN_NAME))
for (i in on_twitter[sample.int(length(on_twitter))]) {
  # the oldest tweet of this MEP was already found in a previous run
  if (isTRUE(MEPsDF$OldestReached[i])) {
    next
  }

  # resume from the tweets already collected for this MEP; if there are none,
  # start by asking for the latest tweets
  tweets <- allMEPtweets[[i]]
  if (is.null(tweets)) {
    tweets <- fetch_timeline(MEPsDF$SCREEN_NAME[i])
  }
  if (is.null(tweets)) {
    next # nothing stored and the request failed: try again on the next run
  }

  # Keep asking for tweets up to the oldest id seen so far, until a request
  # brings back nothing older. When no id is available (NA) no paging is
  # attempted and OldestReached stays FALSE.
  # NOTE(review): the Twitter timeline endpoint is understood to expose only a
  # user's most recent ~3200 tweets - confirm how far back paging can go.
  oldest_id <- oldest_status_id(tweets)
  while (!is.na(oldest_id)) {
    older <- fetch_timeline(MEPsDF$SCREEN_NAME[i], max_id = oldest_id)
    Sys.sleep(time = 1)
    if (is.null(older)) {
      break # request failed: keep what we have, continue on the next run
    }
    older_id <- oldest_status_id(older)
    if (is.na(older_id)) {
      break # no usable ids in the response: do not draw conclusions from it
    }
    if (identical(older_id, oldest_id)) {
      # the request returned nothing older than what is already stored
      MEPsDF$OldestReached[i] <- TRUE
      break
    }
    tweets <- bind_rows(tweets, older)
    # move the cursor so that the next request starts from the new oldest tweet
    oldest_id <- older_id
  }

  # store interim results after each MEP so that the script can be re-run
  allMEPtweets[[i]] <- tweets %>% distinct()
  saveRDS(object = allMEPtweets, file = file.path("TwitterMEP", "data", "allMEPtweets.rds"))
  saveRDS(object = MEPsDF, file = file.path("TwitterMEP", "data", "MEPsDF.rds"))
  message(paste("New tweets for user", MEPsDF$SCREEN_NAME[i], "stored.")) # inform of progress
  Sys.sleep(time = 1)
}
# stack the per-MEP results into a single data frame without duplicated rows
allMEPtweetsDF <- allMEPtweets %>%
  map_df(bind_rows) %>%
  distinct()
# number of distinct tweets collected
nrow(distinct(allMEPtweetsDF))
# add the details available for each MEP, matching on the screen name
mep_details <- rename(MEPsDF, screen_name = SCREEN_NAME)
allMEPfull <- allMEPtweetsDF %>%
  left_join(mep_details, by = "screen_name")
# store the final dataset
final_path <- file.path("TwitterMEP", "data", "allMEPfull.rds")
saveRDS(object = allMEPfull, file = final_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment