# Extract the full Twitter history of all current members of the European Parliament with R (rtweet) #rstats
# Package setup ----
# pacman takes care of installing (if missing) and loading the packages below.
# It is only ever called through `pacman::`, so it does not need to be
# attached; checking for the namespace is enough.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load("tidyverse")
pacman::p_load("ROAuth")
pacman::p_load("rtweet")

# This assumes authentication has been taken care of as explained here:
# http://rtweet.info/articles/auth.html
# It should still work without access tokens, but the API rate limits are then
# much stricter (it would take *a lot* longer to get all tweets).
# It is expected that this script will need to be run more than once:
# interim files are automatically stored and recovered if the script is re-run.

# Create folders ----
# recursive = TRUE creates "TwitterMEP" and "TwitterMEP/data" in one call;
# showWarnings = FALSE keeps re-runs quiet when the folders already exist.
dir.create(
  path = file.path("TwitterMEP", "data"),
  showWarnings = FALSE,
  recursive = TRUE
)
## Get list of Twitter accounts of all MEPs ----
# On the first run the list is downloaded and cached locally; on later runs
# the cached copy is used so that the per-MEP `OldestReached` progress flag
# (updated further down in the script) is preserved between runs.
mepListPath <- file.path("TwitterMEP", "data", "MEPsDF.rds")
if (!file.exists(mepListPath)) {
  MEPsDF <- read_csv(file = "https://raw.githubusercontent.com/eliflab/European-Parliament-Open-Data/master/meps_full_list_with_twitter_accounts.csv") %>%
    # strip the leading "@" so the handle can be passed straight to the API
    mutate(
      SCREEN_NAME = stringr::str_replace_all(
        string = SCREEN_NAME,
        pattern = stringr::fixed("@"),
        replacement = ""
      )
    )
  # Control column: set to TRUE once the oldest tweet of an MEP has been
  # reached, so that this MEP can be skipped on later runs.
  MEPsDF$OldestReached <- FALSE
  saveRDS(object = MEPsDF, file = mepListPath)
} else {
  MEPsDF <- read_rds(mepListPath)
}
# Restore or initialise the per-MEP tweet store ----
# `allMEPtweets` is a list with one slot per row of MEPsDF (named by screen
# name); a NULL slot means no tweets have been collected for that MEP yet.
tweetStorePath <- file.path("TwitterMEP", "data", "allMEPtweets.rds")
if (file.exists(tweetStorePath)) {
  # not the first run: load previous results
  allMEPtweets <- readRDS(tweetStorePath)
} else {
  # first run: create an empty list
  allMEPtweets <- setNames(
    vector("list", length(MEPsDF$SCREEN_NAME)),
    MEPsDF$SCREEN_NAME
  )
}
# Collect timelines ----

# Return the id of the oldest tweet among `ids`, or NA_character_ when `ids`
# holds no usable value (NULL, empty, or all NA).
# Ids are compared as digit strings: a shorter id is older, and ids of equal
# length are compared lexicographically. A plain min() on character ids is
# purely lexicographic and mis-orders ids of different lengths.
# NOTE(review): assumes status_id is stored as character (not numeric) —
# confirm against the installed rtweet version.
oldestStatusId <- function(ids) {
  ids <- as.character(ids)
  ids <- ids[!is.na(ids)]
  if (length(ids) == 0) {
    return(NA_character_)
  }
  idLengths <- nchar(ids)
  min(ids[idLengths == min(idLengths)])
}

# Request up to 3200 tweets for `screenName`, optionally no newer than
# `maxId`. Returns NULL (after reporting the problem) if the request fails,
# so that one failing account does not abort the whole run.
fetchTimeline <- function(screenName, maxId = NULL) {
  tryCatch(
    expr = get_timeline(user = screenName, n = 3200, max_id = maxId),
    error = function(e) {
      message(
        "Could not retrieve tweets for ", screenName, ": ",
        conditionMessage(e)
      )
      NULL
    }
  )
}

# Exclude MEPs not on Twitter and visit the others in random order.
# sample.int() is used on the positions because sample(x) with a single
# number x draws from 1:x instead of returning x itself.
onTwitter <- which(!is.na(MEPsDF$SCREEN_NAME))
for (i in onTwitter[sample.int(length(onTwitter))]) {
  # the oldest tweet of this MEP has already been found: nothing left to ask
  if (isTRUE(MEPsDF$OldestReached[i])) {
    next
  }

  # start from tweets stored by a previous run, otherwise ask for the latest
  temp <- allMEPtweets[[i]]
  if (is.null(temp)) {
    temp <- fetchTimeline(MEPsDF$SCREEN_NAME[i])
  }
  if (is.null(temp)) {
    next # request failed; this MEP is tried again on the next run
  }

  # Suspended accounts throw back a data frame of NAs: oldestId is then NA
  # and the paging loop below is skipped.
  oldestId <- oldestStatusId(temp$status_id)

  # Until the oldest tweet is found, keep asking for previous tweets.
  while (!is.na(oldestId)) {
    temp2 <- fetchTimeline(MEPsDF$SCREEN_NAME[i], maxId = oldestId)
    Sys.sleep(time = 1)
    if (is.null(temp2)) {
      break # request failed: keep what we have and skip to next MEP
    }
    pageOldestId <- oldestStatusId(temp2$status_id)
    if (is.na(pageOldestId)) {
      break # empty or unusable page: do not mark as complete, retry later
    }
    newOldestId <- oldestStatusId(c(oldestId, pageOldestId))
    if (newOldestId == oldestId) {
      # nothing older than what we already hold came back
      MEPsDF$OldestReached[i] <- TRUE
      break
    }
    temp <- bind_rows(temp, temp2)
    # the next request must start from the new oldest tweet
    oldestId <- newOldestId
  }

  # store interim results so that the script can be resumed
  allMEPtweets[[i]] <- temp %>% distinct()
  saveRDS(
    object = allMEPtweets,
    file = file.path("TwitterMEP", "data", "allMEPtweets.rds")
  )
  saveRDS(
    object = MEPsDF,
    file = file.path("TwitterMEP", "data", "MEPsDF.rds")
  )
  # inform of progress
  message(paste("New tweets for user", MEPsDF$SCREEN_NAME[i], "stored."))
  Sys.sleep(time = 1)
}
# Build and store the final dataset ----

# Transform the per-MEP list into a single data frame; bind_rows() skips the
# NULL slots of MEPs for whom nothing has been collected.
allMEPtweetsDF <- bind_rows(allMEPtweets) %>% distinct()
# number of distinct rows collected (printed when run interactively)
nrow(allMEPtweetsDF)

# Merge with the initial data frame to include more details on MEPs.
# MEPs without a screen name are left out of the lookup table: joins match NA
# to NA, so rows with an NA screen_name (e.g. the all-NA rows stored for
# suspended accounts) would otherwise be duplicated once per such MEP.
# NOTE(review): the join is case-sensitive — confirm that the handles in the
# CSV use the same capitalisation as the screen_name returned by the API.
mepDetails <- MEPsDF %>%
  filter(!is.na(SCREEN_NAME)) %>%
  rename(screen_name = SCREEN_NAME)
allMEPfull <- left_join(allMEPtweetsDF, mepDetails, by = "screen_name")

# store the final dataset
saveRDS(
  object = allMEPfull,
  file = file.path("TwitterMEP", "data", "allMEPfull.rds")
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment