Last active
July 30, 2020 11:18
-
-
Save Arf9999/9f23830fde3fa5c254b8f2745398628b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Function to rehydrate GetOldTweets3 tweet data using rtweet and the standard Twitter API.
## NB: a Twitter auth token, or alternatively a list of tokens that are loaded into the environment, *must* be specified.
#' Rehydrate GetOldTweets3 statuses via rtweet::lookup_statuses()
#'
#' Looks up every status id in `got3_df$id`. When the best available token
#' has enough "lookup_statuses" requests left, everything is fetched in a
#' single call; otherwise the ids are fetched in batches sized to the
#' remaining quota (100 ids per request), rotating to whichever token has the
#' most requests left and pausing for a rate-limit reset when needed. Ids that
#' were not returned are looked up once more, and whatever is still absent is
#' written to a CSV log file in the working directory.
#'
#' @param got3_df Data frame with an `id` column holding the status ids.
#' @param token A single token; only used when `token_list` is `NULL`.
#' @param token_list A list of tokens to rotate through.
#' @return The rows returned by `rtweet::lookup_statuses()`, combined with
#'   `dplyr::bind_rows()`.
rehydrate_got3_statuses <- function(got3_df, token = NULL, token_list = NULL) {
  ### Check tokens
  # `break` is only valid inside a loop; signal a real error instead.
  if (is.null(token) && is.null(token_list)) {
    stop(
      "Please designate either a token or a list of tokens that are loaded into the environment",
      call. = FALSE
    )
  }

  # Fail loudly on a missing dependency (require() only returns FALSE).
  needed <- c("rtweet", "dplyr", "purrr", "readr")
  installed <- vapply(needed, requireNamespace, logical(1), quietly = TRUE)
  if (!all(installed)) {
    stop(
      "Missing required package(s): ",
      paste(needed[!installed], collapse = ", "),
      call. = FALSE
    )
  }

  if (is.null(token_list)) {
    token_list <- c(token)
  }

  # One row of "lookup_statuses" rate-limit data per token.
  fetch_ratelimits <- function() {
    purrr::map_df(token_list, rtweet::rate_limit, query = "lookup_statuses")
  }

  # `[[` yields a plain vector for both data.frames and tibbles.
  status_ids <- got3_df[["id"]]
  df_length <- nrow(got3_df) ## number of statuses to look up

  # check ratelimits for all tokens
  ratelimits <- fetch_ratelimits()

  if (as.numeric(max(ratelimits$remaining)) > df_length / 100) {
    ## no ratelimit reset or token rotation required
    message(paste0("lookup of ", df_length, " statuses"))
    row_of_max_rl <- which.max(ratelimits$remaining)
    rehydration <- rtweet::lookup_statuses(
      status_ids,
      token = token_list[[row_of_max_rl]]
    )
  } else {
    # Iterate through the ids using token rotation and rate-reset pausing.
    # Batches are collected in a list and bound once at the end.
    batches <- list()
    last_capture <- 0L ## index of the last id requested so far
    while (last_capture < df_length) {
      row_of_max_rl <- which.max(ratelimits$remaining)
      requests_left <- as.numeric(ratelimits$remaining[row_of_max_rl])

      # Only query when the chosen token can make at least one request;
      # with zero requests left, `1:0` would select the wrong rows.
      if (requests_left > 0) {
        base_capture <- last_capture + 1
        # Exactly requests_left * 100 ids, so the quota is not exceeded by one.
        last_capture <- min(last_capture + requests_left * 100, df_length)
        message(paste0("rl requests remain : ", requests_left))
        message(paste0("base: ", base_capture, " upper: ", last_capture))
        batches[[length(batches) + 1L]] <- rtweet::lookup_statuses(
          status_ids[base_capture:last_capture],
          token = token_list[[row_of_max_rl]]
        )
      }

      ratelimits <- fetch_ratelimits() # check ratelimits
      ### manage ratelimit resets as gracefully as possible -
      ### conservatively set to 100 queries as test
      if (max(ratelimits$remaining) < 100 && last_capture < df_length) {
        message(paste0(
          "Pausing for ratelimit reset: ",
          min(ratelimits$reset),
          " minutes"
        ))
        # `reset` is treated as minutes, as in the message above; clamp at
        # zero because Sys.sleep() rejects negative durations.
        Sys.sleep(max(0, as.numeric(min(ratelimits$reset) * 60)))
        ratelimits <- fetch_ratelimits()
      }
    }
    rehydration <- dplyr::bind_rows(batches)
  }

  ## check for missing tweets and re-lookup
  orig <- dplyr::tibble(status_id = status_ids)
  message(paste0("original: ", nrow(orig)))
  # `x["col"]` always keeps a data frame, unlike `x[, "col"]`.
  not_found <- dplyr::anti_join(
    orig,
    rehydration["status_id"],
    by = "status_id"
  )
  message(paste0("missing: ", nrow(not_found)))

  if (nrow(not_found) > 0) {
    ## try again to look up missing statuses
    ratelimits <- fetch_ratelimits()
    row_of_max_rl <- which.max(ratelimits$remaining)
    message(paste0("Attempting to populate missing tweets: ", nrow(not_found)))
    rehydration <- dplyr::bind_rows(
      rehydration,
      rtweet::lookup_statuses(
        not_found[["status_id"]],
        token = token_list[[row_of_max_rl]]
      )
    )

    ## write log file of missing tweets
    still_missing <- dplyr::anti_join(
      orig,
      rehydration["status_id"],
      by = "status_id"
    )
    still_missing <- dplyr::rename(still_missing, id = status_id)
    still_missing <- dplyr::left_join(still_missing, got3_df, by = "id")
    still_missing <- dplyr::mutate(
      still_missing,
      error = "Status not downloaded"
    )
    # Build the file name once so the message names the file actually written.
    log_file <- paste0(
      "got3_rehydration_missing_log_",
      as.numeric(Sys.time()),
      ".csv"
    )
    message(paste0(
      nrow(still_missing),
      " tweets not downloaded, see log file for details: ",
      log_file
    ))
    readr::write_csv(still_missing, log_file)
  }

  rehydration
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment