## clean_tweets
## A quick R script for cleaning up your Twitter timeline. This assumes you
## have Twitter API creds but it probably isn't necessary. This is based on
## Chris Albon's Python script:
## https://gist.github.com/chrisalbon/b9bd4a6309c9f5f5eeab41377f27a670
##
## In this example, I remove old tweets in two stages: (1) if tweets are
## older than two years and have fewer than 100 likes and then (2) if tweets
## are older than 90 days and have fewer than 25 likes. Retweets are removed
## after 90 days. Importantly, tweets (and retweets) that I liked are *always*
## protected from deletion.
##
## USE THIS AT YOUR OWN RISK. I'm not a heavy Twitter user so I am nowhere
## near the API limit (~3000 tweets and likes). I have no idea how this will
## behave for heavy users.
## Imports ---- | |
library(rtweet) | |
library(igraph) | |
library(tidyverse) | |
## Constants ----

## Recent-tweet rule: a tweet older than MAX_DAYS days is deleted unless it
## has at least MIN_FAVES likes or is in the protected list.
## I.e., once a tweet is three months old it needs at least MIN_FAVES faves.
MAX_DAYS <- 90
MIN_FAVES <- 25 # >1% of followers seems reasonable

## Old-tweet rule: a tweet older than MAX_DAYS_OLD days is deleted unless it
## has at least MIN_FAVES_OLD likes or is in the protected list.
## I.e., once a tweet is two years old it needs 100+ likes.
MAX_DAYS_OLD <- 730
MIN_FAVES_OLD <- 100

## Retweet rule: a retweet older than MAX_DAYS_RETWEETS days is deleted
## unless it is in the protected list.
MAX_DAYS_RETWEETS <- 90
## Load API stuff ----
## Credentials, plus an optional vector of tweet IDs that must never be
## deleted, are read from secrets.R (looked up relative to the working
## directory) when that file exists; see secrets_example.R for the expected
## variable names. Without that file, fill in the placeholders below by hand.
if (!file.exists("secrets.R")) {
  USER_NAME <- ""
  APP_NAME <- ""
  API_KEY <- ""
  API_SECRET <- ""
  ACCESS_TOKEN <- ""
  ACCESS_SECRET <- ""
  PROTECTED_TWEETS <- ""
} else {
  source("secrets.R")
}

## Build the token that is passed to every rtweet call further down.
app_tok <- create_token(
  app = APP_NAME,
  consumer_key = API_KEY,
  consumer_secret = API_SECRET,
  access_token = ACCESS_TOKEN,
  access_secret = ACCESS_SECRET
)
## Helper functions ---- | |
reassign_thread_ids <- function(my_tl) {
  ## For each thread, I want to apply the rules to the entire thread based
  ## on the initiating tweet. E.g., if the first tweet of a thread has 100
  ## likes and all other tweets have 1, I want to keep all tweets in the
  ## thread. This function takes all (self-)threads and assigns the ID of
  ## the initiating tweet to every tweet in the thread.
  ##
  ## my_tl: timeline data frame with character columns status_id,
  ##        reply_to_status_id and text.
  ## Returns my_tl with one extra column, new_status_id: the thread's
  ## representative ID for tweets that belong to a self-thread, and the
  ## tweet's own status_id for everything else.

  ## Edge list of self-threads: tweets that are replied to by, or reply to,
  ## another tweet in this timeline, skipping tweets whose text starts with
  ## "@". Thread roots have no parent, so they point at themselves.
  thread_df <- my_tl %>%
    filter(
      status_id %in% reply_to_status_id |
        reply_to_status_id %in% status_id,
      substr(text, 1, 1) != "@"
    ) %>%
    select(from = status_id, to = reply_to_status_id) %>%
    mutate(to = ifelse(is.na(to), from, to))

  ## Without any self-thread there are no vertex names to join on, so every
  ## tweet simply is its own thread.
  if (nrow(thread_df) == 0) {
    return(my_tl %>% mutate(new_status_id = status_id))
  }

  ## Each connected component of the reply graph is one thread.
  thread_assignments <- thread_df %>%
    graph_from_data_frame(directed = TRUE) %>%
    components()

  id_mapping <- thread_df %>%
    select(status_id = from) %>%
    left_join(
      tibble(
        status_id = names(thread_assignments$membership),
        membership = unname(thread_assignments$membership)
      ),
      by = "status_id"
    ) %>%
    group_by(membership) %>%
    ## IDs are digit strings, so a plain min() compares them lexicographically
    ## ("1000" < "999"). Ordering by length first picks the numerically
    ## smallest ID without converting to a number.
    mutate(
      new_status_id = status_id[order(nchar(status_id), status_id)][1]
    ) %>%
    ungroup()

  my_tl %>%
    left_join(
      id_mapping %>%
        select(-membership),
      by = "status_id"
    ) %>%
    mutate(
      new_status_id = ifelse(is.na(new_status_id), status_id, new_status_id)
    )
}
subset_columns <- function(my_tl) {
  ## The timeline comes back with ~90 columns; keep only the ones that are
  ## relevant here. Columns that are absent from my_tl are silently skipped
  ## (any_of), and the result follows the order listed below.
  ## A remove_this column is added and initialised to 0 for every row.
  wanted <- c(
    "status_id",
    "new_status_id",
    "created_at",
    "screen_name",
    "text",
    "reply_to_status_id",
    "reply_to_user_id",
    "reply_to_screen_name",
    "is_quote",
    "is_retweet",
    "favorite_count",
    "retweet_count",
    "quote_count",
    "reply_count",
    "quoted_status_id",
    "quoted_text",
    "quoted_created_at",
    "retweet_status_id",
    "retweet_text",
    "retweet_created_at"
  )

  trimmed <- select(my_tl, any_of(wanted))
  mutate(trimmed, remove_this = 0)
}
flag_old_lame_tweets <- function(my_tl,
                                 max_days,
                                 min_faves,
                                 remove_cols = TRUE) {
  ## Set remove_this to 1 for every non-retweet that is at least max_days
  ## old and has fewer than min_faves likes. All other rows keep whatever
  ## remove_this value they already had.
  ##
  ## my_tl:       data frame with created_at, favorite_count, is_retweet and
  ##              remove_this columns.
  ## max_days:    minimum age in days for a tweet to be flagged.
  ## min_faves:   tweets with fewer likes than this are flagged.
  ## remove_cols: when TRUE the helper column days_diff is dropped again.
  flagged <- mutate(
    my_tl,
    days_diff = as.numeric(
      difftime(Sys.time(), created_at, units = "days")
    ),
    remove_this = case_when(
      !is_retweet &
        favorite_count < min_faves &
        days_diff >= max_days ~ 1,
      TRUE ~ remove_this
    )
  )

  if (!remove_cols) {
    return(flagged)
  }
  select(flagged, -days_diff)
}
flag_old_retweets <- function(my_tl,
                              max_days,
                              remove_cols = TRUE) {
  ## Set remove_this to 1 for every retweet that is at least max_days old,
  ## no matter how many faves it has. Rows that are not old retweets keep
  ## their current remove_this value. (Protected tweets are unflagged in a
  ## separate step, not here.)
  ##
  ## remove_cols: when TRUE the helper column days_diff is dropped again.
  with_age <- my_tl %>%
    mutate(
      days_diff = as.numeric(
        difftime(Sys.time(), created_at, units = "days")
      ),
      remove_this = case_when(
        is_retweet & days_diff >= max_days ~ 1,
        TRUE ~ remove_this
      )
    )

  if (remove_cols) {
    select(with_age, -days_diff)
  } else {
    with_age
  }
}
unflag_protected_tweets <- function(my_tl, my_faves_vector) {
  ## Given a character vector of tweet IDs, unflag these if they were flagged.
  ## These can be self-liked tweets or a list of protected tweets (or both).
  ##
  ## my_tl:           data frame with status_id, retweet_status_id and
  ##                  remove_this columns.
  ## my_faves_vector: tweet IDs to protect. A row is protected when either
  ##                  its own status_id or the ID of the tweet it retweets
  ##                  is in this vector.
  ## Returns my_tl with remove_this set to 0 on protected rows.

  ## Drop NA and "" entries before matching: `NA %in% c(..., NA)` is TRUE,
  ## so a single NA in the vector would otherwise protect every row whose
  ## retweet_status_id is NA, i.e. every tweet that is not a retweet.
  protected_ids <- my_faves_vector[
    !is.na(my_faves_vector) & nzchar(my_faves_vector)
  ]

  my_tl %>%
    mutate(
      remove_this = case_when(
        status_id %in% protected_ids ~ 0,
        retweet_status_id %in% protected_ids ~ 0,
        TRUE ~ remove_this
      )
    )
}
apply_flag_to_threads <- function(my_tl, remove_cols = TRUE) {
  ## Threads (rows sharing a new_status_id) are kept or removed as a whole:
  ## if at least one tweet of a thread is not flagged, the whole thread is
  ## saved. The mean of remove_this within a thread is below 1 exactly when
  ## some tweet in it is not flagged, in which case every flag in that
  ## thread is reset to 0.
  ##
  ## remove_cols: when TRUE the helper column remove_this_weighted (the
  ##              per-thread mean) is dropped from the result.
  by_thread <- group_by(my_tl, new_status_id)

  scored <- mutate(
    by_thread,
    remove_this_weighted = mean(remove_this),
    remove_this = ifelse(remove_this_weighted < 1, 0, remove_this)
  )

  result <- ungroup(scored)
  if (remove_cols) {
    result <- select(result, -remove_this_weighted)
  }
  result
}
## Get twitter data ----
## Request up to 3000 timeline tweets and up to 3000 likes for USER_NAME.
orig_tl <- get_timeline(USER_NAME, n = 3000, token = app_tok)
my_faves <- get_favorites(USER_NAME, n = 3000, token = app_tok)

## Clean it up ----
## The order matters: first flag everything that qualifies for deletion, then
## unflag liked/protected tweets, and only then resolve threads, so that a
## single protected tweet saves the whole thread it belongs to.
my_tl <- orig_tl %>%
  subset_columns() %>%
  reassign_thread_ids() %>%
  flag_old_retweets(MAX_DAYS_RETWEETS) %>%
  flag_old_lame_tweets(MAX_DAYS, MIN_FAVES) %>%
  flag_old_lame_tweets(MAX_DAYS_OLD, MIN_FAVES_OLD) %>%
  unflag_protected_tweets(c(my_faves$status_id, PROTECTED_TWEETS)) %>%
  apply_flag_to_threads()

## MAKE SURE YOU'RE OK WITH DELETING THESE!! ----
tweets_to_remove <- my_tl %>%
  filter(remove_this == 1)
View(tweets_to_remove)

## Loop through and delete
## Uncomment this at your own risk.
## seq_len() (rather than 1:NROW()) makes the loop a no-op when nothing is
## flagged; 1:0 would run the body for i = 1 and i = 0.
# for (i in seq_len(NROW(tweets_to_remove))) {
#   print(sprintf("About to delete: %s.", tweets_to_remove$text[i]))
#   Sys.sleep(5)
#   post_tweet(token = app_tok,
#              destroy_id = tweets_to_remove$status_id[i])
# }
## Secrets ----
## NOTE(review): this looks like the secrets_example.R template that the
## main script refers to -- copy it to secrets.R and replace every value.

## Account whose timeline and likes are fetched.
USER_NAME <- "mathewkiang"

## Twitter app name and API credentials handed to create_token().
APP_NAME <- "your_api_name"
API_KEY <- "1234"
API_SECRET <- "3214"
ACCESS_TOKEN <- "3421"
ACCESS_SECRET <- "2134"

## Tweet IDs (as strings) that must never be deleted.
PROTECTED_TWEETS <- c("931567726048444416", "1229111568295546885")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment