Last active
October 16, 2020 04:02
-
-
Save mkiang/90f961f44487048961fa95951aa55aa3 to your computer and use it in GitHub Desktop.
clean_tweets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## A quick R script for cleaning up your Twitter timeline. This assumes you | |
## have Twitter API creds but it probably isn't necessary. This is based on | |
## Chris Albon's Python script: | |
## https://gist.github.com/chrisalbon/b9bd4a6309c9f5f5eeab41377f27a670 | |
## | |
## In this example, I remove old tweets in two stages: (1) tweets older than
## two years with fewer than 100 likes, and then (2) tweets older than 90
## days with fewer than 25 likes. Retweets are removed after 90 days.
## Importantly, tweets (and retweets) that I liked are *always* protected
## from deletion.
## | |
## USE THIS AT YOUR OWN RISK. I'm not a heavy Twitter user so I am nowhere | |
## near the API limit (~3000 tweets and likes). I have no idea how this will | |
## behave for heavy users. | |
## Imports ---- | |
library(rtweet) | |
library(igraph) | |
library(tidyverse) | |
## Constants ----
## Short-horizon rule: a tweet older than MAX_DAYS days is deleted unless it
## has at least MIN_FAVES likes or is in the protected list.
## I.e., once a tweet is past three months it needs at least MIN_FAVES faves.
MAX_DAYS <- 90
MIN_FAVES <- 25 # >1% of followers seems reasonable

## Long-horizon rule: a tweet older than MAX_DAYS_OLD days is deleted unless
## it has at least MIN_FAVES_OLD likes or is in the protected list.
## I.e., once a tweet is past two years it needs 100+ likes.
MAX_DAYS_OLD <- 730
MIN_FAVES_OLD <- 100

## Retweet rule: a retweet older than MAX_DAYS_RETWEETS days is deleted
## unless it is in the protected list.
MAX_DAYS_RETWEETS <- 90

## Load API stuff ----
## When a secrets.R file is present in the working directory it is sourced
## and is expected to define the names below (see secrets_example.R),
## including any tweet IDs that should never be deleted. Otherwise the blank
## placeholders are used and must be filled in by hand.
if (!file.exists("secrets.R")) {
  USER_NAME <- ""
  APP_NAME <- ""
  API_KEY <- ""
  API_SECRET <- ""
  ACCESS_TOKEN <- ""
  ACCESS_SECRET <- ""
  PROTECTED_TWEETS <- ""
} else {
  source("secrets.R")
}

## Build the token that is passed to every rtweet call further down.
app_tok <- create_token(
  app = APP_NAME,
  consumer_key = API_KEY,
  consumer_secret = API_SECRET,
  access_token = ACCESS_TOKEN,
  access_secret = ACCESS_SECRET
)
## Helper functions ---- | |
reassign_thread_ids <- function(my_tl) {
  ## For each thread, I want to apply the rules to the entire thread based
  ## on the initiating tweet. E.g., if the first tweet of a thread has 100
  ## likes and all other tweets have 1, I want to keep all tweets in the
  ## thread. This function takes all (self-)threads and assigns their ID to
  ## the initiating tweet.
  ##
  ## Args:
  ##   my_tl: timeline data frame with (at least) the columns status_id,
  ##     reply_to_status_id and text.
  ##
  ## Returns:
  ##   my_tl with one extra column, new_status_id: the thread's
  ##   representative ID for tweets that belong to a self-thread, and the
  ##   tweet's own status_id otherwise.

  ## Edge list of reply links: keep rows that either are replied to by, or
  ## reply to, another row of the timeline, and whose text does not start
  ## with "@". Rows with no reply target get a self-loop so they still show
  ## up as vertices.
  thread_df <- my_tl %>%
    filter(
      status_id %in% reply_to_status_id |
        reply_to_status_id %in% status_id,
      substr(text, 1, 1) != "@"
    ) %>%
    select(from = status_id, to = reply_to_status_id) %>%
    mutate(to = ifelse(is.na(to), from, to))

  ## No threads at all: every tweet is its own group. Returning early avoids
  ## building a graph (and a join key) from an empty edge list.
  if (nrow(thread_df) == 0) {
    return(mutate(my_tl, new_status_id = status_id))
  }

  ## Each connected component of the reply graph is one thread.
  thread_assignments <- thread_df %>%
    graph_from_data_frame(directed = TRUE) %>%
    components()

  membership_df <- tibble(
    status_id = names(thread_assignments$membership),
    membership = unname(thread_assignments$membership)
  )

  ## Status IDs are digit strings, so a plain min() compares them
  ## lexicographically and ranks a longer (newer) ID before a shorter
  ## (older) one. Ordering by length first, then by value, picks the
  ## numerically smallest ID in each thread.
  id_mapping <- thread_df %>%
    select(status_id = from) %>%
    left_join(membership_df, by = "status_id") %>%
    group_by(membership) %>%
    mutate(
      new_status_id = status_id[order(nchar(status_id), status_id)[1]]
    ) %>%
    ungroup() %>%
    select(-membership)

  ## Tweets outside any thread keep their own ID as the group key.
  my_tl %>%
    left_join(id_mapping, by = "status_id") %>%
    mutate(new_status_id = ifelse(is.na(new_status_id),
                                  status_id,
                                  new_status_id))
}
subset_columns <- function(my_tl) {
  ## The timeline comes back with ~90 columns; keep only the handful that
  ## are relevant here (silently skipping any that are absent) and add a
  ## remove_this flag column initialised to 0 for every row.
  keep_cols <- c(
    "status_id",
    "new_status_id",
    "created_at",
    "screen_name",
    "text",
    "reply_to_status_id",
    "reply_to_user_id",
    "reply_to_screen_name",
    "is_quote",
    "is_retweet",
    "favorite_count",
    "retweet_count",
    "quote_count",
    "reply_count",
    "quoted_status_id",
    "quoted_text",
    "quoted_created_at",
    "retweet_status_id",
    "retweet_text",
    "retweet_created_at"
  )

  trimmed <- select(my_tl, any_of(keep_cols))
  mutate(trimmed, remove_this = 0)
}
flag_old_lame_tweets <- function(my_tl,
                                 max_days,
                                 min_faves,
                                 remove_cols = TRUE) {
  ## Set remove_this to 1 for every non-retweet that is at least max_days
  ## days old and has fewer than min_faves favorites. All other rows keep
  ## whatever remove_this value they already had.
  ##
  ## Args:
  ##   my_tl: timeline with created_at, favorite_count, is_retweet and
  ##     remove_this columns.
  ##   max_days: minimum age in days for a tweet to be flagged.
  ##   min_faves: favorite count below which a tweet is flagged.
  ##   remove_cols: if TRUE, drop the helper days_diff column again.
  flagged <- my_tl %>%
    mutate(
      days_diff = as.numeric(
        difftime(Sys.time(), created_at, units = "days")
      ),
      remove_this = case_when(
        !is_retweet &
          favorite_count < min_faves &
          days_diff >= max_days ~ 1,
        TRUE ~ remove_this
      )
    )

  if (!remove_cols) {
    return(flagged)
  }

  select(flagged, -days_diff)
}
flag_old_retweets <- function(my_tl,
                              max_days,
                              remove_cols = TRUE) {
  ## Set remove_this to 1 for every retweet that is at least max_days days
  ## old, regardless of its number of faves. Other rows keep their current
  ## remove_this value. (Protected tweets are not handled here.)
  ##
  ## Args:
  ##   my_tl: timeline with created_at, is_retweet and remove_this columns.
  ##   max_days: minimum age in days for a retweet to be flagged.
  ##   remove_cols: if TRUE, drop the helper days_diff column again.
  flagged <- my_tl %>%
    mutate(
      days_diff = as.numeric(
        difftime(Sys.time(), created_at, units = "days")
      ),
      remove_this = case_when(
        is_retweet & days_diff >= max_days ~ 1,
        TRUE ~ remove_this
      )
    )

  if (!remove_cols) {
    return(flagged)
  }

  select(flagged, -days_diff)
}
unflag_protected_tweets <- function(my_tl, my_faves_vector) {
  ## Given a character vector of tweet IDs, unflag these if they were
  ## flagged. The IDs can be self-liked tweets, a list of protected tweets,
  ## or both. A row is unflagged (remove_this set to 0) when either its own
  ## status_id or its retweet_status_id is in the vector.
  ##
  ## Args:
  ##   my_tl: timeline with status_id, retweet_status_id and remove_this.
  ##   my_faves_vector: tweet IDs that must never be deleted.

  ## Drop missing IDs first: `NA %in% c(..., NA)` is TRUE, so one stray NA
  ## would match every row whose retweet_status_id is NA and unflag it.
  protected_ids <- my_faves_vector[!is.na(my_faves_vector)]

  my_tl %>%
    mutate(
      remove_this = case_when(
        status_id %in% protected_ids ~ 0,
        retweet_status_id %in% protected_ids ~ 0,
        TRUE ~ remove_this
      )
    )
}
apply_flag_to_threads <- function(my_tl, remove_cols = TRUE) {
  ## Make the decision per thread: within each new_status_id group, if the
  ## mean of remove_this is below 1 (i.e., at least one tweet of the thread
  ## is being kept), reset remove_this to 0 for the whole group so the
  ## entire thread is saved.
  ##
  ## Args:
  ##   my_tl: timeline with new_status_id and remove_this columns.
  ##   remove_cols: if TRUE, drop the helper remove_this_weighted column.
  by_thread <- group_by(my_tl, new_status_id)

  resolved <- by_thread %>%
    mutate(
      remove_this_weighted = mean(remove_this),
      remove_this = ifelse(remove_this_weighted < 1, 0, remove_this)
    ) %>%
    ungroup()

  if (!remove_cols) {
    return(resolved)
  }

  select(resolved, -remove_this_weighted)
}
## Get twitter data ----
## Pull up to 3000 of the user's own tweets and up to 3000 of their likes.
orig_tl <- get_timeline(USER_NAME, n = 3000, token = app_tok)
my_faves <- get_favorites(USER_NAME, n = 3000, token = app_tok)

## Clean it up ----
## Flags accumulate through the pipeline: old retweets first, then the two
## age/likes rules, then liked and protected tweets are unflagged, and
## finally the decision is made consistent within each self-thread.
my_tl <- orig_tl %>%
  subset_columns() %>%
  reassign_thread_ids() %>%
  flag_old_retweets(MAX_DAYS_RETWEETS) %>%
  flag_old_lame_tweets(MAX_DAYS, MIN_FAVES) %>%
  flag_old_lame_tweets(MAX_DAYS_OLD, MIN_FAVES_OLD) %>%
  unflag_protected_tweets(c(my_faves$status_id, PROTECTED_TWEETS)) %>%
  apply_flag_to_threads()

## MAKE SURE YOU'RE OK WITH DELETING THESE!! ----
## Only rows still flagged with remove_this == 1 are slated for deletion;
## inspect them in the viewer before going any further.
tweets_to_remove <- filter(my_tl, remove_this == 1)
View(tweets_to_remove)
## Loop through and delete
## Uncomment this at your own risk.
# for (i in seq_len(NROW(tweets_to_remove))) {
#   print(sprintf("About to delete: %s.", tweets_to_remove$text[i]))
#   Sys.sleep(5)
#   post_tweet(token = app_tok,
#              destroy_id = tweets_to_remove$status_id[i])
# }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Secrets ----
## Example values only: replace each one with your own account name and
## Twitter API credentials.
USER_NAME <- "mathewkiang"
APP_NAME <- "your_api_name"
API_KEY <- "1234"
API_SECRET <- "3214"
ACCESS_TOKEN <- "3421"
ACCESS_SECRET <- "2134"

## Status IDs (as strings) of tweets that should never be deleted.
PROTECTED_TWEETS <- c("931567726048444416", "1229111568295546885")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment