Skip to content

Instantly share code, notes, and snippets.

@mkiang

mkiang/clean_tweets.R

Last active Oct 16, 2020
Embed
What would you like to do?
clean_tweets
## A quick R script for cleaning up your Twitter timeline. This assumes you
## have Twitter API creds but it probably isn't necessary. This is based on
## Chris Albon's Python script:
## https://gist.github.com/chrisalbon/b9bd4a6309c9f5f5eeab41377f27a670
##
## In this example, I remove old tweets in two stages: (1) if tweets are
## older than two years and have fewer than 100 likes and then (2) if tweets
## are older than 90 days and have fewer than 25 likes. Retweets are removed
## after 90 days. Importantly, tweets (and retweets) that I liked are *always*
## protected from deletion.
##
## USE THIS AT YOUR OWN RISK. I'm not a heavy Twitter user so I am nowhere
## near the API limit (~3000 tweets and likes). I have no idea how this will
## behave for heavy users.
## Imports ----
library(rtweet)
library(igraph)
library(tidyverse)
## Constants ----
## Recent tier: delete all tweets that are older than MAX_DAYS (in days)
## unless they have at least MIN_FAVES likes or are in the protected list.
## I.e., if it's older than three months, it needs to have at least
## MIN_FAVES likes.
MAX_DAYS <- 90
MIN_FAVES <- 25 # >1% of followers seems reasonable
## Old tier: delete all tweets that are older than MAX_DAYS_OLD (in days)
## unless they have at least MIN_FAVES_OLD likes or are in the protected list.
## I.e., if it's older than two years, it needs to have 100+ likes.
MAX_DAYS_OLD <- 730
MIN_FAVES_OLD <- 100
## Delete retweets older than MAX_DAYS_RETWEETS days, regardless of likes,
## unless they are in the protected list.
MAX_DAYS_RETWEETS <- 90
## Load API stuff ----
## Fill in the API information below or put it in a secrets.R file in the
## working directory. secrets.R can also define PROTECTED_TWEETS, a character
## vector of tweet IDs that should never be deleted. See secrets_example.R.
if (file.exists("secrets.R")) {
  source("secrets.R")
} else {
  USER_NAME <- ""
  APP_NAME <- ""
  API_KEY <- ""
  API_SECRET <- ""
  ACCESS_TOKEN <- ""
  ACCESS_SECRET <- ""
}

## Default to "no protected tweets" when neither branch above defined the
## list. An empty vector (rather than "") keeps a bogus empty-string ID out
## of the protected list, and the check also covers a secrets.R that omits
## PROTECTED_TWEETS.
if (!exists("PROTECTED_TWEETS")) {
  PROTECTED_TWEETS <- character(0)
}

## Token that is passed to every rtweet call in this script.
app_tok <- create_token(
  app = APP_NAME,
  consumer_key = API_KEY,
  consumer_secret = API_SECRET,
  access_token = ACCESS_TOKEN,
  access_secret = ACCESS_SECRET
)
## Helper functions ----
reassign_thread_ids <- function(my_tl) {
  ## Give every tweet in a (self-)thread a shared `new_status_id` so that
  ## later steps can apply the rules to the thread as a unit. E.g., if the
  ## first tweet of a thread has 100 likes and all other tweets have 1, I
  ## want to keep all tweets in the thread.
  ##
  ## A tweet is treated as part of a self-thread when another tweet in
  ## `my_tl` replies to it, or it replies to another tweet in `my_tl`, and
  ## its text does not start with "@". Threads are the weakly connected
  ## components of the resulting reply graph.
  ##
  ## Args:
  ##   my_tl: timeline data frame with `status_id`, `reply_to_status_id`,
  ##     and `text` columns.
  ##
  ## Returns:
  ##   `my_tl` with an added `new_status_id` column. Tweets in a thread share
  ##   the thread's smallest status ID; all other tweets keep their own
  ##   `status_id`.
  thread_df <- my_tl %>%
    filter(
      status_id %in% reply_to_status_id |
        reply_to_status_id %in% status_id,
      substr(text, 1, 1) != "@"
    ) %>%
    select(from = status_id, to = reply_to_status_id) %>%
    ## Thread starters reply to nothing; point them at themselves so every
    ## edge has two endpoints.
    mutate(to = ifelse(is.na(to), from, to))

  ## No self-threads at all: there is no graph to build, so every tweet is
  ## simply its own "thread".
  if (nrow(thread_df) == 0) {
    return(my_tl %>% mutate(new_status_id = status_id))
  }

  thread_assignments <- thread_df %>%
    graph_from_data_frame(directed = TRUE) %>%
    components(mode = "weak")

  membership_df <- tibble(
    status_id = names(thread_assignments$membership),
    membership = unname(thread_assignments$membership)
  )

  id_mapping <- thread_df %>%
    select(status_id = from) %>%
    left_join(membership_df, by = "status_id") %>%
    group_by(membership) %>%
    ## IDs are strings, so a plain min() compares them lexicographically and
    ## picks the wrong tweet when IDs differ in length. Ordering by length
    ## first gives the numerically smallest ID (presumably the initiating
    ## tweet, assuming status IDs increase over time).
    mutate(
      new_status_id = status_id[order(nchar(status_id), status_id)][1]
    ) %>%
    ungroup() %>%
    select(-membership)

  my_tl %>%
    left_join(id_mapping, by = "status_id") %>%
    ## Tweets that are not part of any thread keep their own ID.
    mutate(new_status_id = coalesce(new_status_id, status_id))
}
subset_columns <- function(my_tl) {
  ## The timeline has ~90 columns; keep only the ones the cleanup uses
  ## (silently skipping any that are absent) and add a `remove_this` flag
  ## column initialised to 0 (i.e., keep).
  keep_cols <- c(
    "status_id",
    "new_status_id",
    "created_at",
    "screen_name",
    "text",
    "reply_to_status_id",
    "reply_to_user_id",
    "reply_to_screen_name",
    "is_quote",
    "is_retweet",
    "favorite_count",
    "retweet_count",
    "quote_count",
    "reply_count",
    "quoted_status_id",
    "quoted_text",
    "quoted_created_at",
    "retweet_status_id",
    "retweet_text",
    "retweet_created_at"
  )
  trimmed <- select(my_tl, any_of(keep_cols))
  mutate(trimmed, remove_this = 0)
}
flag_old_lame_tweets <- function(my_tl,
                                 max_days,
                                 min_faves,
                                 remove_cols = TRUE) {
  ## Flag (remove_this = 1) every non-retweet that is at least `max_days`
  ## days old and has fewer than `min_faves` likes. All other rows keep
  ## whatever `remove_this` value they already had.
  ##
  ## Args:
  ##   my_tl: timeline with `created_at`, `favorite_count`, `is_retweet`,
  ##     and `remove_this` columns.
  ##   max_days: minimum age (in days) for a tweet to be flagged.
  ##   min_faves: tweets with fewer likes than this are flagged.
  ##   remove_cols: if TRUE, drop the temporary `days_diff` column.
  flagged <- my_tl %>%
    mutate(
      days_diff = as.numeric(
        difftime(Sys.time(), created_at, units = "days")
      ),
      remove_this = case_when(
        !is_retweet &
          favorite_count < min_faves &
          days_diff >= max_days ~ 1,
        TRUE ~ remove_this
      )
    )

  if (!remove_cols) {
    return(flagged)
  }
  select(flagged, -days_diff)
}
flag_old_retweets <- function(my_tl,
                              max_days,
                              remove_cols = TRUE) {
  ## Flag (remove_this = 1) every retweet that is at least `max_days` days
  ## old, no matter how many likes it has. Protected tweets are not handled
  ## here; they are unflagged in a separate step. All other rows keep their
  ## current `remove_this` value.
  ##
  ## Args:
  ##   my_tl: timeline with `created_at`, `is_retweet`, and `remove_this`
  ##     columns.
  ##   max_days: minimum age (in days) for a retweet to be flagged.
  ##   remove_cols: if TRUE, drop the temporary `days_diff` column.
  flagged <- my_tl %>%
    mutate(
      days_diff = as.numeric(
        difftime(Sys.time(), created_at, units = "days")
      ),
      remove_this = case_when(
        is_retweet & days_diff >= max_days ~ 1,
        TRUE ~ remove_this
      )
    )

  if (!remove_cols) {
    return(flagged)
  }
  select(flagged, -days_diff)
}
unflag_protected_tweets <- function(my_tl, my_faves_vector) {
  ## Reset `remove_this` to 0 for every row whose own ID, or whose retweeted
  ## tweet's ID, appears in `my_faves_vector`. The vector can hold self-liked
  ## tweets, a hand-picked protected list, or both. Rows that do not match
  ## keep their current flag.
  mutate(
    my_tl,
    remove_this = case_when(
      status_id %in% my_faves_vector |
        retweet_status_id %in% my_faves_vector ~ 0,
      TRUE ~ remove_this
    )
  )
}
apply_flag_to_threads <- function(my_tl, remove_cols = TRUE) {
  ## Save whole threads: within each `new_status_id` group, if at least one
  ## tweet is not flagged for removal (the group's mean of `remove_this` is
  ## below 1), clear the flag for every tweet in that group.
  ##
  ## Args:
  ##   my_tl: timeline with `new_status_id` and `remove_this` columns.
  ##   remove_cols: if TRUE, drop the temporary `remove_this_weighted`
  ##     column.
  by_thread <- my_tl %>%
    group_by(new_status_id) %>%
    mutate(
      remove_this_weighted = mean(remove_this),
      remove_this = ifelse(remove_this_weighted < 1, 0, remove_this)
    ) %>%
    ungroup()

  if (!remove_cols) {
    return(by_thread)
  }
  select(by_thread, -remove_this_weighted)
}
## Get twitter data ----
## Request up to 3000 tweets and up to 3000 likes for USER_NAME.
orig_tl <- get_timeline(USER_NAME, n = 3000, token = app_tok)
my_faves <- get_favorites(USER_NAME, n = 3000, token = app_tok)

## Clean it up ----
## Flag first, then unflag: retweets and both age/like tiers set
## remove_this = 1, liked/protected tweets are reset to 0, and finally any
## thread with at least one surviving tweet is kept in full.
my_tl <- orig_tl %>%
  subset_columns() %>%
  reassign_thread_ids() %>%
  flag_old_retweets(MAX_DAYS_RETWEETS) %>%
  flag_old_lame_tweets(MAX_DAYS, MIN_FAVES) %>%
  flag_old_lame_tweets(MAX_DAYS_OLD, MIN_FAVES_OLD) %>%
  unflag_protected_tweets(c(my_faves$status_id, PROTECTED_TWEETS)) %>%
  apply_flag_to_threads()

## MAKE SURE YOU'RE OK WITH DELETING THESE!! ----
tweets_to_remove <- my_tl %>%
  filter(remove_this == 1)
View(tweets_to_remove)

## Loop through and delete
## Uncomment this at your own risk.
## seq_len() (rather than 1:NROW()) makes the loop a no-op when nothing is
## flagged; 1:0 would run the body for i = 1 and i = 0.
# for (i in seq_len(nrow(tweets_to_remove))) {
#   print(sprintf("About to delete: %s.", tweets_to_remove$text[i]))
#   Sys.sleep(5)
#   post_tweet(token = app_tok,
#              destroy_id = tweets_to_remove$status_id[i])
# }
## Secrets ----
## Placeholder values only (presumably the gist's secrets_example.R). Put
## assignments like these, with your own credentials, in secrets.R, which
## the main script sources when it exists.
USER_NAME <- "mathewkiang"
APP_NAME <- "your_api_name"
API_KEY <- "1234"
API_SECRET <- "3214"
ACCESS_TOKEN <- "3421"
ACCESS_SECRET <- "2134"
## Tweet IDs (as strings) that must never be deleted.
PROTECTED_TWEETS <- c("931567726048444416", "1229111568295546885")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.