mkiang/clean_tweets.R

## clean_tweets.R
## A quick R script for cleaning up your Twitter timeline. This assumes you
## have Twitter API creds but it probably isn't necessary. This is based on
## Chris Albon's Python script:
##      https://gist.github.com/chrisalbon/b9bd4a6309c9f5f5eeab41377f27a670
##
## In this example, I remove old tweets in two stages: (1) if tweets are
## older than two years and have fewer than 100 likes and then (2) if tweets
## are older than 90 days and have fewer than 25. Retweets are removed after
## 90 days. Importantly, tweets (and retweets) that I liked are *always*
## protected from deletion.
##
## USE THIS AT YOUR OWN RISK. I'm not a heavy Twitter user so I am nowhere
## near the API limit (~3000 tweets and likes). I have no idea how this will
## behave for heavy users.

## Imports ----
library(rtweet)
library(igraph)
library(tidyverse)

## Constants ----
## Stage 1 (recent tweets): a tweet that is MAX_DAYS days old or older is
## deleted unless it has at least MIN_FAVES likes or is in the protected list.
## I.e., once a tweet is three months old it needs at least MIN_FAVES faves.
MAX_DAYS <- 90
MIN_FAVES <- 25 # >1% of followers seems reasonable

## Stage 2 (old tweets): a tweet that is MAX_DAYS_OLD days old or older is
## deleted unless it has at least MIN_FAVES_OLD likes or is in the protected
## list. I.e., once a tweet is two years old it needs 100+ likes.
MAX_DAYS_OLD <- 730
MIN_FAVES_OLD <- 100

## Retweets: deleted once they are MAX_DAYS_RETWEETS days old, regardless of
## likes, unless they are in the protected list.
MAX_DAYS_RETWEETS <- 90

## Load API stuff ----
## Fill in the API information below, or put it in a secrets.R file in the
## working directory (see secrets_example.R). secrets.R may also define
## PROTECTED_TWEETS, a character vector of tweet IDs that should never be
## deleted.
if (file.exists("secrets.R")) {
    source("secrets.R")
} else {
    USER_NAME <- ""
    APP_NAME <- ""
    API_KEY <- ""
    API_SECRET <- ""
    ACCESS_TOKEN <- ""
    ACCESS_SECRET <- ""
    PROTECTED_TWEETS <- character(0)
}

## PROTECTED_TWEETS is optional. If secrets.R did not define it, fall back to
## an empty vector so the cleanup step does not stop with "object not found".
if (!exists("PROTECTED_TWEETS")) {
    PROTECTED_TWEETS <- character(0)
}

## Build the token that every rtweet call below passes as `token`.
app_tok <- create_token(
    app = APP_NAME,
    consumer_key = API_KEY,
    consumer_secret = API_SECRET,
    access_token = ACCESS_TOKEN,
    access_secret = ACCESS_SECRET
)

## Helper functions ----
reassign_thread_ids <- function(my_tl) {
    ## Give every tweet in a (self-)thread a shared `new_status_id` so later
    ## steps can apply the keep/delete rules to the thread as a whole. E.g.,
    ## if the first tweet of a thread has 100 likes and all other tweets have
    ## 1, I want to keep all tweets in the thread.
    ##
    ## my_tl: timeline data frame with character `status_id` and
    ##     `reply_to_status_id` columns plus a `text` column.
    ## Returns my_tl with an extra `new_status_id` column: the numerically
    ##     smallest status ID in the tweet's thread (tweet IDs grow over
    ##     time, so this should be the initiating tweet), or the tweet's own
    ##     `status_id` when it is not part of a self-thread.

    ## Edge list of the reply graph: keep tweets that reply to, or are replied
    ## to by, another tweet in the timeline, skipping tweets whose text starts
    ## with "@". Thread starters have no parent, so they point at themselves.
    thread_df <- my_tl %>%
        filter(status_id %in% reply_to_status_id |
                   reply_to_status_id %in% status_id,
               substr(text, 1, 1) != "@") %>%
        select(from = status_id, to = reply_to_status_id) %>%
        mutate(to = ifelse(is.na(to), from, to))

    ## No self-threads at all: every tweet is its own "thread", and there is
    ## no graph to build.
    if (NROW(thread_df) == 0) {
        return(my_tl %>%
                   mutate(new_status_id = status_id))
    }

    ## Each connected component of the reply graph is one thread.
    thread_assignments <- thread_df %>%
        graph_from_data_frame(directed = TRUE) %>%
        components()

    membership_df <- tibble(
        status_id = names(thread_assignments$membership),
        membership = unname(thread_assignments$membership)
    )

    ## IDs are strings, so a plain min() would compare them lexicographically
    ## ("9..." sorts after "10..."). Ordering by length first and then by
    ## value picks the numerically smallest ID without converting to a
    ## (lossy) double.
    id_mapping <- thread_df %>%
        select(status_id = from) %>%
        left_join(membership_df, by = "status_id") %>%
        group_by(membership) %>%
        mutate(new_status_id = status_id[order(nchar(status_id),
                                               status_id,
                                               method = "radix")][1]) %>%
        ungroup() %>%
        select(-membership)

    ## Tweets outside any thread get no match in the join and keep their own
    ## ID.
    my_tl %>%
        left_join(id_mapping, by = "status_id") %>%
        mutate(new_status_id = ifelse(is.na(new_status_id),
                                      status_id,
                                      new_status_id))
}

subset_columns <- function(my_tl) {
    ## Trim the timeline (~90 columns) down to the ones the cleanup uses and
    ## add a `remove_this` flag column initialised to 0. Columns from the
    ## list that are absent from my_tl are skipped.
    wanted_cols <- c(
        "status_id", "new_status_id", "created_at", "screen_name", "text",
        "reply_to_status_id", "reply_to_user_id", "reply_to_screen_name",
        "is_quote", "is_retweet",
        "favorite_count", "retweet_count", "quote_count", "reply_count",
        "quoted_status_id", "quoted_text", "quoted_created_at",
        "retweet_status_id", "retweet_text", "retweet_created_at"
    )

    trimmed <- select(my_tl, any_of(wanted_cols))
    mutate(trimmed, remove_this = 0)
}

flag_old_lame_tweets <- function(my_tl,
                                 max_days,
                                 min_faves,
                                 remove_cols = TRUE) {
    ## Flag (remove_this = 1) every non-retweet that is at least `max_days`
    ## days old and has fewer than `min_faves` likes. Every other row keeps
    ## the flag it already had.
    ##
    ## remove_cols: when TRUE (the default), drop the helper `days_diff`
    ##     column before returning.
    flagged <- mutate(
        my_tl,
        days_diff = as.numeric(
            difftime(Sys.time(), created_at, units = "days")
        ),
        remove_this = case_when(
            !is_retweet &
                days_diff >= max_days &
                favorite_count < min_faves ~ 1,
            TRUE ~ remove_this
        )
    )

    if (!remove_cols) {
        return(flagged)
    }

    select(flagged, -days_diff)
}

flag_old_retweets <- function(my_tl,
                              max_days,
                              remove_cols = TRUE) {
    ## Flag (remove_this = 1) every retweet that is at least `max_days` days
    ## old, no matter how many faves it has. Every other row keeps the flag
    ## it already had; protected tweets are unflagged in a later step.
    ##
    ## remove_cols: when TRUE (the default), drop the helper `days_diff`
    ##     column before returning.
    flagged <- mutate(
        my_tl,
        days_diff = as.numeric(
            difftime(Sys.time(), created_at, units = "days")
        ),
        remove_this = case_when(
            is_retweet & days_diff >= max_days ~ 1,
            TRUE ~ remove_this
        )
    )

    if (!remove_cols) {
        return(flagged)
    }

    select(flagged, -days_diff)
}

unflag_protected_tweets <- function(my_tl, my_faves_vector) {
    ## Clear the deletion flag (remove_this = 0) on every tweet whose own
    ## status ID, or whose retweeted status ID, appears in `my_faves_vector`.
    ##
    ## my_faves_vector: vector of tweet IDs to protect; these can be
    ##     self-liked tweets or a list of protected tweets (or both).
    ## Returns my_tl with `remove_this` updated; rows that are not protected
    ##     keep their existing flag.

    ## Drop NA and empty IDs first. `NA %in% c(..., NA)` is TRUE, so a single
    ## NA in the list would otherwise unflag every row whose
    ## retweet_status_id is NA.
    protected_ids <- my_faves_vector[!is.na(my_faves_vector) &
                                         nzchar(my_faves_vector)]

    my_tl %>%
        mutate(
            remove_this = case_when(
                status_id %in% protected_ids ~ 0,
                retweet_status_id %in% protected_ids ~ 0,
                TRUE ~ remove_this
            )
        )
}

apply_flag_to_threads <- function(my_tl, remove_cols = TRUE) {
    ## Rows sharing a `new_status_id` form one thread. A thread is only
    ## deleted when every tweet in it is flagged: if the mean of
    ## `remove_this` within the thread is below 1 (i.e., at least one tweet
    ## was saved), the whole thread is saved.
    ##
    ## remove_cols: when TRUE (the default), drop the helper
    ##     `remove_this_weighted` column before returning.
    by_thread <- group_by(my_tl, new_status_id)

    scored <- mutate(
        by_thread,
        remove_this_weighted = mean(remove_this),
        remove_this = ifelse(remove_this_weighted < 1, 0, remove_this)
    )

    result <- ungroup(scored)

    if (!remove_cols) {
        return(result)
    }

    select(result, -remove_this_weighted)
}

## Get twitter data ----
## Request up to 3000 tweets from the timeline and up to 3000 likes.
orig_tl <- get_timeline(USER_NAME, n = 3000, token = app_tok)
my_faves <- get_favorites(USER_NAME, n = 3000, token = app_tok)

## Clean it up ----
## Order matters: flag old retweets and old low-fave tweets first, then
## unflag anything I liked or explicitly protected, and only then spread the
## "saved" status across each thread.
my_tl <- orig_tl %>%
    subset_columns() %>%
    reassign_thread_ids() %>%
    flag_old_retweets(MAX_DAYS_RETWEETS) %>%
    flag_old_lame_tweets(MAX_DAYS, MIN_FAVES) %>%
    flag_old_lame_tweets(MAX_DAYS_OLD, MIN_FAVES_OLD) %>%
    unflag_protected_tweets(c(my_faves$status_id, PROTECTED_TWEETS)) %>%
    apply_flag_to_threads()

## MAKE SURE YOU'RE OK WITH DELETING THESE!! ----
tweets_to_remove <- my_tl %>%
    filter(remove_this == 1)

View(tweets_to_remove)

## Loop through and delete
## Uncomment this at your own risk.
## seq_len() (rather than 1:NROW(...)) makes the loop a no-op when there is
## nothing to delete; 1:0 would run the body for i = 1 and i = 0.
# for (i in seq_len(NROW(tweets_to_remove))) {
#     print(sprintf("About to delete: %s.", tweets_to_remove$text[i]))
#     Sys.sleep(5)
#     post_tweet(token = app_tok,
#                destroy_id = tweets_to_remove$status_id[i])
# }

## secrets_example.R
## Secrets ----
## Template for secrets.R: replace each placeholder with your own values.
USER_NAME <- "mathewkiang"
APP_NAME <- "your_api_name"
API_KEY <- "1234"
API_SECRET <- "3214"
ACCESS_TOKEN <- "3421"
ACCESS_SECRET <- "2134"

## Tweet IDs that should never be deleted. Keep them quoted: IDs this long
## cannot be stored exactly as numbers.
PROTECTED_TWEETS <- c("931567726048444416", "1229111568295546885")
	## A quick R script for cleaning up your Twitter timeline. This assumes you
	## have Twitter API creds but it probably isn't necessary. This is based on
	## Chris Albon's Python script:
	## https://gist.github.com/chrisalbon/b9bd4a6309c9f5f5eeab41377f27a670
	##
	## In this example, I remove old tweets in two stages: (1) if tweets are
	## older than two years and have fewer than 100 likes and then (2) if tweets
	## are older than 90 days and have fewer than 25. Retweets are removed after
	## 90 days. Importantly, tweets (and retweets) that I liked are always
	## protected from deletion.
	##
	## USE THIS AT YOUR OWN RISK. I'm not a heavy Twitter user so I am nowhere
	## near the API limit (~3000 tweets and likes). I have no idea how this will
	## behave for heavy users.

	## Imports ----
	library(rtweet)
	library(igraph)
	library(tidyverse)

	## Constants ----
	## Stage 1 (recent tweets): a tweet that is MAX_DAYS days old or older is
	## deleted unless it has at least MIN_FAVES likes or is in the protected list.
	## I.e., once a tweet is three months old it needs at least MIN_FAVES faves.
	MAX_DAYS <- 90
	MIN_FAVES <- 25 # >1% of followers seems reasonable

	## Stage 2 (old tweets): a tweet that is MAX_DAYS_OLD days old or older is
	## deleted unless it has at least MIN_FAVES_OLD likes or is in the protected
	## list. I.e., once a tweet is two years old it needs 100+ likes.
	MAX_DAYS_OLD <- 730
	MIN_FAVES_OLD <- 100

	## Retweets: deleted once they are MAX_DAYS_RETWEETS days old, regardless of
	## likes, unless they are in the protected list.
	MAX_DAYS_RETWEETS <- 90

	## Load API stuff ----
	## Fill in the API information below, or put it in a secrets.R file in the
	## working directory (see secrets_example.R). secrets.R may also define
	## PROTECTED_TWEETS, a character vector of tweet IDs that should never be
	## deleted.
	if (file.exists("secrets.R")) {
	    source("secrets.R")
	} else {
	    USER_NAME <- ""
	    APP_NAME <- ""
	    API_KEY <- ""
	    API_SECRET <- ""
	    ACCESS_TOKEN <- ""
	    ACCESS_SECRET <- ""
	    PROTECTED_TWEETS <- character(0)
	}

	## PROTECTED_TWEETS is optional. If secrets.R did not define it, fall back to
	## an empty vector so the cleanup step does not stop with "object not found".
	if (!exists("PROTECTED_TWEETS")) {
	    PROTECTED_TWEETS <- character(0)
	}

	## Build the token that every rtweet call below passes as `token`.
	app_tok <- create_token(
	    app = APP_NAME,
	    consumer_key = API_KEY,
	    consumer_secret = API_SECRET,
	    access_token = ACCESS_TOKEN,
	    access_secret = ACCESS_SECRET
	)

	## Helper functions ----
	reassign_thread_ids <- function(my_tl) {
	    ## Give every tweet in a (self-)thread a shared `new_status_id` so later
	    ## steps can apply the keep/delete rules to the thread as a whole. E.g.,
	    ## if the first tweet of a thread has 100 likes and all other tweets have
	    ## 1, I want to keep all tweets in the thread.
	    ##
	    ## my_tl: timeline data frame with character `status_id` and
	    ##     `reply_to_status_id` columns plus a `text` column.
	    ## Returns my_tl with an extra `new_status_id` column: the numerically
	    ##     smallest status ID in the tweet's thread (tweet IDs grow over
	    ##     time, so this should be the initiating tweet), or the tweet's own
	    ##     `status_id` when it is not part of a self-thread.

	    ## Edge list of the reply graph: keep tweets that reply to, or are replied
	    ## to by, another tweet in the timeline, skipping tweets whose text starts
	    ## with "@". Thread starters have no parent, so they point at themselves.
	    thread_df <- my_tl %>%
	        filter(status_id %in% reply_to_status_id |
	                   reply_to_status_id %in% status_id,
	               substr(text, 1, 1) != "@") %>%
	        select(from = status_id, to = reply_to_status_id) %>%
	        mutate(to = ifelse(is.na(to), from, to))

	    ## No self-threads at all: every tweet is its own "thread", and there is
	    ## no graph to build.
	    if (NROW(thread_df) == 0) {
	        return(my_tl %>%
	                   mutate(new_status_id = status_id))
	    }

	    ## Each connected component of the reply graph is one thread.
	    thread_assignments <- thread_df %>%
	        graph_from_data_frame(directed = TRUE) %>%
	        components()

	    membership_df <- tibble(
	        status_id = names(thread_assignments$membership),
	        membership = unname(thread_assignments$membership)
	    )

	    ## IDs are strings, so a plain min() would compare them lexicographically
	    ## ("9..." sorts after "10..."). Ordering by length first and then by
	    ## value picks the numerically smallest ID without converting to a
	    ## (lossy) double.
	    id_mapping <- thread_df %>%
	        select(status_id = from) %>%
	        left_join(membership_df, by = "status_id") %>%
	        group_by(membership) %>%
	        mutate(new_status_id = status_id[order(nchar(status_id),
	                                               status_id,
	                                               method = "radix")][1]) %>%
	        ungroup() %>%
	        select(-membership)

	    ## Tweets outside any thread get no match in the join and keep their own
	    ## ID.
	    my_tl %>%
	        left_join(id_mapping, by = "status_id") %>%
	        mutate(new_status_id = ifelse(is.na(new_status_id),
	                                      status_id,
	                                      new_status_id))
	}

	subset_columns <- function(my_tl) {
	    ## Trim the timeline (~90 columns) down to the ones the cleanup uses and
	    ## add a `remove_this` flag column initialised to 0. Columns from the
	    ## list that are absent from my_tl are skipped.
	    wanted_cols <- c(
	        "status_id", "new_status_id", "created_at", "screen_name", "text",
	        "reply_to_status_id", "reply_to_user_id", "reply_to_screen_name",
	        "is_quote", "is_retweet",
	        "favorite_count", "retweet_count", "quote_count", "reply_count",
	        "quoted_status_id", "quoted_text", "quoted_created_at",
	        "retweet_status_id", "retweet_text", "retweet_created_at"
	    )

	    trimmed <- select(my_tl, any_of(wanted_cols))
	    mutate(trimmed, remove_this = 0)
	}

	flag_old_lame_tweets <- function(my_tl,
	                                 max_days,
	                                 min_faves,
	                                 remove_cols = TRUE) {
	    ## Flag (remove_this = 1) every non-retweet that is at least `max_days`
	    ## days old and has fewer than `min_faves` likes. Every other row keeps
	    ## the flag it already had.
	    ##
	    ## remove_cols: when TRUE (the default), drop the helper `days_diff`
	    ##     column before returning.
	    flagged <- mutate(
	        my_tl,
	        days_diff = as.numeric(
	            difftime(Sys.time(), created_at, units = "days")
	        ),
	        remove_this = case_when(
	            !is_retweet &
	                days_diff >= max_days &
	                favorite_count < min_faves ~ 1,
	            TRUE ~ remove_this
	        )
	    )

	    if (!remove_cols) {
	        return(flagged)
	    }

	    select(flagged, -days_diff)
	}

	flag_old_retweets <- function(my_tl,
	                              max_days,
	                              remove_cols = TRUE) {
	    ## Flag (remove_this = 1) every retweet that is at least `max_days` days
	    ## old, no matter how many faves it has. Every other row keeps the flag
	    ## it already had; protected tweets are unflagged in a later step.
	    ##
	    ## remove_cols: when TRUE (the default), drop the helper `days_diff`
	    ##     column before returning.
	    flagged <- mutate(
	        my_tl,
	        days_diff = as.numeric(
	            difftime(Sys.time(), created_at, units = "days")
	        ),
	        remove_this = case_when(
	            is_retweet & days_diff >= max_days ~ 1,
	            TRUE ~ remove_this
	        )
	    )

	    if (!remove_cols) {
	        return(flagged)
	    }

	    select(flagged, -days_diff)
	}

	unflag_protected_tweets <- function(my_tl, my_faves_vector) {
	    ## Clear the deletion flag (remove_this = 0) on every tweet whose own
	    ## status ID, or whose retweeted status ID, appears in `my_faves_vector`.
	    ##
	    ## my_faves_vector: vector of tweet IDs to protect; these can be
	    ##     self-liked tweets or a list of protected tweets (or both).
	    ## Returns my_tl with `remove_this` updated; rows that are not protected
	    ##     keep their existing flag.

	    ## Drop NA and empty IDs first. `NA %in% c(..., NA)` is TRUE, so a single
	    ## NA in the list would otherwise unflag every row whose
	    ## retweet_status_id is NA.
	    protected_ids <- my_faves_vector[!is.na(my_faves_vector) &
	                                         nzchar(my_faves_vector)]

	    my_tl %>%
	        mutate(
	            remove_this = case_when(
	                status_id %in% protected_ids ~ 0,
	                retweet_status_id %in% protected_ids ~ 0,
	                TRUE ~ remove_this
	            )
	        )
	}

	apply_flag_to_threads <- function(my_tl, remove_cols = TRUE) {
	    ## Rows sharing a `new_status_id` form one thread. A thread is only
	    ## deleted when every tweet in it is flagged: if the mean of
	    ## `remove_this` within the thread is below 1 (i.e., at least one tweet
	    ## was saved), the whole thread is saved.
	    ##
	    ## remove_cols: when TRUE (the default), drop the helper
	    ##     `remove_this_weighted` column before returning.
	    by_thread <- group_by(my_tl, new_status_id)

	    scored <- mutate(
	        by_thread,
	        remove_this_weighted = mean(remove_this),
	        remove_this = ifelse(remove_this_weighted < 1, 0, remove_this)
	    )

	    result <- ungroup(scored)

	    if (!remove_cols) {
	        return(result)
	    }

	    select(result, -remove_this_weighted)
	}

	## Get twitter data ----
	## Request up to 3000 tweets from the timeline and up to 3000 likes.
	orig_tl <- get_timeline(USER_NAME, n = 3000, token = app_tok)
	my_faves <- get_favorites(USER_NAME, n = 3000, token = app_tok)

	## Clean it up ----
	## Order matters: flag old retweets and old low-fave tweets first, then
	## unflag anything I liked or explicitly protected, and only then spread the
	## "saved" status across each thread.
	my_tl <- orig_tl %>%
	    subset_columns() %>%
	    reassign_thread_ids() %>%
	    flag_old_retweets(MAX_DAYS_RETWEETS) %>%
	    flag_old_lame_tweets(MAX_DAYS, MIN_FAVES) %>%
	    flag_old_lame_tweets(MAX_DAYS_OLD, MIN_FAVES_OLD) %>%
	    unflag_protected_tweets(c(my_faves$status_id, PROTECTED_TWEETS)) %>%
	    apply_flag_to_threads()

	## MAKE SURE YOU'RE OK WITH DELETING THESE!! ----
	tweets_to_remove <- my_tl %>%
	    filter(remove_this == 1)

	View(tweets_to_remove)

	## Loop through and delete
	## Uncomment this at your own risk.
	## seq_len() (rather than 1:NROW(...)) makes the loop a no-op when there is
	## nothing to delete; 1:0 would run the body for i = 1 and i = 0.
	# for (i in seq_len(NROW(tweets_to_remove))) {
	#     print(sprintf("About to delete: %s.", tweets_to_remove$text[i]))
	#     Sys.sleep(5)
	#     post_tweet(token = app_tok,
	#                destroy_id = tweets_to_remove$status_id[i])
	# }
	## Secrets ----
	## Template for secrets.R: replace each placeholder with your own values.
	USER_NAME <- "mathewkiang"
	APP_NAME <- "your_api_name"
	API_KEY <- "1234"
	API_SECRET <- "3214"
	ACCESS_TOKEN <- "3421"
	ACCESS_SECRET <- "2134"

	## Tweet IDs that should never be deleted. Keep them quoted: IDs this long
	## cannot be stored exactly as numbers.
	PROTECTED_TWEETS <- c("931567726048444416", "1229111568295546885")