Last active
May 12, 2022 23:49
-
-
Save AlbertRapp/37a2e0993acea9b4e36400037b797391 to your computer and use it in GitHub Desktop.
Get Dataviz from Twitter into your note-taking system using R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Change these parts here ----------------------------------------------------
# Set to your working directory where there is a template.md file.
# The helper functions below download images and write the markdown note
# relative to this directory (they build paths from getwd()).
setwd(here::here())

# Locations
# Each stop() below is a placeholder (see the section title): replace the whole
# stop(...) call with your own value, otherwise the script halts right here.
vault_location <- stop('Add Vault Location') # Location of markdown files
attachments_dir <- stop('Add Vault Subdirectory Location') # subdirectory of vault_location for png-files
imap_mail <- stop('Set imap mail client') # passed as `url` to configure_imap()

### Sensitive Information
# Secrets are read from the system keyring instead of being hard-coded.
# Remove this stop() once the four keyring entries below are set up.
stop('Set sensitive mail information')
bearer_token <- keyring::key_get('twitter-bearer-token')
user_mail <- keyring::key_get('dataviz-mail')
password_mail <- keyring::key_get('dataviz-mail-password')
allowed_senders <- keyring::key_get('allowed-senders')
### Dependencies ----------------------------------------------------------- | |
library(purrr) # for pluck and map functions | |
library(stringr) # for regex matching | |
library(readr) # for reading and writing files from/to disk | |
library(tibble) # for easier readable tribble creation | |
library(tidyr) # for unnesting | |
library(dplyr) # for binding rows and pipe | |
library(httr) # for API communication | |
library(mRpostman) # for email communication | |
library(base64enc) # for decoding mails from Android | |
library(rvest) # for decoding mails from Android | |
### Helper functions ----------------------------------------------------------- | |
# Request a tweet from the Twitter API v2 and download its photos.
#
# Arguments:
#   tweet_url     Single URL of a tweet; must contain "status/<numeric id>".
#   bearer_token  Twitter API bearer token (without the "Bearer " prefix).
#
# Returns a list with elements `url`, `text`, `user`, `file_names` (names of
# the downloaded png files) and `file_paths` (the same files prefixed with the
# current working directory).
#
# Side effect: every photo of the tweet is downloaded into the current working
# directory as tweet_<user>_<date>_<time>_<index>.png.
#
# Stops with an error if the URL contains no tweet id, the API request fails,
# the tweet is not returned by the API, or the tweet has no photo attached.
request_twitter_data <- function(tweet_url, bearer_token) {
  # Extract tweet id by regex
  tweet_id <- str_match(tweet_url, "status/([0-9]+)")[, 2]
  if (length(tweet_id) != 1L || is.na(tweet_id)) {
    stop("Could not extract a tweet id from URL: ", tweet_url, call. = FALSE)
  }

  auth <- paste("Bearer", bearer_token) # API needs format "Bearer <my_token>"
  API_url <- 'https://api.twitter.com/2/tweets'

  # Make request to API, fail loudly on HTTP errors, then parse to list
  response <- GET(
    API_url,
    add_headers(Authorization = auth),
    query = list(
      ids = tweet_id,
      tweet.fields = 'created_at', # time stamp
      expansions = 'attachments.media_keys,author_id',
      # necessary expansion fields for img_url
      media.fields = 'url' # img_url
    )
  )
  stop_for_status(response, task = "request tweet data from the Twitter API")
  parsed_request <- content(response, as = 'parsed')

  # Without a `data` entry none of the fields below can be extracted
  if (is.null(pluck(parsed_request, "data", 1))) {
    stop("The Twitter API returned no data for tweet ", tweet_id, call. = FALSE)
  }

  # Extract necessary information from list-like structure
  tweet_text <- pluck(parsed_request, "data", 1, 'text')
  tweet_user <- pluck(parsed_request, "includes", 'users', 1, 'username')

  # Make file name unique through time-date combination.
  # An explicit format avoids white spaces and colons (:) in file names and
  # keeps the time part even when the tweet was created exactly at midnight.
  tweet_date <- parsed_request %>%
    pluck("data", 1, 'created_at') %>%
    lubridate::as_datetime() %>%
    format('%Y-%m-%d_%H%M%S')

  # Keep only photo URLs; the media table (or its columns) may be missing
  # when the tweet has no attachments or only non-photo attachments.
  media <- parsed_request %>%
    pluck("includes", 'media') %>%
    bind_rows()
  if (all(c('type', 'url') %in% names(media))) {
    is_photo <- !is.na(media$type) & media$type == 'photo' & !is.na(media$url)
    img_urls <- media$url[is_photo]
  } else {
    img_urls <- character(0)
  }
  if (length(img_urls) == 0) {
    stop("Tweet ", tweet_id, " does not contain any photos", call. = FALSE)
  }

  # Download image - set mode otherwise download is blurry
  img_names <- paste('tweet', tweet_user, tweet_date, seq_along(img_urls), sep = "_")
  file_names <- paste0(img_names, '.png')
  walk2(img_urls, file_names, ~download.file(.x, .y, mode = 'wb'))

  # Return list with information
  list(
    url = tweet_url,
    text = tweet_text,
    user = tweet_user,
    file_names = file_names,
    file_paths = file.path(getwd(), file_names)
  )
}
# Helper function for when a tweet contains multiple images.
# Builds one embed link `![[<file name>]]` per file name and joins them with
# newlines into a single string.
# Returns an empty string for empty input; paste0() alone would produce a
# dangling `![[]]` embed that points to no file.
md_import_strings <- function(file_names) {
  if (length(file_names) == 0L) {
    return('')
  }
  paste0('![[', file_names, ']]', collapse = '\n')
}
# Replace the placeholders in the template
# We change original mail place holder later on
#
# Arguments:
#   template_name  Path of the markdown template file.
#   request        List with elements `file_names`, `text` and `url`
#                  (as returned by request_twitter_data()).
#
# Returns the filled-in template as a single string (lines joined by "\n").
replace_template_placeholder <- function(template_name, request) {
  # Placeholders as they literally appear in the template ...
  placeholders <- c(
    '![[insert_img_name_here]]',
    'insert_text_here',
    'insert_URL_here'
  )
  # ... and what to put in their place. '#' is wrapped in parentheses in the
  # tweet text before insertion.
  replacements <- c(
    md_import_strings(request$file_names),
    str_replace_all(request$text, '#', '(#)'),
    request$url
  )

  # fixed() makes both the placeholder and the replacement literal, so
  # backslashes in the tweet text are inserted verbatim instead of being
  # interpreted as regex back-references.
  replace_literal <- function(lines, placeholder, replacement) {
    str_replace_all(lines, fixed(placeholder), replacement)
  }

  # Iteratively apply the replacement and keep only final result
  reduce2(
    .x = placeholders,
    .y = replacements,
    .f = replace_literal,
    .init = read_lines(template_name)
  ) %>%
    paste0(collapse = '\n') # Collapse lines into a single string
}
# Helper function to write new text file to current working directory.
# The markdown file is named after the first image of the request, with the
# trailing "_1.png" swapped for ".md".
# Returns file path of new file which is necessary to move files later on.
write_replaced_text <- function(replaced_text, request) {
  # Use the full element name (`$file_name` only worked through partial
  # matching) and anchor the pattern so only the real suffix is replaced.
  file_name <- str_replace(request$file_names[1], '_1\\.png$', '.md')
  write_lines(replaced_text, file_name)
  file.path(getwd(), file_name)
}
# Move the downloaded images and the generated markdown note into the vault.
#
# Arguments:
#   request            List with element `file_paths` (paths of the images).
#   replaced_template  Path of the generated markdown file.
#   vault_location     Destination directory for the markdown file.
#   attachments_dir    Subdirectory of `vault_location` for the images.
#
# Files are copied first (overwriting existing files) and a source file is
# only deleted after its copy succeeded; failed copies raise a warning and
# leave the source untouched.
# Invisibly returns the source paths.
move_files <- function(request, replaced_template, vault_location, attachments_dir) {
  image_paths <- as.character(request$file_paths)
  template_path <- as.character(replaced_template)

  # Create from-to vectors; spelling out the full destination file path
  # prevents file.copy() from creating a *file* named like a missing directory.
  from <- c(image_paths, template_path)
  to <- c(
    file.path(vault_location, attachments_dir, basename(image_paths)),
    file.path(vault_location, basename(template_path))
  )

  # Copy files from current working directory to destination
  copied <- file.copy(from, to, overwrite = TRUE)
  if (!all(copied)) {
    warning(
      "Could not copy (source files are kept): ",
      paste(from[!copied], collapse = ", "),
      call. = FALSE
    )
  }

  # Delete only those files in current working directory that were copied
  file.remove(from[copied])
  invisible(from)
}
# Decode the base64 encoded HTML part of raw mail bodies into plain text.
#
# Arguments:
#   encoded_mails  List or character vector of raw mail bodies.
#
# Returns a list with one decoded text per mail (an empty list for empty
# input). Stops if the expected markers cannot be located in a mail.
decode_encoded_mails <- function(encoded_mails) {
  # Resource: https://stackoverflow.com/questions/71772972/translate-encoding-of-android-mail-in-r
  if (length(encoded_mails) == 0) {
    return(list())
  }
  raw_mails <- unlist(encoded_mails)

  # Find location in each encoded string where actual text starts.
  # str_locate_all() yields a start/end matrix per mail; pluck(., 4) takes its
  # 4th element in column-major order, i.e. the end of the second
  # 'base64\r\n\r\n' marker when there are exactly two matches.
  # NOTE(review): presumably written for mails with two base64 parts - confirm.
  start_encoded <- raw_mails %>%
    str_locate_all('base64\r\n\r\n') %>%
    map_dbl(~pluck(., 4, .default = NA_integer_) + 1)

  # Find location in each encoded string where actual text ends.
  # pluck(., 3) is again positional: the start of the third '----' match when
  # there are three or more matches, the end of the first one with exactly two.
  # NOTE(review): which boundary is hit depends on the mail layout - verify.
  end_encoded <- raw_mails %>%
    str_locate_all('----') %>%
    map_dbl(~pluck(., 3, .default = NA_integer_) - 1)

  # Fail explicitly instead of silently dropping or recycling positions
  if (anyNA(start_encoded) || anyNA(end_encoded)) {
    stop(
      "Could not locate the base64 encoded part in at least one mail",
      call. = FALSE
    )
  }

  # Use str_sub() to extract encoded text, then
  # decode: base64 -> raw -> char -> html -> text
  str_sub(raw_mails, start_encoded, end_encoded) %>%
    map(base64enc::base64decode) %>%
    map(rawToChar) %>%
    map(rvest::read_html) %>%
    map(rvest::html_text2)
}
# Decode those mail bodies that carry a 'Content-Transfer-Encoding' header and
# leave all other bodies unchanged.
#
# Arguments:
#   mail_bodys  List or character vector of raw mail bodies.
#
# Returns a list with one text per mail, in the SAME order as the input, so
# that positions keep matching the message ids the bodies were fetched for.
decode_all_mails <- function(mail_bodys) {
  # Decode in case mail is base64 encoded
  is_encoded <- vapply(
    mail_bodys,
    function(body) any(grepl('Content-Transfer-Encoding', body, fixed = TRUE)),
    logical(1),
    USE.NAMES = FALSE
  )

  # Replace encoded mails in place instead of concatenating decoded and plain
  # mails, which would change their order.
  decoded_mails <- as.list(mail_bodys)
  if (any(is_encoded)) {
    decoded_mails[is_encoded] <- decode_encoded_mails(mail_bodys[is_encoded])
  }
  decoded_mails
}
### Actual script -----------------------------------------------------------
# Establish connection to imap server
con <- configure_imap(
  url = imap_mail,
  user = user_mail,
  password = password_mail
)

# Switch to Inbox
con$select_folder('Inbox')

# Extract mails that are from the list of allowed senders
mails <- allowed_senders %>%
  map(~con$search_string(expr = ., where = 'FROM')) %>%
  unlist() %>%
  na.omit() %>% # Remove NAs if no mail from a sender
  as.numeric() # avoids attributes

# If a new mail is detected, begin processing
if (!is_empty(mails)) {
  # Grab mail texts and URLs
  mail_bodys <- mails %>% con$fetch_text() %>% decode_all_mails()
  urls <- mail_bodys %>% str_extract('https.*')

  # Remove mails from vector in case sth goes wrong and urls cannot be detected
  has_url <- !is.na(urls)
  mail_bodys <- mail_bodys[has_url]
  mails <- mails[has_url]
  urls <- urls[has_url]

  # Nothing left to do (and nothing to move) if no mail contained a URL
  if (!is_empty(urls)) {
    # For each url request twitter data
    requests <- map(urls, request_twitter_data, bearer_token = bearer_token)

    # Use requested twitter data to insert texts into Markdown template and
    # write to current working directory.
    # fixed() inserts the mail body literally; as a regex replacement any
    # backslash in the mail would be treated as a back-reference.
    replaced_templates <-
      map(requests, replace_template_placeholder, template_name = 'template.md') %>%
      map2(mail_bodys, ~str_replace(.x, fixed('insert_mail_here'), .y)) %>%
      map2(requests, ~write_replaced_text(.x, .y))

    # Move markdown files and extracted pngs to correct place on HDD
    walk2(
      requests,
      replaced_templates,
      move_files,
      vault_location = vault_location,
      attachments_dir = attachments_dir
    )

    # Move emails on imap server to Processed directory
    con$move_msg(mails, to_folder = 'Processed')
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment