AlbertRapp/twitter_tracking.R

## twitter_tracking.R

### Change these parts here ----------------------------------------------------

# User configuration. Every `stop()` call in this section is a placeholder that
# raises an error when the file is sourced; replace each `stop(...)` with the
# real value (and delete the bare `stop()` guard) before running the script.

# Set to your working directory where there is a template.md file
# (the helper functions below also write their temporary files to getwd())
setwd(here::here())
# Locations
vault_location <- stop('Add Vault Location') # Location of markdown files
attachments_dir <- stop('Add Vault Subdirectory Location') # subdirectory of vault_location for png-files
imap_mail <- stop('Set imap mail client') # mail client, passed as `url` to configure_imap()


### Sensitive Information
# Guard: remove this line once the keyring entries read below have been created
stop('Set sensitive mail information')
# Secrets are looked up by entry name via keyring::key_get()
bearer_token <- keyring::key_get('twitter-bearer-token')
user_mail <- keyring::key_get('dataviz-mail')
password_mail <- keyring::key_get('dataviz-mail-password')
# Sender string(s) searched for in the FROM field of the inbox further below
allowed_senders <- keyring::key_get('allowed-senders')

### Dependencies -----------------------------------------------------------

library(purrr) # for pluck and map functions
library(stringr) # for regex matching
library(readr) # for reading and writing files from/to disk
library(tibble) # for easier readable tribble creation
library(tidyr) # for unnesting
library(dplyr) # for binding rows and pipe
library(httr) # for API communication
library(mRpostman) # for email communication
library(base64enc) # for decoding mails from Android
library(rvest) # for decoding mails from Android

### Helper functions -----------------------------------------------------------
# Look up one tweet via the Twitter API v2 and download its photos.
#
# Arguments:
#   tweet_url    - URL of the tweet; must contain "status/<numeric id>".
#   bearer_token - API token without the "Bearer " prefix.
#
# Side effect: each photo of the tweet is saved in the current working
# directory as "tweet_<user>_<timestamp>_<index>.png".
#
# Returns a list with the elements url, text, user, file_names and file_paths.
# Stops with a descriptive error when the URL contains no tweet id, the HTTP
# request fails, the API returns no tweet, or the tweet has no photo.
request_twitter_data <- function(tweet_url, bearer_token) {
  # Extract tweet id by regex
  tweet_id <- str_match(tweet_url, "status/([0-9]+)")[, 2]
  if (length(tweet_id) != 1 || is.na(tweet_id)) {
    stop("No tweet id found in URL: ", tweet_url, call. = FALSE)
  }
  auth <- paste("Bearer", bearer_token) # API needs format "Bearer <my_token>"
  API_url <- 'https://api.twitter.com/2/tweets'

  # Make request to API
  response <- GET(
    API_url,
    add_headers(Authorization = auth),
    query = list(
      ids = tweet_id,
      tweet.fields = 'created_at', # time stamp
      expansions = 'attachments.media_keys,author_id',
      # necessary expansion fields for img_url
      media.fields = 'url' # img_url
    )
  )
  # Fail loudly on HTTP errors (bad token, rate limit, ...) instead of trying
  # to pick tweet fields out of an error body
  stop_for_status(response, task = paste("look up tweet", tweet_id))
  parsed_request <- content(response, 'parsed')

  # Extract necessary information from list-like structure
  tweet <- pluck(parsed_request, "data", 1)
  if (is.null(tweet)) {
    stop("Twitter API returned no data for tweet ", tweet_id, call. = FALSE)
  }
  tweet_text <- pluck(tweet, 'text')
  tweet_user <- pluck(parsed_request, "includes", 'users', 1, 'username')

  # Make file name unique through time-date combination.
  # An explicit format keeps the time part even for tweets posted exactly at
  # midnight and avoids white spaces and colons (:) in file names.
  tweet_date <- tweet %>%
    pluck('created_at') %>%
    lubridate::as_datetime() %>%
    format('%Y-%m-%d_%H%M%S')

  # Tweets without attachments have no "media" entry, so the bound table may
  # lack the columns used below; guard before filtering.
  media <- parsed_request %>%
    pluck("includes", 'media') %>%
    bind_rows()
  img_urls <- character(0)
  if (all(c('type', 'url') %in% names(media))) {
    img_urls <- media %>%
      filter(type == 'photo') %>%
      pull(url)
  }
  if (length(img_urls) == 0) {
    stop("Tweet ", tweet_id, " has no photo attachments", call. = FALSE)
  }

  # Download images - binary mode, otherwise the image files get corrupted
  img_names <- paste('tweet', tweet_user, tweet_date, seq_along(img_urls), sep = "_")
  file_names <- paste0(img_names, '.png')
  walk2(img_urls, file_names, ~download.file(.x, .y, mode = 'wb'))

  # Return list with information
  list(
    url = tweet_url,
    text = tweet_text,
    user = tweet_user,
    file_names = file_names,
    file_paths = paste0(getwd(), '/', file_names)
  )
}


# Build the Markdown embed lines for the images of one tweet.
# Each file name is wrapped as ![[file_name]]; the embeds are joined with line
# breaks so that a tweet with several images yields one embed per line.
md_import_strings <- function(file_names) {
  embeds <- paste('![[', file_names, ']]', sep = '')
  paste(embeds, collapse = '\n')
}

# Fill the placeholders of the Markdown template with the data of one tweet.
# The original-mail placeholder is not handled here; it is replaced later on.
#
# Arguments:
#   template_name - path of the template file.
#   request       - list as returned by request_twitter_data(); the elements
#                   file_names, text and url are used.
#
# Returns the filled template as a single string (lines joined by "\n").
replace_template_placeholder <- function(template_name, request) {
  # Create a dictionary for what to replace in template
  replace_dict <- tribble(
    ~template, ~replacement,
    '![[insert_img_name_here]]', md_import_strings(request$file_names),
    'insert_text_here', request$text %>% str_replace_all('#', '(#)'),
    'insert_URL_here', request$url
  )

  # Placeholders are matched as fixed strings so that the replacement text is
  # inserted literally. With a regex pattern, backslashes in the tweet text
  # would be treated as escapes / back-references and get lost or fail.
  replace_literal <- function(lines, placeholder, replacement) {
    str_replace_all(lines, fixed(placeholder), replacement)
  }

  # Iteratively apply the replacements and keep only final result
  reduce2(
    .x = replace_dict$template,
    .y = replace_dict$replacement,
    .f = replace_literal,
    .init = read_lines(template_name)
  ) %>%
    paste0(collapse = '\n') # Collapse lines into a single string
}

# Write the filled template as a Markdown file to the current working directory.
#
# Arguments:
#   replaced_text - text to write.
#   request       - list as returned by request_twitter_data(); the name of its
#                   first image ("..._1.png") determines the Markdown file name.
#
# Returns the file path of the new file, which is needed to move it later on.
write_replaced_text <- function(replaced_text, request) {
  # Spell out `file_names` (no partial `$` matching) and anchor the pattern at
  # the end with an escaped dot, so only the "_1.png" suffix is replaced and
  # never a look-alike inside the user name.
  file_name <- request$file_names[1] %>% str_replace('_1\\.png$', '.md')
  write_lines(replaced_text, file_name)
  paste0(getwd(), '/', file_name)
}


# Move the images and the Markdown note of one tweet into the vault.
#
# Arguments:
#   request           - list as returned by request_twitter_data(); its
#                       file_paths are moved to the attachments directory.
#   replaced_template - path of the Markdown file; moved to vault_location.
#   vault_location    - existing destination directory for Markdown files.
#   attachments_dir   - existing subdirectory of vault_location for images.
#
# Stops if a destination directory does not exist. A source file is deleted
# only after it has been copied successfully; files that could not be copied
# stay in place and are reported with a warning.
# Returns the source paths invisibly.
move_files <- function(request, replaced_template, vault_location, attachments_dir) {
  # Create from-to vectors with one entry per file
  image_paths <- as.character(request$file_paths)
  template_paths <- as.character(replaced_template)
  from <- c(image_paths, template_paths)
  to <- c(
    rep(paste0(vault_location, '/', attachments_dir), length(image_paths)),
    rep(vault_location, length(template_paths))
  )

  # file.copy() treats a non-existing `to` as a file name and would write a
  # plain file with the directory's name, so check the destinations first
  missing_dirs <- unique(to[!dir.exists(to)])
  if (length(missing_dirs) > 0) {
    stop(
      "Destination directory does not exist: ",
      paste(missing_dirs, collapse = ", "),
      call. = FALSE
    )
  }

  # Copy files one by one from current working directory to destination
  copied <- vapply(
    seq_along(from),
    function(i) file.copy(from[[i]], to[[i]], overwrite = TRUE),
    logical(1)
  )
  if (!all(copied)) {
    warning(
      "Could not copy (left in place): ",
      paste(from[!copied], collapse = ", "),
      call. = FALSE
    )
  }

  # Delete only the files that were copied successfully
  file.remove(from[copied])
  invisible(from)
}

# Decode base64-encoded mail bodies (mails sent from Android) into plain text.
#
# Arguments:
#   encoded_mails - raw mail texts; each one is expected to contain
#                   'base64\r\n\r\n' headers and '----' boundary markers.
#
# Returns a list with one decoded text per mail.
decode_encoded_mails <- function(encoded_mails) {
  # Resource: https://stackoverflow.com/questions/71772972/translate-encoding-of-android-mail-in-r
  # Find location in each encoded string where actual text starts
  # NOTE(review): pluck(., 4) takes the 4th cell of each match matrix. Assuming
  # a start/end matrix stored column-wise with exactly two matches, that is the
  # end of the second 'base64\r\n\r\n' header - confirm against real mails.
  start_encoded <- encoded_mails %>%
    str_locate_all('base64\r\n\r\n') %>%
    map(~pluck(., 4) + 1) %>%
    unlist()

  # Find location in each encoded string where actual text ends
  # NOTE(review): pluck(., 3) takes the 3rd cell of each match matrix; with at
  # least three '----' matches this would be the start of the third one -
  # confirm. Mails with fewer matches yield no position at all, which makes the
  # tibble() call below fail on differing lengths.
  end_encoded <- encoded_mails %>%
    str_locate_all('----') %>%
    map(~pluck(., 3) - 1)%>%
    unlist()

  # Use str_sub() to extract encoded text
  # (column names match the string/start/end arguments of str_sub())
  encoded_text <- tibble(
    string = unlist(encoded_mails),
    start = start_encoded,
    end = end_encoded
  ) %>%
    pmap(str_sub)

  # Decode: base64 -> raw -> char -> html -> text
  encoded_text %>%
    map(base64enc::base64decode) %>%
    map(rawToChar) %>%
    map(rvest::read_html) %>%
    map(rvest::html_text2)
}

# Decode all mail bodies, keeping them in their input order.
#
# Arguments:
#   mail_bodys - list (or character vector) of raw mail texts.
#
# Returns a list of the same length and order as `mail_bodys`: bodies that
# contain a 'Content-Transfer-Encoding' header are replaced by their decoded
# text, all other bodies are passed through unchanged.
decode_all_mails <- function(mail_bodys) {
  decoded <- as.list(mail_bodys)

  # Decode in case mail is base64 encoded; bodies that cannot be checked (NA)
  # are treated as plain text
  is_encoded <- str_detect(mail_bodys, 'Content-Transfer-Encoding')
  is_encoded <- !is.na(is_encoded) & is_encoded

  # Write decoded texts back to their original positions. Concatenating
  # decoded and plain mails would put the encoded mails first and break the
  # correspondence between the bodies and the caller's message ids.
  if (any(is_encoded)) {
    decoded[is_encoded] <- decode_encoded_mails(mail_bodys[is_encoded])
  }
  decoded
}

### Actual script -----------------------------------------------------------

# Establish connection to imap server
con <- configure_imap(
  url = imap_mail,
  user = user_mail,
  password = password_mail
)

# Switch to Inbox
con$select_folder('Inbox')

# Extract mails that are from the list of allowed senders
mails <- allowed_senders %>%
  map(~con$search_string(expr = ., where = 'FROM')) %>%
  unlist() %>%
  na.omit() %>% # Remove NAs if no mail from a sender
  as.numeric() # avoids attributes

# If a new mail is detected, begin processing
if (!is_empty(mails)) {
  # Grab mail texts and URLs
  mail_bodys <- mails %>% con$fetch_text() %>% decode_all_mails()
  urls <- mail_bodys %>% str_extract('https.*')

  # Remove mails from vector in case sth goes wrong and urls cannot be detected
  has_url <- !is.na(urls)
  mail_bodys <- mail_bodys[has_url]
  mails <- mails[has_url]
  urls <- urls[has_url]

  # Only continue if at least one mail contained a URL; otherwise there is
  # nothing to write and no message to move
  if (length(urls) > 0) {
    # For each url request twitter data
    requests <- map(urls, request_twitter_data, bearer_token = bearer_token)

    # Use requested twitter data to insert texts into Markdown template and
    # write to current working directory.
    # - `template_name` is spelled out instead of relying on partial argument
    #   matching.
    # - The mail placeholder is matched with fixed() so that the mail text is
    #   inserted literally and its backslashes are not read as regex escapes.
    replaced_templates <-
      map(requests, replace_template_placeholder, template_name = 'template.md') %>%
      map2(mail_bodys, ~str_replace(.x, fixed('insert_mail_here'), .y)) %>%
      map2(requests, ~write_replaced_text(.x, .y))

    # Move markdown files and extracted pngs to correct place on HDD
    walk2(
      requests,
      replaced_templates,
      move_files,
      vault_location = vault_location,
      attachments_dir = attachments_dir
    )

    # Move emails on imap server to Processed directory
    con$move_msg(mails, to_folder = 'Processed')
  }
}

	### Change these parts here ----------------------------------------------------

	# User configuration. Every `stop()` call in this section is a placeholder that
	# raises an error when the file is sourced; replace each `stop(...)` with the
	# real value (and delete the bare `stop()` guard) before running the script.

	# Set to your working directory where there is a template.md file
	# (the helper functions below also write their temporary files to getwd())
	setwd(here::here())
	# Locations
	vault_location <- stop('Add Vault Location') # Location of markdown files
	attachments_dir <- stop('Add Vault Subdirectory Location') # subdirectory of vault_location for png-files
	imap_mail <- stop('Set imap mail client') # mail client, passed as `url` to configure_imap()


	### Sensitive Information
	# Guard: remove this line once the keyring entries read below have been created
	stop('Set sensitive mail information')
	# Secrets are looked up by entry name via keyring::key_get()
	bearer_token <- keyring::key_get('twitter-bearer-token')
	user_mail <- keyring::key_get('dataviz-mail')
	password_mail <- keyring::key_get('dataviz-mail-password')
	# Sender string(s) searched for in the FROM field of the inbox further below
	allowed_senders <- keyring::key_get('allowed-senders')

	### Dependencies -----------------------------------------------------------

	library(purrr) # for pluck and map functions
	library(stringr) # for regex matching
	library(readr) # for reading and writing files from/to disk
	library(tibble) # for easier readable tribble creation
	library(tidyr) # for unnesting
	library(dplyr) # for binding rows and pipe
	library(httr) # for API communication
	library(mRpostman) # for email communication
	library(base64enc) # for decoding mails from Android
	library(rvest) # for decoding mails from Android

	### Helper functions -----------------------------------------------------------
	# Look up one tweet via the Twitter API v2 and download its photos.
	#
	# Arguments:
	#   tweet_url    - URL of the tweet; must contain "status/<numeric id>".
	#   bearer_token - API token without the "Bearer " prefix.
	#
	# Side effect: each photo of the tweet is saved in the current working
	# directory as "tweet_<user>_<timestamp>_<index>.png".
	#
	# Returns a list with the elements url, text, user, file_names and file_paths.
	# Stops with a descriptive error when the URL contains no tweet id, the HTTP
	# request fails, the API returns no tweet, or the tweet has no photo.
	request_twitter_data <- function(tweet_url, bearer_token) {
	  # Extract tweet id by regex
	  tweet_id <- str_match(tweet_url, "status/([0-9]+)")[, 2]
	  if (length(tweet_id) != 1 || is.na(tweet_id)) {
	    stop("No tweet id found in URL: ", tweet_url, call. = FALSE)
	  }
	  auth <- paste("Bearer", bearer_token) # API needs format "Bearer <my_token>"
	  API_url <- 'https://api.twitter.com/2/tweets'

	  # Make request to API
	  response <- GET(
	    API_url,
	    add_headers(Authorization = auth),
	    query = list(
	      ids = tweet_id,
	      tweet.fields = 'created_at', # time stamp
	      expansions = 'attachments.media_keys,author_id',
	      # necessary expansion fields for img_url
	      media.fields = 'url' # img_url
	    )
	  )
	  # Fail loudly on HTTP errors (bad token, rate limit, ...) instead of trying
	  # to pick tweet fields out of an error body
	  stop_for_status(response, task = paste("look up tweet", tweet_id))
	  parsed_request <- content(response, 'parsed')

	  # Extract necessary information from list-like structure
	  tweet <- pluck(parsed_request, "data", 1)
	  if (is.null(tweet)) {
	    stop("Twitter API returned no data for tweet ", tweet_id, call. = FALSE)
	  }
	  tweet_text <- pluck(tweet, 'text')
	  tweet_user <- pluck(parsed_request, "includes", 'users', 1, 'username')

	  # Make file name unique through time-date combination.
	  # An explicit format keeps the time part even for tweets posted exactly at
	  # midnight and avoids white spaces and colons (:) in file names.
	  tweet_date <- tweet %>%
	    pluck('created_at') %>%
	    lubridate::as_datetime() %>%
	    format('%Y-%m-%d_%H%M%S')

	  # Tweets without attachments have no "media" entry, so the bound table may
	  # lack the columns used below; guard before filtering.
	  media <- parsed_request %>%
	    pluck("includes", 'media') %>%
	    bind_rows()
	  img_urls <- character(0)
	  if (all(c('type', 'url') %in% names(media))) {
	    img_urls <- media %>%
	      filter(type == 'photo') %>%
	      pull(url)
	  }
	  if (length(img_urls) == 0) {
	    stop("Tweet ", tweet_id, " has no photo attachments", call. = FALSE)
	  }

	  # Download images - binary mode, otherwise the image files get corrupted
	  img_names <- paste('tweet', tweet_user, tweet_date, seq_along(img_urls), sep = "_")
	  file_names <- paste0(img_names, '.png')
	  walk2(img_urls, file_names, ~download.file(.x, .y, mode = 'wb'))

	  # Return list with information
	  list(
	    url = tweet_url,
	    text = tweet_text,
	    user = tweet_user,
	    file_names = file_names,
	    file_paths = paste0(getwd(), '/', file_names)
	  )
	}


	# Build the Markdown embed lines for the images of one tweet.
	# Each file name is wrapped as ![[file_name]]; the embeds are joined with line
	# breaks so that a tweet with several images yields one embed per line.
	md_import_strings <- function(file_names) {
	  embeds <- paste('![[', file_names, ']]', sep = '')
	  paste(embeds, collapse = '\n')
	}

	# Fill the placeholders of the Markdown template with the data of one tweet.
	# The original-mail placeholder is not handled here; it is replaced later on.
	#
	# Arguments:
	#   template_name - path of the template file.
	#   request       - list as returned by request_twitter_data(); the elements
	#                   file_names, text and url are used.
	#
	# Returns the filled template as a single string (lines joined by "\n").
	replace_template_placeholder <- function(template_name, request) {
	  # Create a dictionary for what to replace in template
	  replace_dict <- tribble(
	    ~template, ~replacement,
	    '![[insert_img_name_here]]', md_import_strings(request$file_names),
	    'insert_text_here', request$text %>% str_replace_all('#', '(#)'),
	    'insert_URL_here', request$url
	  )

	  # Placeholders are matched as fixed strings so that the replacement text is
	  # inserted literally. With a regex pattern, backslashes in the tweet text
	  # would be treated as escapes / back-references and get lost or fail.
	  replace_literal <- function(lines, placeholder, replacement) {
	    str_replace_all(lines, fixed(placeholder), replacement)
	  }

	  # Iteratively apply the replacements and keep only final result
	  reduce2(
	    .x = replace_dict$template,
	    .y = replace_dict$replacement,
	    .f = replace_literal,
	    .init = read_lines(template_name)
	  ) %>%
	    paste0(collapse = '\n') # Collapse lines into a single string
	}

	# Write the filled template as a Markdown file to the current working directory.
	#
	# Arguments:
	#   replaced_text - text to write.
	#   request       - list as returned by request_twitter_data(); the name of its
	#                   first image ("..._1.png") determines the Markdown file name.
	#
	# Returns the file path of the new file, which is needed to move it later on.
	write_replaced_text <- function(replaced_text, request) {
	  # Spell out `file_names` (no partial `$` matching) and anchor the pattern at
	  # the end with an escaped dot, so only the "_1.png" suffix is replaced and
	  # never a look-alike inside the user name.
	  file_name <- request$file_names[1] %>% str_replace('_1\\.png$', '.md')
	  write_lines(replaced_text, file_name)
	  paste0(getwd(), '/', file_name)
	}


	# Move the images and the Markdown note of one tweet into the vault.
	#
	# Arguments:
	#   request           - list as returned by request_twitter_data(); its
	#                       file_paths are moved to the attachments directory.
	#   replaced_template - path of the Markdown file; moved to vault_location.
	#   vault_location    - existing destination directory for Markdown files.
	#   attachments_dir   - existing subdirectory of vault_location for images.
	#
	# Stops if a destination directory does not exist. A source file is deleted
	# only after it has been copied successfully; files that could not be copied
	# stay in place and are reported with a warning.
	# Returns the source paths invisibly.
	move_files <- function(request, replaced_template, vault_location, attachments_dir) {
	  # Create from-to vectors with one entry per file
	  image_paths <- as.character(request$file_paths)
	  template_paths <- as.character(replaced_template)
	  from <- c(image_paths, template_paths)
	  to <- c(
	    rep(paste0(vault_location, '/', attachments_dir), length(image_paths)),
	    rep(vault_location, length(template_paths))
	  )

	  # file.copy() treats a non-existing `to` as a file name and would write a
	  # plain file with the directory's name, so check the destinations first
	  missing_dirs <- unique(to[!dir.exists(to)])
	  if (length(missing_dirs) > 0) {
	    stop(
	      "Destination directory does not exist: ",
	      paste(missing_dirs, collapse = ", "),
	      call. = FALSE
	    )
	  }

	  # Copy files one by one from current working directory to destination
	  copied <- vapply(
	    seq_along(from),
	    function(i) file.copy(from[[i]], to[[i]], overwrite = TRUE),
	    logical(1)
	  )
	  if (!all(copied)) {
	    warning(
	      "Could not copy (left in place): ",
	      paste(from[!copied], collapse = ", "),
	      call. = FALSE
	    )
	  }

	  # Delete only the files that were copied successfully
	  file.remove(from[copied])
	  invisible(from)
	}

	# Decode base64-encoded mail bodies (mails sent from Android) into plain text.
	#
	# Arguments:
	#   encoded_mails - raw mail texts; each one is expected to contain
	#                   'base64\r\n\r\n' headers and '----' boundary markers.
	#
	# Returns a list with one decoded text per mail.
	decode_encoded_mails <- function(encoded_mails) {
	# Resource: https://stackoverflow.com/questions/71772972/translate-encoding-of-android-mail-in-r
	# Find location in each encoded string where actual text starts
	# NOTE(review): pluck(., 4) takes the 4th cell of each match matrix. Assuming
	# a start/end matrix stored column-wise with exactly two matches, that is the
	# end of the second 'base64\r\n\r\n' header - confirm against real mails.
	start_encoded <- encoded_mails %>%
	str_locate_all('base64\r\n\r\n') %>%
	map(~pluck(., 4) + 1) %>%
	unlist()

	# Find location in each encoded string where actual text ends
	# NOTE(review): pluck(., 3) takes the 3rd cell of each match matrix; with at
	# least three '----' matches this would be the start of the third one -
	# confirm. Mails with fewer matches yield no position at all, which makes the
	# tibble() call below fail on differing lengths.
	end_encoded <- encoded_mails %>%
	str_locate_all('----') %>%
	map(~pluck(., 3) - 1)%>%
	unlist()

	# Use str_sub() to extract encoded text
	# (column names match the string/start/end arguments of str_sub())
	encoded_text <- tibble(
	string = unlist(encoded_mails),
	start = start_encoded,
	end = end_encoded
	) %>%
	pmap(str_sub)

	# Decode: base64 -> raw -> char -> html -> text
	encoded_text %>%
	map(base64enc::base64decode) %>%
	map(rawToChar) %>%
	map(rvest::read_html) %>%
	map(rvest::html_text2)
	}

	# Decode all mail bodies, keeping them in their input order.
	#
	# Arguments:
	#   mail_bodys - list (or character vector) of raw mail texts.
	#
	# Returns a list of the same length and order as `mail_bodys`: bodies that
	# contain a 'Content-Transfer-Encoding' header are replaced by their decoded
	# text, all other bodies are passed through unchanged.
	decode_all_mails <- function(mail_bodys) {
	  decoded <- as.list(mail_bodys)

	  # Decode in case mail is base64 encoded; bodies that cannot be checked (NA)
	  # are treated as plain text
	  is_encoded <- str_detect(mail_bodys, 'Content-Transfer-Encoding')
	  is_encoded <- !is.na(is_encoded) & is_encoded

	  # Write decoded texts back to their original positions. Concatenating
	  # decoded and plain mails would put the encoded mails first and break the
	  # correspondence between the bodies and the caller's message ids.
	  if (any(is_encoded)) {
	    decoded[is_encoded] <- decode_encoded_mails(mail_bodys[is_encoded])
	  }
	  decoded
	}

	### Actual script -----------------------------------------------------------

	# Establish connection to imap server
	con <- configure_imap(
	  url = imap_mail,
	  user = user_mail,
	  password = password_mail
	)

	# Switch to Inbox
	con$select_folder('Inbox')

	# Extract mails that are from the list of allowed senders
	mails <- allowed_senders %>%
	  map(~con$search_string(expr = ., where = 'FROM')) %>%
	  unlist() %>%
	  na.omit() %>% # Remove NAs if no mail from a sender
	  as.numeric() # avoids attributes

	# If a new mail is detected, begin processing
	if (!is_empty(mails)) {
	  # Grab mail texts and URLs
	  mail_bodys <- mails %>% con$fetch_text() %>% decode_all_mails()
	  urls <- mail_bodys %>% str_extract('https.*')

	  # Remove mails from vector in case sth goes wrong and urls cannot be detected
	  has_url <- !is.na(urls)
	  mail_bodys <- mail_bodys[has_url]
	  mails <- mails[has_url]
	  urls <- urls[has_url]

	  # Only continue if at least one mail contained a URL; otherwise there is
	  # nothing to write and no message to move
	  if (length(urls) > 0) {
	    # For each url request twitter data
	    requests <- map(urls, request_twitter_data, bearer_token = bearer_token)

	    # Use requested twitter data to insert texts into Markdown template and
	    # write to current working directory.
	    # - `template_name` is spelled out instead of relying on partial argument
	    #   matching.
	    # - The mail placeholder is matched with fixed() so that the mail text is
	    #   inserted literally and its backslashes are not read as regex escapes.
	    replaced_templates <-
	      map(requests, replace_template_placeholder, template_name = 'template.md') %>%
	      map2(mail_bodys, ~str_replace(.x, fixed('insert_mail_here'), .y)) %>%
	      map2(requests, ~write_replaced_text(.x, .y))

	    # Move markdown files and extracted pngs to correct place on HDD
	    walk2(
	      requests,
	      replaced_templates,
	      move_files,
	      vault_location = vault_location,
	      attachments_dir = attachments_dir
	    )

	    # Move emails on imap server to Processed directory
	    con$move_msg(mails, to_folder = 'Processed')
	  }
	}