RickPack/SurveyMonkey_excel_email_cleaner.R

## SurveyMonkey_excel_email_cleaner.R
####################################################
##  Reads email addresses copied and pasted from  ##
##  Surveymonkey screens into .txt files and then ##
##  removes them from the original list. The      ##
##  final Excel workbook has the cleaned emails,  ##
##  a copy of the previously created second sheet ##
##  (invalid emails, produced by                  ##
##  excel_email_cleaner_new.R, found at:          ##
##  https://gist.github.com/RickPack/907a200cd40c786e19d045b379527f6d)  ##
##  and then worksheets with (i) bounced email    ##
##  addresses and (ii) opt-out + unsubscribes     ##
####################################################

library(readr)
library(readxl)
library(dplyr)
library(openxlsx)

out_folder <- 'Output/'
fl <- paste0(out_folder, 'old100BM_Email_Contacts.xlsx')
# Read the first sheet in the last cleaned emails Excel file in the output folder
# Sheet is named Valid_2019_02_14
originalxls1 <- read_excel(fl, col_types = "text", sheet = 1)
# Now sheet 2, named Invalid_2019_02_14
originalxls2 <- read_excel(fl, col_types = "text", sheet = 2)
# Read the bounced email addresses copied and pasted from SurveyMonkey
# read_delim is used because fortunately, copying and pasting caused
# columns to be separated by spaces, which I set as the delimiter.
# This causes a false read of other columns but I only want the first
# (email_addresses), hence the use of select(1)
bounced_emails <- readr::read_delim('Survey1_bounces.txt', " ") %>%
  select(1) %>%
  mutate(Email_Address = toupper(stringr::str_trim(Email_Address)))
unsub_emails   <- readr::read_delim('Survey1_unsubscribes.txt', " ") %>%
  select(1) %>%
  mutate(Email_Address = toupper(stringr::str_trim(Email_Address)))

# Confirm email counts match .txt files
nrow(bounced_emails)
nrow(unsub_emails)

# Now let's remove the email addresses that were either bounced or unsubscribes
remove_emails <- bind_rows(bounced_emails, unsub_emails)
## Are there any strange characters that we want to remove?
remove_emails %>%
  dplyr::filter(grepl("[^A-Z0-9_@.\\-]", Email_Address))
## No additional cleaning needed so let's remove any email addresses
##  from the original emails that were found in the remove list
newxls1  <- anti_join(originalxls1, remove_emails)

# Count of emails in list went from 1593 to 1388

# Finally, let's create the new output Excel file
###########################################
#  This is code using openxlsx functions  #
#   that I often reuse.                   #
#  It emboldens and enlarges the header   #
#   row and facilitates adding worksheets #
#   with desired names.                   #
###########################################
### Create the Excel workbook
### style to embolden the first row
bldStyle  <- createStyle(fontSize = 14, fontColour = "black", textDecoration = c("BOLD"))
### function to position data frames on separate worksheets, then save workbook at end
### When n_xlsx argument equals max_nxlsx argument, save workbook
xlsxformat <- function(wb, namxlsx="", wksht_name, df_inxlsx, nxlsx, max_nxlsx){
  if (nxlsx == 1) {
    wb <- createWorkbook()
  }
  addWorksheet(wb, wksht_name)
  writeData(wb, nxlsx, df_inxlsx, colNames = TRUE, headerStyle = bldStyle)
  setColWidths(wb, sheet = nxlsx, cols = 1:ncol(df_inxlsx), widths = "auto")
  if (max_nxlsx == nxlsx) {
    saveWorkbook(wb, paste0(namxlsx, ".xlsx"), overwrite = TRUE)
  }
  invisible(wb)
}

date_creation <- gsub("-", "_", Sys.Date())

wb <- xlsxformat(wb,
                 wksht_name = paste0("Valid_", date_creation),
                 df_inxlsx = newxls1, nxlsx = 1, max_nxlsx = 4)
wb <- xlsxformat(wb, namxlsx = paste0(out_folder, "100BM_Email_Contacts"),
                 wksht_name = paste0("Invalid_", date_creation),
                 df_inxlsx = originalxls2, nxlsx = 2, max_nxlsx = 4)
wb <- xlsxformat(wb, namxlsx = paste0(out_folder, "100BM_Email_Contacts"),
                 wksht_name = "Bounced Emails",
                 df_inxlsx = bounced_emails, nxlsx = 3, max_nxlsx = 4)
wb <- xlsxformat(wb, namxlsx = paste0(out_folder, "100BM_Email_Contacts"),
                 wksht_name = "Unsubscribed Emails",
                 df_inxlsx = unsub_emails, nxlsx = 4, max_nxlsx = 4)
	####################################################
	## Reads email addresses copied and pasted from ##
	## Surveymonkey screens into .txt files and then ##
	## removes them from the original list. The ##
	## final Excel workbook has the cleaned emails, ##
	## a copy of the previously created second sheet ##
	## (invalid emails, produced by ##
	## excel_email_cleaner_new.R, found at: ##
	## https://gist.github.com/RickPack/907a200cd40c786e19d045b379527f6d) ##
	## and then worksheets with (i) bounced email ##
	## addresses and (ii) opt-out + unsubscribes ##
	####################################################

	library(readr)
	library(readxl)
	library(dplyr)
	library(openxlsx)

	out_folder <- 'Output/'
	fl <- paste0(out_folder, 'old100BM_Email_Contacts.xlsx')
	# Read the first sheet in the last cleaned emails Excel file in the output folder
	# Sheet is named Valid_2019_02_14
	originalxls1 <- read_excel(fl, col_types = "text", sheet = 1)
	# Now sheet 2, named Invalid_2019_02_14
	originalxls2 <- read_excel(fl, col_types = "text", sheet = 2)
	# Read the bounced email addresses copied and pasted from SurveyMonkey
	# read_delim is used because fortunately, copying and pasting caused
	# columns to be separated by spaces, which I set as the delimiter.
	# This causes a false read of other columns but I only want the first
	# (email_addresses), hence the use of select(1)
	bounced_emails <- readr::read_delim('Survey1_bounces.txt', " ") %>%
	select(1) %>%
	mutate(Email_Address = toupper(stringr::str_trim(Email_Address)))
	unsub_emails <- readr::read_delim('Survey1_unsubscribes.txt', " ") %>%
	select(1) %>%
	mutate(Email_Address = toupper(stringr::str_trim(Email_Address)))

	# Confirm email counts match .txt files
	nrow(bounced_emails)
	nrow(unsub_emails)

	# Now let's remove the email addresses that were either bounced or unsubscribes
	remove_emails <- bind_rows(bounced_emails, unsub_emails)
	## Are there any strange characters that we want to remove?
	remove_emails %>%
	dplyr::filter(grepl("[^A-Z0-9_@.\\-]", Email_Address))
	## No additional cleaning needed so let's remove any email addresses
	## from the original emails that were found in the remove list
	newxls1 <- anti_join(originalxls1, remove_emails)

	# Count of emails in list went from 1593 to 1388

	# Finally, let's create the new output Excel file
	###########################################
	# This is code using openxlsx functions #
	# that I often reuse. #
	# It emboldens and enlarges the header #
	# row and facilitates adding worksheets #
	# with desired names. #
	###########################################
	### Create the Excel workbook
	### style to embolden the first row
	bldStyle <- createStyle(fontSize = 14, fontColour = "black", textDecoration = c("BOLD"))
	### function to position data frames on separate worksheets, then save workbook at end
	### When n_xlsx argument equals max_nxlsx argument, save workbook
	xlsxformat <- function(wb, namxlsx="", wksht_name, df_inxlsx, nxlsx, max_nxlsx){
	if (nxlsx == 1) {
	wb <- createWorkbook()
	}
	addWorksheet(wb, wksht_name)
	writeData(wb, nxlsx, df_inxlsx, colNames = TRUE, headerStyle = bldStyle)
	setColWidths(wb, sheet = nxlsx, cols = 1:ncol(df_inxlsx), widths = "auto")
	if (max_nxlsx == nxlsx) {
	saveWorkbook(wb, paste0(namxlsx, ".xlsx"), overwrite = TRUE)
	}
	invisible(wb)
	}

	date_creation <- gsub("-", "_", Sys.Date())

	wb <- xlsxformat(wb,
	wksht_name = paste0("Valid_", date_creation),
	df_inxlsx = newxls1, nxlsx = 1, max_nxlsx = 4)
	wb <- xlsxformat(wb, namxlsx = paste0(out_folder, "100BM_Email_Contacts"),
	wksht_name = paste0("Invalid_", date_creation),
	df_inxlsx = originalxls2, nxlsx = 2, max_nxlsx = 4)
	wb <- xlsxformat(wb, namxlsx = paste0(out_folder, "100BM_Email_Contacts"),
	wksht_name = "Bounced Emails",
	df_inxlsx = bounced_emails, nxlsx = 3, max_nxlsx = 4)
	wb <- xlsxformat(wb, namxlsx = paste0(out_folder, "100BM_Email_Contacts"),
	wksht_name = "Unsubscribed Emails",
	df_inxlsx = unsub_emails, nxlsx = 4, max_nxlsx = 4)