Skip to content

Instantly share code, notes, and snippets.

@RickPack
Last active June 5, 2019 10:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save RickPack/907a200cd40c786e19d045b379527f6d to your computer and use it in GitHub Desktop.
Save RickPack/907a200cd40c786e19d045b379527f6d to your computer and use it in GitHub Desktop.
Excel Email Cleaner and Aggregator function using readxl and openxlsx R packages: Aggregate email addresses from multiple Excel workbooks with varying worksheet names and worksheet numbers, de-duplicate - for non-profit service organization "100 Black Men of Triangle East".
# clean and aggregate email addresses from Excel workbooks with the Tidyverse
# 12tailedvampire name reference - link to Kevin Feasel reference - Microsoft AI bootcamp
# readxl for opening the Excel workbooks
# First test case is an email aggregation for the non-profit, scholarship-raising,
# youth mentorship program-leading organization 100 Black Men of Triangle East
# NOTE: Currently needs Excel column names on the first row
# !! Set this variable to the folder containing the Microsoft Excel files
dir_emails <- "C:/Users/rick2/Documents/100bm/content_agg/"
# Uncomment these lines to install R packages
# packages <- c("dplyr", "stringr", "readxl", "purrr", "openxlsx", "tidyr")
# if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
# install.packages(setdiff(packages, rownames(installed.packages())))
# }
# https://gist.github.com/RickPack/907a200cd40c786e19d045b379527f6d
date_creation <- gsub("-", "_", Sys.Date())
library( readxl )
library( stringr )
library( dplyr )
library( purrr )
library( tidyr )
library( openxlsx )
excel_email_cleaner <- function(dir) {
out_folder <- paste0(dir_emails,"Output/")
dir.create(out_folder, showWarnings = FALSE)
# get the list of Excel files to process
fil_lst <- list.files(dir_emails)[grepl("(\\.xlsx)$|(\\.xls)$", tolower(list.files(dir_emails)))]
# Avoids processing temporary files (start with a tilde) like:
# ~$Copy of 100BMTE contacts (10-10-18).xlsx
fil_lst <- fil_lst[str_sub(fil_lst, 1, 1) != "~"]
fil_length <- length(fil_lst)
all_emails <- data.frame()
for (l in 1:fil_length){
fl <- fil_lst[l]
sht_length <- length( excel_sheets(fl) )
sht_names_var <- excel_sheets(fl)
for(sht in 1:sht_length){
xldf <- read_excel(fl, col_types = "text", sheet = sht)
sht_name <- sht_names_var[sht]
xldf_col <- colnames(xldf)
xldf_col_mod <- str_replace_all(toupper(xldf_col), "[^A-Z]", "")
xldf_col_names_all_num <- which(grepl("NAME", xldf_col_mod))
xldf_col_names_all_length <- length(which(grepl("NAME", xldf_col_mod)))
xldf_col_names_all <- xldf_col_mod[xldf_col_names_all_num]
name_simp_num <- which(str_trim(xldf_col_mod) == "NAME")
name_first <- which(grepl("FIRST", xldf_col_mod))
name_mid <- which(grepl("MIDDLE", xldf_col_mod))
name_last <- which(grepl("LAST", xldf_col_mod))
if(xldf_col_names_all_length > 0){
if(length(name_simp_num) > 0){
if(length(name_simp_num) > 1) {
message("too many NAME columns")
print(paste("e =", e, "of ", length(xldf_col_emails),
"sht =", sht, "of ", sht_length, "l =", l, "of", fil_length,
"finished"))}
if(length(name_simp_num) == 1) {
xldf <- xldf %>% mutate(final_name = .[[name_simp_num]])
}
} else if (xldf_col_names_all_length == 1){
xldf <- xldf %>% mutate(final_name = .[[xldf_col_names_all_num]])
} else {
# Zero (0) for the index will ultimately cause nothing to populate, for that
# value, the resulting vector that is a concatenation of
# first, middle, and last names
# (so no middle name, nothing appears between the first and last name,
# or only the NAME column will appear if that )
xldf_col_names_all2 <- c(name_first, name_mid, name_last)
xldf_col_names_all3 <- xldf_col_names_all2[!is.na(xldf_col_names_all2)]
xldf <- xldf %>% mutate(final_name =
purrr::reduce(.[xldf_col_names_all3], paste, sep = " ")) %>%
mutate(final_name =
str_replace_all(final_name, "(NA)", "")) %>%
mutate(final_name =
str_squish(final_name)) %>%
mutate(final_name =
case_when(final_name == "" ~ "UNKNOWN",
TRUE ~ final_name))
}
} else {
message(paste("No Email recipient name column found in workbook", fl, ", sheet ",
sht_name))
xldf$final_name = "UNKNOWN"
}
final_name_num <- which(colnames(xldf)=="final_name")
# store column names containing EMAIL as a numeric vector
xldf_col_emails_num <- which(grepl("EMAIL", xldf_col_mod))
xldf_col2 <- colnames(xldf)
xldf_col_emails <- xldf_col2[xldf_col_emails_num]
if(length(xldf_col_emails) == 0){
message(paste("No Email column found in workbook", fl, ", sheet ",
sht_name))
next
}
for (e in 1:length(xldf_col_emails)){
# Create a single-column
colnam_email <- xldf_col2[xldf_col_emails_num[e]]
xldf_sel <- xldf %>% select(final_name_num, xldf_col_emails[e]) %>%
mutate(simple_nam_email := toupper((!!as.name(colnam_email))))
colnam_email_1 <- quo_name(colnam_email)
em <- xldf_sel %>%
mutate(email_mod = case_when(
str_detect(simple_nam_email, "\\<") ~
str_extract(simple_nam_email,
"(?<=\\<).*(?=\\>)"),
str_detect(simple_nam_email, "\\(") ~
str_extract(simple_nam_email,
"(?<=\\().*(?=\\))"),
TRUE ~ simple_nam_email)) %>%
mutate(email_mod = str_replace_all(toupper(email_mod),
"[^A-Z0-9_@.\\-]", "")) %>%
distinct(email_mod, .keep_all = TRUE) %>%
mutate(CLEANED = case_when(
email_mod != simple_nam_email ~ "CLEANED",
TRUE ~ ""),
INVALID = case_when(
str_count(email_mod, "@") != 1 |
str_count(email_mod, ".") == 0 ~ "INVALID",
TRUE ~ ""))
em$Excel_filename <- fl
em$sheet_name <- sht_name
em$Name <- em$final_name
em$Email_Address <- em$email_mod
em$original_email <- em$simple_nam_email
em$original_email_field <- colnam_email
all_emails <- em %>%
distinct(Email_Address, .keep_all = TRUE) %>%
select(Name, Email_Address, Excel_filename, sheet_name, original_email,
original_email_field, CLEANED, INVALID) %>%
bind_rows(all_emails, .)
message("CLEANED")
print(all_emails %>% dplyr::filter(CLEANED=="CLEANED") %>%
slice(1:min(3, nrow(.))))
message("INVALID")
print(all_emails %>% dplyr::filter(INVALID=="INVALID") %>%
slice(1:min(3, nrow(.))))
print(paste("email_column =", e, "of ", length(xldf_col_emails),
"worksheet =", sht, "of ", sht_length, "Excel file =", l, "of", fil_length,
"finished"))
}
}
}
new_xl_frm <- all_emails %>%
mutate(name_priority = case_when(Name == "UNKNOWN" | str_trim(Name) == "" ~ 0,
TRUE ~ 1)) %>%
group_by(Email_Address) %>%
slice(which.max(name_priority)) %>%
ungroup() %>%
select(-name_priority) %>%
replace(., is.na(.), "") %>%
dplyr::filter(str_length(Email_Address) > 4) %>%
distinct(Name, Email_Address, .keep_all = TRUE)
invalid_frm <- new_xl_frm %>% dplyr::filter(INVALID == "INVALID")
valid_frm <- new_xl_frm %>% dplyr::filter(INVALID != "INVALID") %>%
mutate(ct = row_number()) %>%
mutate(email_group = ceiling(ct/300))
### Create the Excel workbook
### style to embolden the first row
bldStyle <- createStyle(fontSize = 14, fontColour = "black", textDecoration = c("BOLD"))
### function to position data frames on separate worksheets, then save workbook at end
xlsxformat <- function(wb, namxlsx="", wksht_name, df_inxlsx, nxlsx, max_nxlsx){
if(nxlsx==1){
wb <- createWorkbook()
}
addWorksheet(wb, wksht_name)
writeData(wb, nxlsx, df_inxlsx, colNames=TRUE, headerStyle = bldStyle)
setColWidths(wb, sheet = nxlsx, cols = 1:ncol(df_inxlsx), widths = "auto")
if(max_nxlsx == nxlsx){
saveWorkbook(wb, paste0(namxlsx, ".xlsx"), overwrite = TRUE)
}
invisible(wb)
}
# When n_xlsx argument equals max_nxlsx argument, save workbook
wb <- xlsxformat(wb,
wksht_name = paste0("Valid_", date_creation),
df_inxlsx = valid_frm, nxlsx = 1, max_nxlsx = 2)
wb <- xlsxformat(wb, namxlsx = paste0(out_folder, "100BM_Email_Contacts"),
wksht_name = paste0("Invalid_", date_creation),
df_inxlsx = invalid_frm, nxlsx = 2, max_nxlsx = 2)
final_lst <- list()
final_lst[[1]] <- valid_frm
final_lst[[2]] <- invalid_frm
invisible(final_lst)
}
final_lst_out <- excel_email_cleaner(dir_emails)
valid_frm_out <- final_lst_out[[1]]
invalid_frm_out <- final_lst_out[[2]]
message("Notice invalid characters")
valid_frm_out %>% dplyr::filter(CLEANED != "")
middle <- function(d, n = 5){
N = nrow(d) / 2
print(tbl_df(d[(N-n/2):(N + n - 1 - n / 2),]), n = n)
}
@RickPack
Copy link
Author

just need to output to the Excel file with saveworkbook function

@RickPack
Copy link
Author

forgot to bind_rows data frames

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment