Last active
June 5, 2019 10:15
-
-
Save RickPack/907a200cd40c786e19d045b379527f6d to your computer and use it in GitHub Desktop.
Excel Email Cleaner and Aggregator function using readxl and openxlsx R packages: Aggregate email addresses from multiple Excel workbooks with varying worksheet names and worksheet numbers, de-duplicate - for non-profit service organization "100 Black Men of Triangle East".
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# clean and aggregate email addresses from Excel workbooks with the Tidyverse | |
# 12tailedvampire name reference - link to Kevin Feasel reference - Microsoft AI bootcamp | |
# readxl for opening the Excel workbooks | |
# First test case is an email aggregation for the non-profit, scholarship-raising, | |
# youth mentorship program-leading organization 100 Black Men of Triangle East | |
# NOTE: Currently needs Excel column names on the first row | |
# !! Set this variable to the folder containing the Microsoft Excel files | |
dir_emails <- "C:/Users/rick2/Documents/100bm/content_agg/" | |
# Uncomment these lines to install R packages | |
# packages <- c("dplyr", "stringr", "readxl", "purrr", "openxlsx", "tidyr") | |
# if (length(setdiff(packages, rownames(installed.packages()))) > 0) { | |
# install.packages(setdiff(packages, rownames(installed.packages()))) | |
# } | |
# https://gist.github.com/RickPack/907a200cd40c786e19d045b379527f6d | |
date_creation <- gsub("-", "_", Sys.Date()) | |
library( readxl ) | |
library( stringr ) | |
library( dplyr ) | |
library( purrr ) | |
library( tidyr ) | |
library( openxlsx ) | |
excel_email_cleaner <- function(dir) { | |
out_folder <- paste0(dir_emails,"Output/") | |
dir.create(out_folder, showWarnings = FALSE) | |
# get the list of Excel files to process | |
fil_lst <- list.files(dir_emails)[grepl("(\\.xlsx)$|(\\.xls)$", tolower(list.files(dir_emails)))] | |
# Avoids processing temporary files (start with a tilde) like: | |
# ~$Copy of 100BMTE contacts (10-10-18).xlsx | |
fil_lst <- fil_lst[str_sub(fil_lst, 1, 1) != "~"] | |
fil_length <- length(fil_lst) | |
all_emails <- data.frame() | |
for (l in 1:fil_length){ | |
fl <- fil_lst[l] | |
sht_length <- length( excel_sheets(fl) ) | |
sht_names_var <- excel_sheets(fl) | |
for(sht in 1:sht_length){ | |
xldf <- read_excel(fl, col_types = "text", sheet = sht) | |
sht_name <- sht_names_var[sht] | |
xldf_col <- colnames(xldf) | |
xldf_col_mod <- str_replace_all(toupper(xldf_col), "[^A-Z]", "") | |
xldf_col_names_all_num <- which(grepl("NAME", xldf_col_mod)) | |
xldf_col_names_all_length <- length(which(grepl("NAME", xldf_col_mod))) | |
xldf_col_names_all <- xldf_col_mod[xldf_col_names_all_num] | |
name_simp_num <- which(str_trim(xldf_col_mod) == "NAME") | |
name_first <- which(grepl("FIRST", xldf_col_mod)) | |
name_mid <- which(grepl("MIDDLE", xldf_col_mod)) | |
name_last <- which(grepl("LAST", xldf_col_mod)) | |
if(xldf_col_names_all_length > 0){ | |
if(length(name_simp_num) > 0){ | |
if(length(name_simp_num) > 1) { | |
message("too many NAME columns") | |
print(paste("e =", e, "of ", length(xldf_col_emails), | |
"sht =", sht, "of ", sht_length, "l =", l, "of", fil_length, | |
"finished"))} | |
if(length(name_simp_num) == 1) { | |
xldf <- xldf %>% mutate(final_name = .[[name_simp_num]]) | |
} | |
} else if (xldf_col_names_all_length == 1){ | |
xldf <- xldf %>% mutate(final_name = .[[xldf_col_names_all_num]]) | |
} else { | |
# Zero (0) for the index will ultimately cause nothing to populate, for that | |
# value, the resulting vector that is a concatenation of | |
# first, middle, and last names | |
# (so no middle name, nothing appears between the first and last name, | |
# or only the NAME column will appear if that ) | |
xldf_col_names_all2 <- c(name_first, name_mid, name_last) | |
xldf_col_names_all3 <- xldf_col_names_all2[!is.na(xldf_col_names_all2)] | |
xldf <- xldf %>% mutate(final_name = | |
purrr::reduce(.[xldf_col_names_all3], paste, sep = " ")) %>% | |
mutate(final_name = | |
str_replace_all(final_name, "(NA)", "")) %>% | |
mutate(final_name = | |
str_squish(final_name)) %>% | |
mutate(final_name = | |
case_when(final_name == "" ~ "UNKNOWN", | |
TRUE ~ final_name)) | |
} | |
} else { | |
message(paste("No Email recipient name column found in workbook", fl, ", sheet ", | |
sht_name)) | |
xldf$final_name = "UNKNOWN" | |
} | |
final_name_num <- which(colnames(xldf)=="final_name") | |
# store column names containing EMAIL as a numeric vector | |
xldf_col_emails_num <- which(grepl("EMAIL", xldf_col_mod)) | |
xldf_col2 <- colnames(xldf) | |
xldf_col_emails <- xldf_col2[xldf_col_emails_num] | |
if(length(xldf_col_emails) == 0){ | |
message(paste("No Email column found in workbook", fl, ", sheet ", | |
sht_name)) | |
next | |
} | |
for (e in 1:length(xldf_col_emails)){ | |
# Create a single-column | |
colnam_email <- xldf_col2[xldf_col_emails_num[e]] | |
xldf_sel <- xldf %>% select(final_name_num, xldf_col_emails[e]) %>% | |
mutate(simple_nam_email := toupper((!!as.name(colnam_email)))) | |
colnam_email_1 <- quo_name(colnam_email) | |
em <- xldf_sel %>% | |
mutate(email_mod = case_when( | |
str_detect(simple_nam_email, "\\<") ~ | |
str_extract(simple_nam_email, | |
"(?<=\\<).*(?=\\>)"), | |
str_detect(simple_nam_email, "\\(") ~ | |
str_extract(simple_nam_email, | |
"(?<=\\().*(?=\\))"), | |
TRUE ~ simple_nam_email)) %>% | |
mutate(email_mod = str_replace_all(toupper(email_mod), | |
"[^A-Z0-9_@.\\-]", "")) %>% | |
distinct(email_mod, .keep_all = TRUE) %>% | |
mutate(CLEANED = case_when( | |
email_mod != simple_nam_email ~ "CLEANED", | |
TRUE ~ ""), | |
INVALID = case_when( | |
str_count(email_mod, "@") != 1 | | |
str_count(email_mod, ".") == 0 ~ "INVALID", | |
TRUE ~ "")) | |
em$Excel_filename <- fl | |
em$sheet_name <- sht_name | |
em$Name <- em$final_name | |
em$Email_Address <- em$email_mod | |
em$original_email <- em$simple_nam_email | |
em$original_email_field <- colnam_email | |
all_emails <- em %>% | |
distinct(Email_Address, .keep_all = TRUE) %>% | |
select(Name, Email_Address, Excel_filename, sheet_name, original_email, | |
original_email_field, CLEANED, INVALID) %>% | |
bind_rows(all_emails, .) | |
message("CLEANED") | |
print(all_emails %>% dplyr::filter(CLEANED=="CLEANED") %>% | |
slice(1:min(3, nrow(.)))) | |
message("INVALID") | |
print(all_emails %>% dplyr::filter(INVALID=="INVALID") %>% | |
slice(1:min(3, nrow(.)))) | |
print(paste("email_column =", e, "of ", length(xldf_col_emails), | |
"worksheet =", sht, "of ", sht_length, "Excel file =", l, "of", fil_length, | |
"finished")) | |
} | |
} | |
} | |
new_xl_frm <- all_emails %>% | |
mutate(name_priority = case_when(Name == "UNKNOWN" | str_trim(Name) == "" ~ 0, | |
TRUE ~ 1)) %>% | |
group_by(Email_Address) %>% | |
slice(which.max(name_priority)) %>% | |
ungroup() %>% | |
select(-name_priority) %>% | |
replace(., is.na(.), "") %>% | |
dplyr::filter(str_length(Email_Address) > 4) %>% | |
distinct(Name, Email_Address, .keep_all = TRUE) | |
invalid_frm <- new_xl_frm %>% dplyr::filter(INVALID == "INVALID") | |
valid_frm <- new_xl_frm %>% dplyr::filter(INVALID != "INVALID") %>% | |
mutate(ct = row_number()) %>% | |
mutate(email_group = ceiling(ct/300)) | |
### Create the Excel workbook | |
### style to embolden the first row | |
bldStyle <- createStyle(fontSize = 14, fontColour = "black", textDecoration = c("BOLD")) | |
### function to position data frames on separate worksheets, then save workbook at end | |
xlsxformat <- function(wb, namxlsx="", wksht_name, df_inxlsx, nxlsx, max_nxlsx){ | |
if(nxlsx==1){ | |
wb <- createWorkbook() | |
} | |
addWorksheet(wb, wksht_name) | |
writeData(wb, nxlsx, df_inxlsx, colNames=TRUE, headerStyle = bldStyle) | |
setColWidths(wb, sheet = nxlsx, cols = 1:ncol(df_inxlsx), widths = "auto") | |
if(max_nxlsx == nxlsx){ | |
saveWorkbook(wb, paste0(namxlsx, ".xlsx"), overwrite = TRUE) | |
} | |
invisible(wb) | |
} | |
# When n_xlsx argument equals max_nxlsx argument, save workbook | |
wb <- xlsxformat(wb, | |
wksht_name = paste0("Valid_", date_creation), | |
df_inxlsx = valid_frm, nxlsx = 1, max_nxlsx = 2) | |
wb <- xlsxformat(wb, namxlsx = paste0(out_folder, "100BM_Email_Contacts"), | |
wksht_name = paste0("Invalid_", date_creation), | |
df_inxlsx = invalid_frm, nxlsx = 2, max_nxlsx = 2) | |
final_lst <- list() | |
final_lst[[1]] <- valid_frm | |
final_lst[[2]] <- invalid_frm | |
invisible(final_lst) | |
} | |
final_lst_out <- excel_email_cleaner(dir_emails) | |
valid_frm_out <- final_lst_out[[1]] | |
invalid_frm_out <- final_lst_out[[2]] | |
message("Notice invalid characters") | |
valid_frm_out %>% dplyr::filter(CLEANED != "") | |
middle <- function(d, n = 5){ | |
N = nrow(d) / 2 | |
print(tbl_df(d[(N-n/2):(N + n - 1 - n / 2),]), n = n) | |
} |
forgot to bind_rows data frames
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
just need to output to the Excel file with saveworkbook function