RickPack/excel_email_cleaner.R

## excel_email_cleaner.R
# clean and aggregate email addresses from Excel workbooks with the Tidyverse
# 12tailedvampire name reference - link to Kevin Feasel reference - Microsoft AI bootcamp
# readxl for opening the Excel workbooks
# First test case is an email aggregation for the non-profit, scholarship-raising,
# youth mentorship program-leading organization 100 Black Men of Triangle East
# NOTE: Currently needs Excel column names on the first row

# !! Set this variable to the folder containing the Microsoft Excel files
dir_emails <- "C:/Users/rick2/Documents/100bm/content_agg/"

# Uncomment these lines to install R packages
# packages <- c("dplyr", "stringr", "readxl", "purrr", "openxlsx", "tidyr")
# if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
#   install.packages(setdiff(packages, rownames(installed.packages())))
# }

# https://gist.github.com/RickPack/907a200cd40c786e19d045b379527f6d

date_creation <- gsub("-", "_", Sys.Date())


library( readxl )
library( stringr )
library( dplyr )
library( purrr )
library( tidyr )
library( openxlsx )

excel_email_cleaner <- function(dir) {
  out_folder <- paste0(dir_emails,"Output/")
  dir.create(out_folder, showWarnings = FALSE)

  # get the list of Excel files to process
  fil_lst <- list.files(dir_emails)[grepl("(\\.xlsx)$|(\\.xls)$", tolower(list.files(dir_emails)))]
  # Avoids processing temporary files (start with a tilde) like:
  #   ~$Copy of 100BMTE contacts (10-10-18).xlsx
  fil_lst <- fil_lst[str_sub(fil_lst, 1, 1) != "~"]

  fil_length <- length(fil_lst)
  all_emails <- data.frame()

  for (l in 1:fil_length){
    fl        <- fil_lst[l]
    sht_length <- length( excel_sheets(fl) )
    sht_names_var  <- excel_sheets(fl)
    for(sht in 1:sht_length){
      xldf          <- read_excel(fl, col_types = "text", sheet = sht)
      sht_name     <- sht_names_var[sht]
      xldf_col        <- colnames(xldf)
      xldf_col_mod    <- str_replace_all(toupper(xldf_col), "[^A-Z]", "")
      xldf_col_names_all_num     <- which(grepl("NAME", xldf_col_mod))
      xldf_col_names_all_length  <- length(which(grepl("NAME", xldf_col_mod)))
      xldf_col_names_all         <- xldf_col_mod[xldf_col_names_all_num]
      name_simp_num  <- which(str_trim(xldf_col_mod) == "NAME")
      name_first <- which(grepl("FIRST", xldf_col_mod))
      name_mid   <- which(grepl("MIDDLE", xldf_col_mod))
      name_last  <- which(grepl("LAST", xldf_col_mod))

      if(xldf_col_names_all_length > 0){
        if(length(name_simp_num) > 0){
          if(length(name_simp_num) > 1) {
            message("too many NAME columns")
            print(paste("e =", e, "of ", length(xldf_col_emails),
                        "sht =", sht, "of ", sht_length, "l =", l, "of", fil_length,
                        "finished"))}
          if(length(name_simp_num) == 1) {
            xldf <- xldf %>% mutate(final_name = .[[name_simp_num]])
          }
        } else if (xldf_col_names_all_length == 1){
          xldf <- xldf %>% mutate(final_name = .[[xldf_col_names_all_num]])
        } else {
          # Zero (0) for the index will ultimately cause nothing to populate, for that
          # value, the resulting vector that is a concatenation of
          # first, middle, and last names
          # (so no middle name, nothing appears between the first and last name,
          # or only the NAME column will appear if that )
          xldf_col_names_all2 <- c(name_first, name_mid, name_last)
          xldf_col_names_all3 <- xldf_col_names_all2[!is.na(xldf_col_names_all2)]
          xldf  <- xldf %>% mutate(final_name =
                                     purrr::reduce(.[xldf_col_names_all3], paste, sep = " ")) %>%
            mutate(final_name =
                     str_replace_all(final_name, "(NA)", "")) %>%
            mutate(final_name =
                     str_squish(final_name)) %>%
            mutate(final_name =
                     case_when(final_name == "" ~ "UNKNOWN",
                               TRUE ~ final_name))
        }
      } else {
        message(paste("No Email recipient name column found in workbook", fl, ", sheet ",
                      sht_name))
        xldf$final_name = "UNKNOWN"
      }

      final_name_num      <-  which(colnames(xldf)=="final_name")
      # store column names containing EMAIL as a numeric vector
      xldf_col_emails_num <- which(grepl("EMAIL", xldf_col_mod))
      xldf_col2       <- colnames(xldf)
      xldf_col_emails <- xldf_col2[xldf_col_emails_num]
      if(length(xldf_col_emails) == 0){
        message(paste("No Email column found in workbook", fl, ", sheet ",
                      sht_name))
        next
      }
      for (e in 1:length(xldf_col_emails)){
        # Create a single-column
        colnam_email   <- xldf_col2[xldf_col_emails_num[e]]
        xldf_sel       <- xldf %>% select(final_name_num, xldf_col_emails[e]) %>%
          mutate(simple_nam_email := toupper((!!as.name(colnam_email))))
        colnam_email_1 <- quo_name(colnam_email)
        em <- xldf_sel %>%
          mutate(email_mod = case_when(
            str_detect(simple_nam_email, "\\<") ~
              str_extract(simple_nam_email,
                          "(?<=\\<).*(?=\\>)"),
            str_detect(simple_nam_email, "\\(") ~
              str_extract(simple_nam_email,
                          "(?<=\\().*(?=\\))"),
            TRUE ~ simple_nam_email)) %>%
          mutate(email_mod = str_replace_all(toupper(email_mod),
                                             "[^A-Z0-9_@.\\-]", "")) %>%
          distinct(email_mod, .keep_all = TRUE) %>%
          mutate(CLEANED = case_when(
            email_mod !=  simple_nam_email ~ "CLEANED",
            TRUE ~ ""),
            INVALID = case_when(
              str_count(email_mod, "@") != 1 |
                str_count(email_mod, ".") == 0 ~ "INVALID",
              TRUE ~ ""))

        em$Excel_filename <- fl
        em$sheet_name     <- sht_name
        em$Name           <- em$final_name
        em$Email_Address  <- em$email_mod
        em$original_email <- em$simple_nam_email
        em$original_email_field <- colnam_email
        all_emails <-  em %>%
          distinct(Email_Address, .keep_all = TRUE) %>%
          select(Name, Email_Address, Excel_filename, sheet_name, original_email,
                 original_email_field, CLEANED, INVALID) %>%
          bind_rows(all_emails, .)

        message("CLEANED")
        print(all_emails %>% dplyr::filter(CLEANED=="CLEANED") %>%
                slice(1:min(3, nrow(.))))
        message("INVALID")
        print(all_emails %>% dplyr::filter(INVALID=="INVALID") %>%
                slice(1:min(3, nrow(.))))

        print(paste("email_column =", e, "of ", length(xldf_col_emails),
                    "worksheet =", sht, "of ", sht_length, "Excel file =", l, "of", fil_length,
                    "finished"))
      }
    }
  }

  new_xl_frm <- all_emails %>%
    mutate(name_priority = case_when(Name == "UNKNOWN" | str_trim(Name) == "" ~ 0,
                                     TRUE ~ 1)) %>%
    group_by(Email_Address) %>%
    slice(which.max(name_priority)) %>%
    ungroup() %>%
    select(-name_priority) %>%
    replace(., is.na(.), "") %>%
    dplyr::filter(str_length(Email_Address) > 4) %>%
    distinct(Name, Email_Address, .keep_all = TRUE)

  invalid_frm <- new_xl_frm %>% dplyr::filter(INVALID == "INVALID")
  valid_frm <- new_xl_frm %>% dplyr::filter(INVALID != "INVALID") %>%
    mutate(ct = row_number()) %>%
    mutate(email_group = ceiling(ct/300))

  ### Create the Excel workbook
  ### style to embolden the first row
  bldStyle  <- createStyle(fontSize = 14, fontColour = "black", textDecoration = c("BOLD"))
  ### function to position data frames on separate worksheets, then save workbook at end
  xlsxformat <- function(wb, namxlsx="", wksht_name, df_inxlsx, nxlsx, max_nxlsx){
    if(nxlsx==1){
      wb <- createWorkbook()
    }
    addWorksheet(wb, wksht_name)
    writeData(wb, nxlsx, df_inxlsx, colNames=TRUE, headerStyle = bldStyle)
    setColWidths(wb, sheet = nxlsx, cols = 1:ncol(df_inxlsx), widths = "auto")
    if(max_nxlsx == nxlsx){
      saveWorkbook(wb, paste0(namxlsx, ".xlsx"), overwrite = TRUE)
    }
    invisible(wb)
  }

  # When n_xlsx argument equals max_nxlsx argument, save workbook
  wb <- xlsxformat(wb,
                   wksht_name = paste0("Valid_", date_creation),
                   df_inxlsx = valid_frm, nxlsx = 1, max_nxlsx = 2)
  wb <- xlsxformat(wb, namxlsx = paste0(out_folder, "100BM_Email_Contacts"),
                   wksht_name = paste0("Invalid_", date_creation),
                   df_inxlsx = invalid_frm, nxlsx = 2, max_nxlsx = 2)

  final_lst <- list()
  final_lst[[1]]  <- valid_frm
  final_lst[[2]] <- invalid_frm
  invisible(final_lst)
}

final_lst_out <- excel_email_cleaner(dir_emails)
valid_frm_out   <- final_lst_out[[1]]
invalid_frm_out <- final_lst_out[[2]]

message("Notice invalid characters")
valid_frm_out %>% dplyr::filter(CLEANED != "")

middle <- function(d, n = 5){
  N = nrow(d) / 2
  print(tbl_df(d[(N-n/2):(N + n - 1 - n / 2),]), n = n)
}
	# clean and aggregate email addresses from Excel workbooks with the Tidyverse
	# 12tailedvampire name reference - link to Kevin Feasel reference - Microsoft AI bootcamp
	# readxl for opening the Excel workbooks
	# First test case is an email aggregation for the non-profit, scholarship-raising,
	# youth mentorship program-leading organization 100 Black Men of Triangle East
	# NOTE: Currently needs Excel column names on the first row

	# !! Set this variable to the folder containing the Microsoft Excel files
	dir_emails <- "C:/Users/rick2/Documents/100bm/content_agg/"

	# Uncomment these lines to install R packages
	# packages <- c("dplyr", "stringr", "readxl", "purrr", "openxlsx", "tidyr")
	# if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
	# install.packages(setdiff(packages, rownames(installed.packages())))
	# }

	# https://gist.github.com/RickPack/907a200cd40c786e19d045b379527f6d

	date_creation <- gsub("-", "_", Sys.Date())


	library( readxl )
	library( stringr )
	library( dplyr )
	library( purrr )
	library( tidyr )
	library( openxlsx )

	excel_email_cleaner <- function(dir) {
	out_folder <- paste0(dir_emails,"Output/")
	dir.create(out_folder, showWarnings = FALSE)

	# get the list of Excel files to process
	fil_lst <- list.files(dir_emails)[grepl("(\\.xlsx)$\|(\\.xls)$", tolower(list.files(dir_emails)))]
	# Avoids processing temporary files (start with a tilde) like:
	# ~$Copy of 100BMTE contacts (10-10-18).xlsx
	fil_lst <- fil_lst[str_sub(fil_lst, 1, 1) != "~"]

	fil_length <- length(fil_lst)
	all_emails <- data.frame()

	for (l in 1:fil_length){
	fl <- fil_lst[l]
	sht_length <- length( excel_sheets(fl) )
	sht_names_var <- excel_sheets(fl)
	for(sht in 1:sht_length){
	xldf <- read_excel(fl, col_types = "text", sheet = sht)
	sht_name <- sht_names_var[sht]
	xldf_col <- colnames(xldf)
	xldf_col_mod <- str_replace_all(toupper(xldf_col), "[^A-Z]", "")
	xldf_col_names_all_num <- which(grepl("NAME", xldf_col_mod))
	xldf_col_names_all_length <- length(which(grepl("NAME", xldf_col_mod)))
	xldf_col_names_all <- xldf_col_mod[xldf_col_names_all_num]
	name_simp_num <- which(str_trim(xldf_col_mod) == "NAME")
	name_first <- which(grepl("FIRST", xldf_col_mod))
	name_mid <- which(grepl("MIDDLE", xldf_col_mod))
	name_last <- which(grepl("LAST", xldf_col_mod))

	if(xldf_col_names_all_length > 0){
	if(length(name_simp_num) > 0){
	if(length(name_simp_num) > 1) {
	message("too many NAME columns")
	print(paste("e =", e, "of ", length(xldf_col_emails),
	"sht =", sht, "of ", sht_length, "l =", l, "of", fil_length,
	"finished"))}
	if(length(name_simp_num) == 1) {
	xldf <- xldf %>% mutate(final_name = .[[name_simp_num]])
	}
	} else if (xldf_col_names_all_length == 1){
	xldf <- xldf %>% mutate(final_name = .[[xldf_col_names_all_num]])
	} else {
	# Zero (0) for the index will ultimately cause nothing to populate, for that
	# value, the resulting vector that is a concatenation of
	# first, middle, and last names
	# (so no middle name, nothing appears between the first and last name,
	# or only the NAME column will appear if that )
	xldf_col_names_all2 <- c(name_first, name_mid, name_last)
	xldf_col_names_all3 <- xldf_col_names_all2[!is.na(xldf_col_names_all2)]
	xldf <- xldf %>% mutate(final_name =
	purrr::reduce(.[xldf_col_names_all3], paste, sep = " ")) %>%
	mutate(final_name =
	str_replace_all(final_name, "(NA)", "")) %>%
	mutate(final_name =
	str_squish(final_name)) %>%
	mutate(final_name =
	case_when(final_name == "" ~ "UNKNOWN",
	TRUE ~ final_name))
	}
	} else {
	message(paste("No Email recipient name column found in workbook", fl, ", sheet ",
	sht_name))
	xldf$final_name = "UNKNOWN"
	}

	final_name_num <- which(colnames(xldf)=="final_name")
	# store column names containing EMAIL as a numeric vector
	xldf_col_emails_num <- which(grepl("EMAIL", xldf_col_mod))
	xldf_col2 <- colnames(xldf)
	xldf_col_emails <- xldf_col2[xldf_col_emails_num]
	if(length(xldf_col_emails) == 0){
	message(paste("No Email column found in workbook", fl, ", sheet ",
	sht_name))
	next
	}
	for (e in 1:length(xldf_col_emails)){
	# Create a single-column
	colnam_email <- xldf_col2[xldf_col_emails_num[e]]
	xldf_sel <- xldf %>% select(final_name_num, xldf_col_emails[e]) %>%
	mutate(simple_nam_email := toupper((!!as.name(colnam_email))))
	colnam_email_1 <- quo_name(colnam_email)
	em <- xldf_sel %>%
	mutate(email_mod = case_when(
	str_detect(simple_nam_email, "\\<") ~
	str_extract(simple_nam_email,
	"(?<=\\<).*(?=\\>)"),
	str_detect(simple_nam_email, "\\(") ~
	str_extract(simple_nam_email,
	"(?<=\\().*(?=\\))"),
	TRUE ~ simple_nam_email)) %>%
	mutate(email_mod = str_replace_all(toupper(email_mod),
	"[^A-Z0-9_@.\\-]", "")) %>%
	distinct(email_mod, .keep_all = TRUE) %>%
	mutate(CLEANED = case_when(
	email_mod != simple_nam_email ~ "CLEANED",
	TRUE ~ ""),
	INVALID = case_when(
	str_count(email_mod, "@") != 1 \|
	str_count(email_mod, ".") == 0 ~ "INVALID",
	TRUE ~ ""))

	em$Excel_filename <- fl
	em$sheet_name <- sht_name
	em$Name <- em$final_name
	em$Email_Address <- em$email_mod
	em$original_email <- em$simple_nam_email
	em$original_email_field <- colnam_email
	all_emails <- em %>%
	distinct(Email_Address, .keep_all = TRUE) %>%
	select(Name, Email_Address, Excel_filename, sheet_name, original_email,
	original_email_field, CLEANED, INVALID) %>%
	bind_rows(all_emails, .)

	message("CLEANED")
	print(all_emails %>% dplyr::filter(CLEANED=="CLEANED") %>%
	slice(1:min(3, nrow(.))))
	message("INVALID")
	print(all_emails %>% dplyr::filter(INVALID=="INVALID") %>%
	slice(1:min(3, nrow(.))))

	print(paste("email_column =", e, "of ", length(xldf_col_emails),
	"worksheet =", sht, "of ", sht_length, "Excel file =", l, "of", fil_length,
	"finished"))
	}
	}
	}

	new_xl_frm <- all_emails %>%
	mutate(name_priority = case_when(Name == "UNKNOWN" \| str_trim(Name) == "" ~ 0,
	TRUE ~ 1)) %>%
	group_by(Email_Address) %>%
	slice(which.max(name_priority)) %>%
	ungroup() %>%
	select(-name_priority) %>%
	replace(., is.na(.), "") %>%
	dplyr::filter(str_length(Email_Address) > 4) %>%
	distinct(Name, Email_Address, .keep_all = TRUE)

	invalid_frm <- new_xl_frm %>% dplyr::filter(INVALID == "INVALID")
	valid_frm <- new_xl_frm %>% dplyr::filter(INVALID != "INVALID") %>%
	mutate(ct = row_number()) %>%
	mutate(email_group = ceiling(ct/300))

	### Create the Excel workbook
	### style to embolden the first row
	bldStyle <- createStyle(fontSize = 14, fontColour = "black", textDecoration = c("BOLD"))
	### function to position data frames on separate worksheets, then save workbook at end
	xlsxformat <- function(wb, namxlsx="", wksht_name, df_inxlsx, nxlsx, max_nxlsx){
	if(nxlsx==1){
	wb <- createWorkbook()
	}
	addWorksheet(wb, wksht_name)
	writeData(wb, nxlsx, df_inxlsx, colNames=TRUE, headerStyle = bldStyle)
	setColWidths(wb, sheet = nxlsx, cols = 1:ncol(df_inxlsx), widths = "auto")
	if(max_nxlsx == nxlsx){
	saveWorkbook(wb, paste0(namxlsx, ".xlsx"), overwrite = TRUE)
	}
	invisible(wb)
	}

	# When n_xlsx argument equals max_nxlsx argument, save workbook
	wb <- xlsxformat(wb,
	wksht_name = paste0("Valid_", date_creation),
	df_inxlsx = valid_frm, nxlsx = 1, max_nxlsx = 2)
	wb <- xlsxformat(wb, namxlsx = paste0(out_folder, "100BM_Email_Contacts"),
	wksht_name = paste0("Invalid_", date_creation),
	df_inxlsx = invalid_frm, nxlsx = 2, max_nxlsx = 2)

	final_lst <- list()
	final_lst[[1]] <- valid_frm
	final_lst[[2]] <- invalid_frm
	invisible(final_lst)
	}

	final_lst_out <- excel_email_cleaner(dir_emails)
	valid_frm_out <- final_lst_out[[1]]
	invalid_frm_out <- final_lst_out[[2]]

	message("Notice invalid characters")
	valid_frm_out %>% dplyr::filter(CLEANED != "")

	middle <- function(d, n = 5){
	N = nrow(d) / 2
	print(tbl_df(d[(N-n/2):(N + n - 1 - n / 2),]), n = n)
	}