Skip to content

Instantly share code, notes, and snippets.

@tjvananne
Last active May 3, 2017 15:53
Show Gist options
  • Save tjvananne/ea37f1597452c9fc75d80133bf1d7456 to your computer and use it in GitHub Desktop.
Save tjvananne/ea37f1597452c9fc75d80133bf1d7456 to your computer and use it in GitHub Desktop.
Generate Generic Data Dictionary in R. It will count the number of blanks, the number of NAs, tell you the number of unique values per column, calculate the percentages of the previously mentioned column aggregations, and report out the top n (5 is default) number of unique values per row.
# generic data dictionary creation using base-R
#' a couple notes: this could of course be done much faster using
#' third party packages, but I like to provide base-R solutions before
#' branching out into packages just in case they aren't available
#'
#' Also, this could be done in a much less verbose and modular way,
#' but I did want to also demonstrate the "Functional Programming"
#' abilities within R (although no functions were passed to functions)
#' and wanted this to be a helpful reference gist
# function definitions ---------------------------------------------------------------------
# these are to be used on a single data frame column, preferrably in a "sapply()" call
func_unq <- function(df_col, n_returned=5) {
return(unique(df_col)[1:n_returned])
}
func_unq_ln <- function(df_col) {
return(length(unique(df_col)))
}
func_NAs <- function(df_col) {
return(sum(is.na(df_col)))
}
func_blanks <- function(df_col) {
return(sum( trimws(df_col) == '', na.rm=T))
}
# this is the wrapper function around the lower-level ones above
func_build_data_dict <- function(df, n_returned=5) {
t_start <- Sys.time()
# # for testing:
# df <- dataset
# n_returned <- 5
# initialize some space for collection
dict_all <- list()
# clear current environment from all of the "this_"
# if this is in a function, will it affect things outside the function? #YOLO
rm(list=ls()[grepl("^this_", ls())])
# execute all lower level funcs
print(paste0("Gathering first ", n_returned, " unique values per column..."))
this_unq <- data.frame(t(sapply(df, func_unq, n_returned=n_returned)), stringsAsFactors = F)
names(this_unq) <- paste0('unique_val_', 1:ncol(this_unq))
print("Calculating how many unique values per column...")
this_unq_ln <- data.frame(sapply(df, func_unq_ln), stringsAsFactors = F)
names(this_unq_ln) <- 'count_of_unique_values'
print("Counting the 'NA' values per column...")
this_NAs <- data.frame(sapply(df, func_NAs), stringsAsFactors = F)
names(this_NAs) <- 'count_of_NA_values'
print("Counting the blank values per column...")
this_blanks <- data.frame(sapply(df, func_blanks), stringsAsFactors = F)
names(this_blanks) <- 'count_of_blanks'
# isolate the "this_" objects in environment, loop through and add to collection
print("Combinding results into data dictionary...")
all_of_this <- ls()[grepl("^this_", ls())]
for(i in seq_along(all_of_this)) {
dict_all[[i]] <- eval(parse(text=all_of_this[[i]]))
}
# compile into final data.frame and make sure column names are a column as well
final <- do.call(cbind, dict_all)
final <- cbind(column_name=row.names(final), final)
final$percent_blank <- round(final$count_of_blanks / nrow(df) * 100, digits = 2)
final$percent_NAs <- round(final$count_of_NA_values / nrow(df) * 100, digits = 2)
final$percent_unique <- round(final$count_of_unique_values / nrow(df) * 100, digits = 2)
print(paste0("Finished after: ", round(as.numeric(Sys.time() - t_start, units="mins"), digits = 2), " minutes"))
return(final)
}
# using the functions --------------------------------------------------------------------------------
# built-in dataset, no need for packages
dataset <- airquality
for(i in 1:50) {dataset[sample(1:nrow(dataset), 1), sample(1:ncol(dataset), 1)] <- ""} # add some blanks
# now lets test it out
my_data_dict <- func_build_data_dict(dataset)
# testing on a larger dataset --------------------------------------------------------------------------
# set this to TRUE and run this code chunk if you want to test on a larger dataset (1 Million rows)
TEST_LARGE_DATASET <- FALSE
if(TEST_LARGE_DATASET) {
dataset <- airquality
for(i in 1:50) {dataset[sample(1:nrow(dataset), 1), sample(1:ncol(dataset), 1)] <- ""} # add some blanks
# build a larger dataset
list_datasets <- list()
for(i in 1:10000) {list_datasets[[i]] <- dataset}
dataset_1M_rows <- do.call(rbind, list_datasets) # this takes a while to build
# alright, let's build the data dictionary for a much larger dataset
big_data_dictionary <- func_build_data_dict(dataset_1M_rows)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment