tjvananne/build_data_dictionary.R

## build_data_dictionary.R


# generic data dictionary creation using base-R

#' a couple notes: this could of course be done much faster using
#' third party packages, but I like to provide base-R solutions before
#' branching out into packages just in case they aren't available
#'
#' Also, this could be done in a much less verbose and modular way,
#' but I did want to also demonstrate the "Functional Programming"
#' abilities within R (although no functions were passed to functions)
#' and wanted this to be a helpful reference gist


# function definitions ---------------------------------------------------------------------


    # these are to be used on a single data frame column, preferrably in a "sapply()" call
    func_unq <- function(df_col, n_returned=5) {
        return(unique(df_col)[1:n_returned])
    }


    func_unq_ln <- function(df_col) {
        return(length(unique(df_col)))
    }


    func_NAs <- function(df_col) {
        return(sum(is.na(df_col)))
    }


    func_blanks <- function(df_col) {
        return(sum( trimws(df_col) == '', na.rm=T))
    }


# this is the wrapper function around the lower-level ones above
    func_build_data_dict <- function(df, n_returned=5) {
        t_start <- Sys.time()
                    # # for testing:
                    # df <- dataset
                    # n_returned <- 5

        # initialize some space for collection
        dict_all <- list()

        # clear current environment from all of the "this_"
        # if this is in a function, will it affect things outside the function? #YOLO
        rm(list=ls()[grepl("^this_", ls())])

        # execute all lower level funcs
        print(paste0("Gathering first ", n_returned, " unique values per column..."))
        this_unq <- data.frame(t(sapply(df, func_unq, n_returned=n_returned)), stringsAsFactors = F)
        names(this_unq) <- paste0('unique_val_', 1:ncol(this_unq))

        print("Calculating how many unique values per column...")
        this_unq_ln <- data.frame(sapply(df, func_unq_ln), stringsAsFactors = F)
        names(this_unq_ln) <- 'count_of_unique_values'

        print("Counting the 'NA' values per column...")
        this_NAs <- data.frame(sapply(df, func_NAs), stringsAsFactors = F)
        names(this_NAs) <- 'count_of_NA_values'

        print("Counting the blank values per column...")
        this_blanks <- data.frame(sapply(df, func_blanks), stringsAsFactors = F)
        names(this_blanks) <- 'count_of_blanks'

        # isolate the "this_" objects in environment, loop through and add to collection
        print("Combinding results into data dictionary...")
        all_of_this <- ls()[grepl("^this_", ls())]
        for(i in seq_along(all_of_this)) {
            dict_all[[i]] <- eval(parse(text=all_of_this[[i]]))
        }

        # compile into final data.frame and make sure column names are a column as well
        final <- do.call(cbind, dict_all)
        final <- cbind(column_name=row.names(final), final)

        final$percent_blank <- round(final$count_of_blanks / nrow(df) * 100, digits = 2)
        final$percent_NAs <- round(final$count_of_NA_values / nrow(df) * 100, digits = 2)
        final$percent_unique <- round(final$count_of_unique_values / nrow(df) * 100, digits = 2)
        print(paste0("Finished after: ", round(as.numeric(Sys.time() - t_start, units="mins"), digits = 2), " minutes"))
        return(final)
    }


# using the functions --------------------------------------------------------------------------------

    # built-in dataset, no need for packages
    dataset <- airquality
    for(i in 1:50) {dataset[sample(1:nrow(dataset), 1), sample(1:ncol(dataset), 1)] <- ""}  # add some blanks

    # now lets test it out
    my_data_dict <- func_build_data_dict(dataset)


# testing on a larger dataset --------------------------------------------------------------------------

    # set this to TRUE and run this code chunk if you want to test on a larger dataset (1 Million rows)
    TEST_LARGE_DATASET <- FALSE
    if(TEST_LARGE_DATASET) {

        dataset <- airquality
        for(i in 1:50) {dataset[sample(1:nrow(dataset), 1), sample(1:ncol(dataset), 1)] <- ""}  # add some blanks

        # build a larger dataset
        list_datasets <- list()
        for(i in 1:10000) {list_datasets[[i]] <- dataset}
        dataset_1M_rows <- do.call(rbind, list_datasets)  # this takes a while to build

        # alright, let's build the data dictionary for a much larger dataset
        big_data_dictionary <- func_build_data_dict(dataset_1M_rows)
    }


	# generic data dictionary creation using base-R

	#' a couple notes: this could of course be done much faster using
	#' third party packages, but I like to provide base-R solutions before
	#' branching out into packages just in case they aren't available
	#'
	#' Also, this could be done in a much less verbose and modular way,
	#' but I did want to also demonstrate the "Functional Programming"
	#' abilities within R (although no functions were passed to functions)
	#' and wanted this to be a helpful reference gist




	# function definitions ---------------------------------------------------------------------


	# these are to be used on a single data frame column, preferrably in a "sapply()" call
	func_unq <- function(df_col, n_returned=5) {
	return(unique(df_col)[1:n_returned])
	}



	func_unq_ln <- function(df_col) {
	return(length(unique(df_col)))
	}



	func_NAs <- function(df_col) {
	return(sum(is.na(df_col)))
	}



	func_blanks <- function(df_col) {
	return(sum( trimws(df_col) == '', na.rm=T))
	}



	# this is the wrapper function around the lower-level ones above
	func_build_data_dict <- function(df, n_returned=5) {
	t_start <- Sys.time()
	# # for testing:
	# df <- dataset
	# n_returned <- 5

	# initialize some space for collection
	dict_all <- list()

	# clear current environment from all of the "this_"
	# if this is in a function, will it affect things outside the function? #YOLO
	rm(list=ls()[grepl("^this_", ls())])

	# execute all lower level funcs
	print(paste0("Gathering first ", n_returned, " unique values per column..."))
	this_unq <- data.frame(t(sapply(df, func_unq, n_returned=n_returned)), stringsAsFactors = F)
	names(this_unq) <- paste0('unique_val_', 1:ncol(this_unq))

	print("Calculating how many unique values per column...")
	this_unq_ln <- data.frame(sapply(df, func_unq_ln), stringsAsFactors = F)
	names(this_unq_ln) <- 'count_of_unique_values'

	print("Counting the 'NA' values per column...")
	this_NAs <- data.frame(sapply(df, func_NAs), stringsAsFactors = F)
	names(this_NAs) <- 'count_of_NA_values'

	print("Counting the blank values per column...")
	this_blanks <- data.frame(sapply(df, func_blanks), stringsAsFactors = F)
	names(this_blanks) <- 'count_of_blanks'

	# isolate the "this_" objects in environment, loop through and add to collection
	print("Combinding results into data dictionary...")
	all_of_this <- ls()[grepl("^this_", ls())]
	for(i in seq_along(all_of_this)) {
	dict_all[[i]] <- eval(parse(text=all_of_this[[i]]))
	}

	# compile into final data.frame and make sure column names are a column as well
	final <- do.call(cbind, dict_all)
	final <- cbind(column_name=row.names(final), final)

	final$percent_blank <- round(final$count_of_blanks / nrow(df) * 100, digits = 2)
	final$percent_NAs <- round(final$count_of_NA_values / nrow(df) * 100, digits = 2)
	final$percent_unique <- round(final$count_of_unique_values / nrow(df) * 100, digits = 2)
	print(paste0("Finished after: ", round(as.numeric(Sys.time() - t_start, units="mins"), digits = 2), " minutes"))
	return(final)
	}




	# using the functions --------------------------------------------------------------------------------

	# built-in dataset, no need for packages
	dataset <- airquality
	for(i in 1:50) {dataset[sample(1:nrow(dataset), 1), sample(1:ncol(dataset), 1)] <- ""} # add some blanks

	# now lets test it out
	my_data_dict <- func_build_data_dict(dataset)





	# testing on a larger dataset --------------------------------------------------------------------------

	# set this to TRUE and run this code chunk if you want to test on a larger dataset (1 Million rows)
	TEST_LARGE_DATASET <- FALSE
	if(TEST_LARGE_DATASET) {

	dataset <- airquality
	for(i in 1:50) {dataset[sample(1:nrow(dataset), 1), sample(1:ncol(dataset), 1)] <- ""} # add some blanks

	# build a larger dataset
	list_datasets <- list()
	for(i in 1:10000) {list_datasets[[i]] <- dataset}
	dataset_1M_rows <- do.call(rbind, list_datasets) # this takes a while to build

	# alright, let's build the data dictionary for a much larger dataset
	big_data_dictionary <- func_build_data_dict(dataset_1M_rows)
	}