nmatzke/data_frame_utilities_v1.R

## data_frame_utilities_v1.R
#######################################################
# These functions are some utilitys for dealing with
# annoying features of R's use of data.frames, e.g.
# the tendency for some functions to have something that
# looks like a data.frame, but is actually a series of lists.
#
# Also, dfnums_to_numeric semi-intelligently converts
# columns with numeric data to numeric (rather than
# factor or character).
#######################################################


# Utility function #1 - ignore
adf2 <- function(x)
	{
	# Deals with the problem of repeated row names
	rownames = 1:nrow(x)
	return(as.data.frame(x, row.names=rownames, stringsAsFactors=FALSE))
	}

# Utility function #2 - ignore
unlist_df4 <- function(df, ...)
	{
	store_colnames = names(df)
	store_rownames = rownames(df)

	outdf = NULL

	numrows = dim(df)[1]
	numcols = dim(df)[2]

	for (i in 1:ncol(df))
		{
		#print(names(df)[i])
		tmpcol = unlist(df[, i])
		#print(length(tmpcol))

		# Error check; e.g. blank cells might screw it up
		if (length(tmpcol) < numrows)
			{
			tmpcol2 = df[,i]
			tmpcol = as.character(unlist(tmpcol2))
			}

		outdf = cbind(outdf, tmpcol)
		}

	# Unlist each row
	outdf_tmp = adf2(outdf)

	# Remove factors and character silliness from numbers
	outdf = dfnums_to_numeric(outdf_tmp, ...)

	names(outdf) = store_colnames
	rownames(outdf) = store_rownames
	return(outdf)
	}

# Utility function #3 - ignore

dfnums_to_numeric <- function (dtf, max_NAs = 0.5, printout = FALSE, roundval = NULL)
{
    dtf_classes = cls.df(dtf, printout = FALSE)
    dtf_names = names(dtf)
    numcols = ncol(dtf)
    cls_col_list = c()
    for (i in 1:numcols) {
        cls_col = NA
        cmdstr = paste("cls_col = class(dtf$'", dtf_names[i],
            "')", sep = "")
        eval(parse(text = cmdstr))
        cls_col_list[i] = cls_col
    }
    for (i in 1:numcols) {
        if (cls_col_list[i] == "list") {
            (next)()
        }
        if (cls_col_list[i] != "numeric") {
            newcol = NA
            cmdstr = paste("newcol = as.numeric(as.character(dtf$'",
                dtf_names[i], "'))", sep = "")
            suppressWarnings(eval(parse(text = cmdstr)))
            if (sum(is.na(newcol)) < (max_NAs * length(newcol))) {
                cmdstr = paste("dtf$'", dtf_names[i], "' = newcol",
                  sep = "")
                suppressWarnings(eval(parse(text = cmdstr)))
                if (!is.null(roundval)) {
                  cmdstr = paste("dtf$'", dtf_names[i], "' = round(dtf$'",
                    dtf_names[i], "', digits=roundval)", sep = "")
                  suppressWarnings(eval(parse(text = cmdstr)))
                }
            }
        }
    }
    tmp_classes = cls.df(dtf)
    dtf_classes$newclasses = tmp_classes[, ncol(tmp_classes)]
    if (printout) {
        cat("\n")
        cat("dfnums_to_numeric(dtf, max_NAs=", max_NAs, ") reports: dataframe 'dtf_classes' has ",
            nrow(dtf_classes), " rows, ", ncol(dtf_classes),
            " columns.\n", sep = "")
        cat("...names() and classes() of each column below...\n",
            sep = "")
        cat("\n")
        print(dtf_classes)
    }
    return(dtf)
}

cls.df <- function (dtf, printout = FALSE)
{
    if (class(dtf) == "matrix") {
        dtf = as.data.frame(dtf, stringsAsFactors = FALSE)
    }
    dtf_names = names(dtf)
    numcols = ncol(dtf)
    cls_col_list = c()
    for (i in 1:numcols) {
        cls_col = NA
        cmdstr = paste("cls_col = class(dtf$'", dtf_names[i],
            "')", sep = "")
        eval(parse(text = cmdstr))
        cls_col_list[i] = cls_col
    }
    colnum = 1:numcols
    dtf_classes = cbind(colnum, dtf_names, cls_col_list)
    dtf_classes = data.frame(dtf_classes, row.names = colnum)
    if (printout) {
        cat("\n")
        cat("cls.df(dtf) reports: dataframe 'dtf' has ", nrow(dtf),
            " rows, ", numcols, " columns.\n", sep = "")
        cat("...names() and classes() of each column below...\n",
            sep = "")
        cat("\n")
        print(dtf_classes)
        cat("\n")
    }
    return(dtf_classes)
}
	#######################################################
	# These functions are some utilitys for dealing with
	# annoying features of R's use of data.frames, e.g.
	# the tendency for some functions to have something that
	# looks like a data.frame, but is actually a series of lists.
	#
	# Also, dfnums_to_numeric semi-intelligently converts
	# columns with numeric data to numeric (rather than
	# factor or character).
	#######################################################



	# Utility function #1 - ignore
	adf2 <- function(x)
	{
	# Deals with the problem of repeated row names
	rownames = 1:nrow(x)
	return(as.data.frame(x, row.names=rownames, stringsAsFactors=FALSE))
	}

	# Utility function #2 - ignore
	unlist_df4 <- function(df, ...)
	{
	store_colnames = names(df)
	store_rownames = rownames(df)

	outdf = NULL

	numrows = dim(df)[1]
	numcols = dim(df)[2]

	for (i in 1:ncol(df))
	{
	#print(names(df)[i])
	tmpcol = unlist(df[, i])
	#print(length(tmpcol))

	# Error check; e.g. blank cells might screw it up
	if (length(tmpcol) < numrows)
	{
	tmpcol2 = df[,i]
	tmpcol = as.character(unlist(tmpcol2))
	}

	outdf = cbind(outdf, tmpcol)
	}

	# Unlist each row
	outdf_tmp = adf2(outdf)

	# Remove factors and character silliness from numbers
	outdf = dfnums_to_numeric(outdf_tmp, ...)

	names(outdf) = store_colnames
	rownames(outdf) = store_rownames
	return(outdf)
	}

	# Utility function #3 - ignore

	dfnums_to_numeric <- function (dtf, max_NAs = 0.5, printout = FALSE, roundval = NULL)
	{
	dtf_classes = cls.df(dtf, printout = FALSE)
	dtf_names = names(dtf)
	numcols = ncol(dtf)
	cls_col_list = c()
	for (i in 1:numcols) {
	cls_col = NA
	cmdstr = paste("cls_col = class(dtf$'", dtf_names[i],
	"')", sep = "")
	eval(parse(text = cmdstr))
	cls_col_list[i] = cls_col
	}
	for (i in 1:numcols) {
	if (cls_col_list[i] == "list") {
	(next)()
	}
	if (cls_col_list[i] != "numeric") {
	newcol = NA
	cmdstr = paste("newcol = as.numeric(as.character(dtf$'",
	dtf_names[i], "'))", sep = "")
	suppressWarnings(eval(parse(text = cmdstr)))
	if (sum(is.na(newcol)) < (max_NAs * length(newcol))) {
	cmdstr = paste("dtf$'", dtf_names[i], "' = newcol",
	sep = "")
	suppressWarnings(eval(parse(text = cmdstr)))
	if (!is.null(roundval)) {
	cmdstr = paste("dtf$'", dtf_names[i], "' = round(dtf$'",
	dtf_names[i], "', digits=roundval)", sep = "")
	suppressWarnings(eval(parse(text = cmdstr)))
	}
	}
	}
	}
	tmp_classes = cls.df(dtf)
	dtf_classes$newclasses = tmp_classes[, ncol(tmp_classes)]
	if (printout) {
	cat("\n")
	cat("dfnums_to_numeric(dtf, max_NAs=", max_NAs, ") reports: dataframe 'dtf_classes' has ",
	nrow(dtf_classes), " rows, ", ncol(dtf_classes),
	" columns.\n", sep = "")
	cat("...names() and classes() of each column below...\n",
	sep = "")
	cat("\n")
	print(dtf_classes)
	}
	return(dtf)
	}

	cls.df <- function (dtf, printout = FALSE)
	{
	if (class(dtf) == "matrix") {
	dtf = as.data.frame(dtf, stringsAsFactors = FALSE)
	}
	dtf_names = names(dtf)
	numcols = ncol(dtf)
	cls_col_list = c()
	for (i in 1:numcols) {
	cls_col = NA
	cmdstr = paste("cls_col = class(dtf$'", dtf_names[i],
	"')", sep = "")
	eval(parse(text = cmdstr))
	cls_col_list[i] = cls_col
	}
	colnum = 1:numcols
	dtf_classes = cbind(colnum, dtf_names, cls_col_list)
	dtf_classes = data.frame(dtf_classes, row.names = colnum)
	if (printout) {
	cat("\n")
	cat("cls.df(dtf) reports: dataframe 'dtf' has ", nrow(dtf),
	" rows, ", numcols, " columns.\n", sep = "")
	cat("...names() and classes() of each column below...\n",
	sep = "")
	cat("\n")
	print(dtf_classes)
	cat("\n")
	}
	return(dtf_classes)
	}