Created
December 22, 2016 04:45
-
-
Save nmatzke/99617e9b994c4d66d786af3ee057413c to your computer and use it in GitHub Desktop.
Utilities to manage annoying features of R data frames (list, factor issues)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
####################################################### | |
# These functions are some utilitys for dealing with | |
# annoying features of R's use of data.frames, e.g. | |
# the tendency for some functions to have something that | |
# looks like a data.frame, but is actually a series of lists. | |
# | |
# Also, dfnums_to_numeric semi-intelligently converts | |
# columns with numeric data to numeric (rather than | |
# factor or character). | |
####################################################### | |
# Utility function #1 - ignore | |
adf2 <- function(x) | |
{ | |
# Deals with the problem of repeated row names | |
rownames = 1:nrow(x) | |
return(as.data.frame(x, row.names=rownames, stringsAsFactors=FALSE)) | |
} | |
# Utility function #2 - ignore | |
unlist_df4 <- function(df, ...) | |
{ | |
store_colnames = names(df) | |
store_rownames = rownames(df) | |
outdf = NULL | |
numrows = dim(df)[1] | |
numcols = dim(df)[2] | |
for (i in 1:ncol(df)) | |
{ | |
#print(names(df)[i]) | |
tmpcol = unlist(df[, i]) | |
#print(length(tmpcol)) | |
# Error check; e.g. blank cells might screw it up | |
if (length(tmpcol) < numrows) | |
{ | |
tmpcol2 = df[,i] | |
tmpcol = as.character(unlist(tmpcol2)) | |
} | |
outdf = cbind(outdf, tmpcol) | |
} | |
# Unlist each row | |
outdf_tmp = adf2(outdf) | |
# Remove factors and character silliness from numbers | |
outdf = dfnums_to_numeric(outdf_tmp, ...) | |
names(outdf) = store_colnames | |
rownames(outdf) = store_rownames | |
return(outdf) | |
} | |
# Utility function #3 - ignore | |
dfnums_to_numeric <- function (dtf, max_NAs = 0.5, printout = FALSE, roundval = NULL) | |
{ | |
dtf_classes = cls.df(dtf, printout = FALSE) | |
dtf_names = names(dtf) | |
numcols = ncol(dtf) | |
cls_col_list = c() | |
for (i in 1:numcols) { | |
cls_col = NA | |
cmdstr = paste("cls_col = class(dtf$'", dtf_names[i], | |
"')", sep = "") | |
eval(parse(text = cmdstr)) | |
cls_col_list[i] = cls_col | |
} | |
for (i in 1:numcols) { | |
if (cls_col_list[i] == "list") { | |
(next)() | |
} | |
if (cls_col_list[i] != "numeric") { | |
newcol = NA | |
cmdstr = paste("newcol = as.numeric(as.character(dtf$'", | |
dtf_names[i], "'))", sep = "") | |
suppressWarnings(eval(parse(text = cmdstr))) | |
if (sum(is.na(newcol)) < (max_NAs * length(newcol))) { | |
cmdstr = paste("dtf$'", dtf_names[i], "' = newcol", | |
sep = "") | |
suppressWarnings(eval(parse(text = cmdstr))) | |
if (!is.null(roundval)) { | |
cmdstr = paste("dtf$'", dtf_names[i], "' = round(dtf$'", | |
dtf_names[i], "', digits=roundval)", sep = "") | |
suppressWarnings(eval(parse(text = cmdstr))) | |
} | |
} | |
} | |
} | |
tmp_classes = cls.df(dtf) | |
dtf_classes$newclasses = tmp_classes[, ncol(tmp_classes)] | |
if (printout) { | |
cat("\n") | |
cat("dfnums_to_numeric(dtf, max_NAs=", max_NAs, ") reports: dataframe 'dtf_classes' has ", | |
nrow(dtf_classes), " rows, ", ncol(dtf_classes), | |
" columns.\n", sep = "") | |
cat("...names() and classes() of each column below...\n", | |
sep = "") | |
cat("\n") | |
print(dtf_classes) | |
} | |
return(dtf) | |
} | |
cls.df <- function (dtf, printout = FALSE) | |
{ | |
if (class(dtf) == "matrix") { | |
dtf = as.data.frame(dtf, stringsAsFactors = FALSE) | |
} | |
dtf_names = names(dtf) | |
numcols = ncol(dtf) | |
cls_col_list = c() | |
for (i in 1:numcols) { | |
cls_col = NA | |
cmdstr = paste("cls_col = class(dtf$'", dtf_names[i], | |
"')", sep = "") | |
eval(parse(text = cmdstr)) | |
cls_col_list[i] = cls_col | |
} | |
colnum = 1:numcols | |
dtf_classes = cbind(colnum, dtf_names, cls_col_list) | |
dtf_classes = data.frame(dtf_classes, row.names = colnum) | |
if (printout) { | |
cat("\n") | |
cat("cls.df(dtf) reports: dataframe 'dtf' has ", nrow(dtf), | |
" rows, ", numcols, " columns.\n", sep = "") | |
cat("...names() and classes() of each column below...\n", | |
sep = "") | |
cat("\n") | |
print(dtf_classes) | |
cat("\n") | |
} | |
return(dtf_classes) | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment