Skip to content

Instantly share code, notes, and snippets.

@nmatzke
Created December 22, 2016 04:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nmatzke/99617e9b994c4d66d786af3ee057413c to your computer and use it in GitHub Desktop.
Save nmatzke/99617e9b994c4d66d786af3ee057413c to your computer and use it in GitHub Desktop.
Utilities to manage annoying features of R data frames (list, factor issues)
#######################################################
# These functions are some utilitys for dealing with
# annoying features of R's use of data.frames, e.g.
# the tendency for some functions to have something that
# looks like a data.frame, but is actually a series of lists.
#
# Also, dfnums_to_numeric semi-intelligently converts
# columns with numeric data to numeric (rather than
# factor or character).
#######################################################
# Utility function #1 - ignore
adf2 <- function(x)
{
# Deals with the problem of repeated row names
rownames = 1:nrow(x)
return(as.data.frame(x, row.names=rownames, stringsAsFactors=FALSE))
}
# Utility function #2 - ignore
unlist_df4 <- function(df, ...)
{
store_colnames = names(df)
store_rownames = rownames(df)
outdf = NULL
numrows = dim(df)[1]
numcols = dim(df)[2]
for (i in 1:ncol(df))
{
#print(names(df)[i])
tmpcol = unlist(df[, i])
#print(length(tmpcol))
# Error check; e.g. blank cells might screw it up
if (length(tmpcol) < numrows)
{
tmpcol2 = df[,i]
tmpcol = as.character(unlist(tmpcol2))
}
outdf = cbind(outdf, tmpcol)
}
# Unlist each row
outdf_tmp = adf2(outdf)
# Remove factors and character silliness from numbers
outdf = dfnums_to_numeric(outdf_tmp, ...)
names(outdf) = store_colnames
rownames(outdf) = store_rownames
return(outdf)
}
# Utility function #3 - ignore
dfnums_to_numeric <- function (dtf, max_NAs = 0.5, printout = FALSE, roundval = NULL)
{
dtf_classes = cls.df(dtf, printout = FALSE)
dtf_names = names(dtf)
numcols = ncol(dtf)
cls_col_list = c()
for (i in 1:numcols) {
cls_col = NA
cmdstr = paste("cls_col = class(dtf$'", dtf_names[i],
"')", sep = "")
eval(parse(text = cmdstr))
cls_col_list[i] = cls_col
}
for (i in 1:numcols) {
if (cls_col_list[i] == "list") {
(next)()
}
if (cls_col_list[i] != "numeric") {
newcol = NA
cmdstr = paste("newcol = as.numeric(as.character(dtf$'",
dtf_names[i], "'))", sep = "")
suppressWarnings(eval(parse(text = cmdstr)))
if (sum(is.na(newcol)) < (max_NAs * length(newcol))) {
cmdstr = paste("dtf$'", dtf_names[i], "' = newcol",
sep = "")
suppressWarnings(eval(parse(text = cmdstr)))
if (!is.null(roundval)) {
cmdstr = paste("dtf$'", dtf_names[i], "' = round(dtf$'",
dtf_names[i], "', digits=roundval)", sep = "")
suppressWarnings(eval(parse(text = cmdstr)))
}
}
}
}
tmp_classes = cls.df(dtf)
dtf_classes$newclasses = tmp_classes[, ncol(tmp_classes)]
if (printout) {
cat("\n")
cat("dfnums_to_numeric(dtf, max_NAs=", max_NAs, ") reports: dataframe 'dtf_classes' has ",
nrow(dtf_classes), " rows, ", ncol(dtf_classes),
" columns.\n", sep = "")
cat("...names() and classes() of each column below...\n",
sep = "")
cat("\n")
print(dtf_classes)
}
return(dtf)
}
cls.df <- function (dtf, printout = FALSE)
{
if (class(dtf) == "matrix") {
dtf = as.data.frame(dtf, stringsAsFactors = FALSE)
}
dtf_names = names(dtf)
numcols = ncol(dtf)
cls_col_list = c()
for (i in 1:numcols) {
cls_col = NA
cmdstr = paste("cls_col = class(dtf$'", dtf_names[i],
"')", sep = "")
eval(parse(text = cmdstr))
cls_col_list[i] = cls_col
}
colnum = 1:numcols
dtf_classes = cbind(colnum, dtf_names, cls_col_list)
dtf_classes = data.frame(dtf_classes, row.names = colnum)
if (printout) {
cat("\n")
cat("cls.df(dtf) reports: dataframe 'dtf' has ", nrow(dtf),
" rows, ", numcols, " columns.\n", sep = "")
cat("...names() and classes() of each column below...\n",
sep = "")
cat("\n")
print(dtf_classes)
cat("\n")
}
return(dtf_classes)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment