## digdeep/NASummary.R

## multiFile.R
## Read delimited files from the working directory into `dataset`.
## Two listings are used: `file_list` holds every entry of the directory,
## `temp` holds only the entries whose name ends in ".csv". After each listing
## `dataset` is first a list with one data frame per file and is then replaced
## by a single data frame with all of them row-bound together.
setwd("/YOUR/WORKING/DIRECTORY")  ## placeholder: point this at the data folder
file_list <- list.files()
## for individual files
dataset <- lapply(file_list, function(path) {
  read.table(path, header = TRUE, sep = ",")
})
## If your CSV column structure is same across all csv's
## (bind the data frames read above instead of reading every file again)
dataset <- do.call("rbind", dataset)


## `pattern` is a regular expression, not a shell glob. "\\.csv$" keeps only
## names that end in ".csv"; a glob such as "*.csv" is not anchored and can
## also pick up names that merely contain "csv" (e.g. "notes_csv.txt").
temp <- list.files(pattern = "\\.csv$")
## for individual files
dataset <- lapply(temp, function(path) {
  read.table(path, header = TRUE, sep = ",")
})
dataset[1] ## for specific files of interest (dataset[[1]] gives the data frame itself), OR
## If your CSV column structure is same across all csv's bind them all into 1 file
dataset <- do.call("rbind", dataset)

## NASummary.R
#' Summarise missing and distinct values for each column of a data frame
#'
#' @param df A data frame. Anything else is rejected with an error.
#' @param include.nan If `TRUE`, also report the number of `NaN` values
#'   (`nNan`) and their share of the rows (`rNan`).
#' @return A data frame with one row per column of `df` (row names are the
#'   column names of `df`; zero rows when `df` has no columns) and the columns
#'   `col` (column position), `Count` (number of rows of `df`), `nNA` / `rNA`
#'   (count and share of values for which `is.na()` is `TRUE`, which includes
#'   `NaN`), optionally `nNan` / `rNan`, and `nUnique` / `rUnique` (count and
#'   share of distinct values, with `NA` counted as a value). Shares are
#'   truncated, not rounded, to four decimal places.
NAsummary <- function(df, include.nan = FALSE) {
  if (!is.data.frame(df)) {
    stop("`df` must be a data frame", call. = FALSE)
  }

  # Share of `total`, cut off (not rounded) after four decimal places.
  ratio <- function(count, total) trunc(count / total * 10000) / 10000

  # Per-column count of the elements flagged by `pred`; vapply() keeps the
  # result an integer vector even when `df` has no columns.
  count_where <- function(pred) {
    vapply(df, function(x) sum(pred(x)), integer(1))
  }

  n_cols <- ncol(df)
  newdf <- data.frame(
    col = seq_len(n_cols),           # seq_len() is empty for zero columns
    Count = rep(nrow(df), n_cols),   # explicit rep(): data.frame() will not
                                     # recycle a scalar to zero rows
    nNA = count_where(is.na)
  )
  newdf$rNA <- ratio(newdf$nNA, newdf$Count)

  if (include.nan) {
    newdf$nNan <- count_where(is.nan)
    newdf$rNan <- ratio(newdf$nNan, newdf$Count)
  }

  newdf$nUnique <- vapply(df, function(x) length(unique(x)), integer(1))
  newdf$rUnique <- ratio(newdf$nUnique, newdf$Count)

  rownames(newdf) <- colnames(df)
  newdf
}

## transform.R
#' Fourth-spread of a vector
#'
#' Sorts `x` and returns the upper fourth minus the lower fourth. Each fourth
#' is the mean of the one or two order statistics at depth
#' `(floor((n + 1) / 2) + 1) / 2`, counted from the respective end of the
#' sorted values.
#'
#' @param x A vector that `sort()` can order. Missing values are dropped
#'   before the depths are computed.
#' @return A 1 x 1 numeric matrix holding the spread; it holds `NA` when `x`
#'   has no non-missing values.
spread <- function(x) {
  # sort() drops NA by default, so the depths must be based on the number of
  # values that are left, not on length(x).
  sorted <- sort(x)
  n <- length(sorted)
  if (n == 0L) {
    return(matrix(NA_real_, nrow = 1L, ncol = 1L))
  }
  n.med <- (n + 1) / 2
  n.fourth <- (floor(n.med) + 1) / 2
  # Lower fourth: positions floor/ceiling of the depth; upper fourth: the same
  # depth counted from the top.
  y <- sorted[c(floor(n.fourth), ceiling(n.fourth),
                floor(n + 1 - n.fourth), ceiling(n + 1 - n.fourth))]
  # (y3 + y4) / 2 - (y1 + y2) / 2, kept as a 1 x 1 matrix product.
  y %*% c(-1, -1, 1, 1) / 2
}
## Spread-versus-level analysis of the series `x` (expected to exist already).
## Consecutive runs of 12 values form one group; groups are labelled 2010,
## 2011, ... (presumably monthly observations starting in 2010 -- confirm).
years <- floor((seq_along(x) - 1) / 12)
z <- split(x, years)
year_labels <- (min(years):max(years)) + 2010
boxplot(z, names = year_labels, ylab = "y")
## Spread vs. Level Plot: log spread of each group against its log median
z.med <- unlist(lapply(z, median))
z.spread <- unlist(lapply(z, spread))
fit <- lm(log(z.spread) ~ log(z.med))
plot(log(z.med), log(z.spread), xlab = "Log Level", ylab = "Log Spread",
     main = "Spread vs. Level Plot")
abline(fit, lwd = 2, col = "Red")
## LAMBDA: power used for the re-expression, one minus the fitted slope
lambda <- 1 - coef(fit)[2]
boxplot(lapply(z, function(u) u^lambda), names = year_labels,
        ylab = paste0("y^", round(lambda, 2)),
        main = "Boxplots of Re-expressed Values")
	## Read delimited files from the working directory into `dataset`.
	## Two listings are used: `file_list` holds every entry of the directory,
	## `temp` holds only the entries whose name ends in ".csv". After each
	## listing `dataset` is first a list with one data frame per file and is
	## then replaced by a single data frame with all of them row-bound together.
	setwd("/YOUR/WORKING/DIRECTORY")  ## placeholder: point this at the data folder
	file_list <- list.files()
	## for individual files
	dataset <- lapply(file_list, function(path) {
	  read.table(path, header = TRUE, sep = ",")
	})
	## If your CSV column structure is same across all csv's
	## (bind the data frames read above instead of reading every file again)
	dataset <- do.call("rbind", dataset)



	## `pattern` is a regular expression, not a shell glob. "\\.csv$" keeps only
	## names that end in ".csv"; a glob such as "*.csv" is not anchored and can
	## also pick up names that merely contain "csv" (e.g. "notes_csv.txt").
	temp <- list.files(pattern = "\\.csv$")
	## for individual files
	dataset <- lapply(temp, function(path) {
	  read.table(path, header = TRUE, sep = ",")
	})
	dataset[1] ## for specific files of interest (dataset[[1]] gives the data frame itself), OR
	## If your CSV column structure is same across all csv's bind them all into 1 file
	dataset <- do.call("rbind", dataset)
	#' Summarise missing and distinct values for each column of a data frame
	#'
	#' @param df A data frame. Anything else is rejected with an error.
	#' @param include.nan If `TRUE`, also report the number of `NaN` values
	#'   (`nNan`) and their share of the rows (`rNan`).
	#' @return A data frame with one row per column of `df` (row names are the
	#'   column names of `df`; zero rows when `df` has no columns) and the
	#'   columns `col` (column position), `Count` (number of rows of `df`),
	#'   `nNA` / `rNA` (count and share of values for which `is.na()` is `TRUE`,
	#'   which includes `NaN`), optionally `nNan` / `rNan`, and `nUnique` /
	#'   `rUnique` (count and share of distinct values, with `NA` counted as a
	#'   value). Shares are truncated, not rounded, to four decimal places.
	NAsummary <- function(df, include.nan = FALSE) {
	  if (!is.data.frame(df)) {
	    stop("`df` must be a data frame", call. = FALSE)
	  }

	  # Share of `total`, cut off (not rounded) after four decimal places.
	  ratio <- function(count, total) trunc(count / total * 10000) / 10000

	  # Per-column count of the elements flagged by `pred`; vapply() keeps the
	  # result an integer vector even when `df` has no columns.
	  count_where <- function(pred) {
	    vapply(df, function(x) sum(pred(x)), integer(1))
	  }

	  n_cols <- ncol(df)
	  newdf <- data.frame(
	    col = seq_len(n_cols),           # seq_len() is empty for zero columns
	    Count = rep(nrow(df), n_cols),   # explicit rep(): data.frame() will not
	                                     # recycle a scalar to zero rows
	    nNA = count_where(is.na)
	  )
	  newdf$rNA <- ratio(newdf$nNA, newdf$Count)

	  if (include.nan) {
	    newdf$nNan <- count_where(is.nan)
	    newdf$rNan <- ratio(newdf$nNan, newdf$Count)
	  }

	  newdf$nUnique <- vapply(df, function(x) length(unique(x)), integer(1))
	  newdf$rUnique <- ratio(newdf$nUnique, newdf$Count)

	  rownames(newdf) <- colnames(df)
	  newdf
	}
	#' Fourth-spread of a vector
	#'
	#' Sorts `x` and returns the upper fourth minus the lower fourth. Each
	#' fourth is the mean of the one or two order statistics at depth
	#' `(floor((n + 1) / 2) + 1) / 2`, counted from the respective end of the
	#' sorted values.
	#'
	#' @param x A vector that `sort()` can order. Missing values are dropped
	#'   before the depths are computed.
	#' @return A 1 x 1 numeric matrix holding the spread; it holds `NA` when
	#'   `x` has no non-missing values.
	spread <- function(x) {
	  # sort() drops NA by default, so the depths must be based on the number
	  # of values that are left, not on length(x).
	  sorted <- sort(x)
	  n <- length(sorted)
	  if (n == 0L) {
	    return(matrix(NA_real_, nrow = 1L, ncol = 1L))
	  }
	  n.med <- (n + 1) / 2
	  n.fourth <- (floor(n.med) + 1) / 2
	  # Lower fourth: positions floor/ceiling of the depth; upper fourth: the
	  # same depth counted from the top.
	  y <- sorted[c(floor(n.fourth), ceiling(n.fourth),
	                floor(n + 1 - n.fourth), ceiling(n + 1 - n.fourth))]
	  # (y3 + y4) / 2 - (y1 + y2) / 2, kept as a 1 x 1 matrix product.
	  y %*% c(-1, -1, 1, 1) / 2
	}
	## Spread-versus-level analysis of the series `x` (expected to exist already).
	## Consecutive runs of 12 values form one group; groups are labelled 2010,
	## 2011, ... (presumably monthly observations starting in 2010 -- confirm).
	years <- floor((seq_along(x) - 1) / 12)
	z <- split(x, years)
	year_labels <- (min(years):max(years)) + 2010
	boxplot(z, names = year_labels, ylab = "y")
	## Spread vs. Level Plot: log spread of each group against its log median
	z.med <- unlist(lapply(z, median))
	z.spread <- unlist(lapply(z, spread))
	fit <- lm(log(z.spread) ~ log(z.med))
	plot(log(z.med), log(z.spread), xlab = "Log Level", ylab = "Log Spread",
	     main = "Spread vs. Level Plot")
	abline(fit, lwd = 2, col = "Red")
	## LAMBDA: power used for the re-expression, one minus the fitted slope
	lambda <- 1 - coef(fit)[2]
	boxplot(lapply(z, function(u) u^lambda), names = year_labels,
	        ylab = paste0("y^", round(lambda, 2)),
	        main = "Boxplots of Re-expressed Values")