Skip to content

Instantly share code, notes, and snippets.

@digdeep
Created May 26, 2014 13:54
Show Gist options
  • Save digdeep/b3bf591a2d5e8e026b87 to your computer and use it in GitHub Desktop.
Save digdeep/b3bf591a2d5e8e026b87 to your computer and use it in GitHub Desktop.
Functions for working with data in R
## Load comma-separated files from a working directory into R.
## NOTE: the path below is a placeholder; point it at the directory that
## actually holds the data files before running.
setwd("/YOUR/WORKING/DIRECTORY")

## ---- Variant 1: every file in the working directory ----
## list.files() with no pattern returns ALL entries, so this variant assumes
## the directory contains nothing but comma-separated files with a header row.
file_list <- list.files()

## for individual files: one data frame per file, kept together in a list
dataset <- lapply(file_list, function(path) {
  read.table(path, header = TRUE, sep = ",")
})

## If your CSV column structure is the same across all files, stack them
## row-wise into a single data frame instead (this overwrites the list above).
dataset <- do.call(
  "rbind",
  lapply(file_list, function(path) {
    read.table(path, header = TRUE, sep = ",")
  })
)

## ---- Variant 2: only files whose name ends in ".csv" ----
## `pattern` is a regular expression, not a shell glob: "*.csv" would also
## match names such as "mycsv.txt" or "data.csv.bak". Anchor on the extension.
temp <- list.files(pattern = "\\.csv$")

## for individual files: one data frame per file, kept together in a list
dataset <- lapply(temp, function(path) {
  read.table(path, header = TRUE, sep = ",")
})
dataset[1] ## length-one list holding the first file; dataset[[1]] is the data frame itself

## If your CSV column structure is the same across all csv's, bind them all
## into one data frame (this overwrites the list above).
dataset <- do.call(
  "rbind",
  lapply(temp, function(path) {
    read.table(path, header = TRUE, sep = ",")
  })
)
#' Summarise missing and distinct values for every column of a data frame
#'
#' Builds one row per column of `df` with the following fields:
#'   * `col`     - column position in `df`
#'   * `Count`   - number of rows in `df`
#'   * `nNA`     - number of elements for which `is.na()` is TRUE
#'                 (note that `is.na()` is also TRUE for `NaN`)
#'   * `rNA`     - `nNA / Count`, truncated (not rounded) to 4 decimals
#'   * `nNan`, `rNan` - same as above using `is.nan()`; only present when
#'                 `include.nan = TRUE`
#'   * `nUnique` - number of distinct values (`NA` counts as a value)
#'   * `rUnique` - `nUnique / Count`, truncated to 4 decimals
#'
#' A data frame with zero columns yields a zero-row summary. With zero rows
#' the ratio columns are `NaN` (0 / 0).
#'
#' @param df A data frame.
#' @param include.nan Logical; add the `nNan` / `rNan` columns?
#' @return A data frame whose row names are the column names of `df`.
NAsummary <- function(df, include.nan = FALSE) {
  n_rows <- nrow(df)
  n_cols <- ncol(df)

  # Truncate a ratio to four decimal places (truncation, not rounding).
  trunc4 <- function(ratio) trunc(ratio * 10000) / 10000

  # seq_len()/rep() keep every column the same length even when df has no
  # columns, where 1:ncol(df) would produce c(1, 0).
  newdf <- data.frame(
    col = seq_len(n_cols),
    Count = rep(n_rows, n_cols),
    nNA = vapply(df, function(x) sum(is.na(x)), integer(1))
  )
  newdf$rNA <- trunc4(newdf$nNA / newdf$Count)

  if (include.nan) {
    newdf$nNan <- vapply(df, function(x) sum(is.nan(x)), integer(1))
    newdf$rNan <- trunc4(newdf$nNan / newdf$Count)
  }

  newdf$nUnique <- vapply(df, function(x) length(unique(x)), integer(1))
  newdf$rUnique <- trunc4(newdf$nUnique / newdf$Count)

  rownames(newdf) <- colnames(df)
  newdf
}
#' Fourth-spread of a numeric vector
#'
#' Sorts `x`, locates the lower and upper "fourths" at depth
#' `(floor((n + 1) / 2) + 1) / 2` from each end (averaging the two
#' neighbouring order statistics when that depth is fractional), and returns
#' upper fourth minus lower fourth.
#'
#' Missing values are dropped before the depths are computed, so that the
#' indices refer to the same values that `sort()` keeps.
#'
#' @param x A numeric vector.
#' @return A 1 x 1 numeric matrix holding the spread; `NA` when `x` has no
#'   non-missing values.
spread <- function(x) {
  # sort() silently discards NA, so count only the values that survive it.
  x <- x[!is.na(x)]
  n <- length(x)
  if (n == 0) {
    # Nothing to measure; keep the 1 x 1 matrix shape of the normal result.
    return(matrix(NA_real_, nrow = 1, ncol = 1))
  }

  n.med <- (n + 1) / 2                  # depth of the median
  n.fourth <- (floor(n.med) + 1) / 2    # depth of the fourths

  # Two order statistics bracket each fourth; they coincide when the depth
  # is a whole number.
  idx <- c(
    floor(n.fourth), ceiling(n.fourth),
    floor(n + 1 - n.fourth), ceiling(n + 1 - n.fourth)
  )
  fourths <- sort(x)[idx]

  # (upper pair - lower pair) / 2 == mean(upper pair) - mean(lower pair).
  fourths %*% c(-1, -1, 1, 1) / 2
}
# Spread-vs-level analysis of the series `x`, using `spread()` on each group.
# NOTE(review): the grouping below presumes `x` holds consecutive monthly
# observations whose first year is 2010 (12 values per group, labels offset
# by 2010) -- confirm against the data actually loaded.

# Group index: 0 for the first 12 observations, 1 for the next 12, and so on.
# seq_along() is safe for an empty `x`, unlike 1:length(x).
years <- floor((seq_along(x) - 1) / 12)
z <- split(x, years)

# Axis labels shared by both boxplots.
year_labels <- (min(years):max(years)) + 2010
boxplot(z, names = year_labels, ylab = "y")

# Spread vs. Level Plot
# Regress log(spread) on log(median) across the groups.
z.med <- unlist(lapply(z, median))
z.spread <- unlist(lapply(z, spread))
fit <- lm(log(z.spread) ~ log(z.med))
plot(log(z.med), log(z.spread),
  xlab = "Log Level", ylab = "Log Spread",
  main = "Spread vs. Level Plot"
)
abline(fit, lwd = 2, col = "Red")

# LAMBDA
# Power for re-expressing the data: one minus the fitted slope.
lambda <- 1 - coef(fit)[2]
boxplot(lapply(z, function(u) u^lambda),
  names = year_labels,
  ylab = paste0("y^", round(lambda, 2)),
  main = "Boxplots of Re-expressed Values"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment