Skip to content

Instantly share code, notes, and snippets.

@stephenturner
Created August 12, 2011 03:02
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 4 You must be signed in to fork a gist
  • Save stephenturner/1141346 to your computer and use it in GitHub Desktop.
Save stephenturner/1141346 to your computer and use it in GitHub Desktop.
Rprofile.R
# To source this file into an environment to avoid cluttering the global workspace, put this in Rprofile:
# my.env <- new.env(); sys.source("C:/PathTo/THIS_FILE.r", my.env); attach(my.env)
#-----------------------------------------------------------------------
# Load packages, set options and cwd, set up database connection
#-----------------------------------------------------------------------
## Load packages
# require(ggplot2) #plotting
# require(plyr) #data manipulation
# require(reshape) #data manipulation
# require(sqldf) #manipulate data frame with SQL
## Sets the working directory to C:/R
setwd("~/R")
## Don't show those silly significanct stars
options(show.signif.stars=FALSE)
## Do you want to automatically convert strings to factor variables in a data.frame?
options(stringsAsFactors=TRUE)
## Hard code the US repository for CRAN so it doesn't ask me every time.
r <- getOption("repos")
r["CRAN"] <- "http://cran.us.r-project.org"
options(repos = r)
rm(r)
## Some SQLite stuff I don't use any more because I switched to MySQL
# require(RSQLite)
# channel <- dbConnect(SQLite(), "C:/cygwin/home/sturner/dbs/sdt.sqlite")
# query <- function(...) dbGetQuery(channel,...)
## Set up ODBC connection for MySQL localhost, and make it easy to query a database with query() function.
require(RODBC) # The rprofile script will fail here if you don't have RODBC installed.
channel <- odbcConnect("localhost")
query <- function(...) sqlQuery(channel, ...)
#-----------------------------------------------------------------------
# Functions
#-----------------------------------------------------------------------
## Transpose a numeric data frame with ID in first column
tdf <- function(d) {
row.names(d) <- d[[1]]
d[[1]] <- NULL
d <- as.data.frame(t(d))
d$id <- row.names(d)
d <- cbind(d[ncol(d)], d[-ncol(d)])
row.names(d) <- NULL
d
}
## Convert selected columns of a data frame to factor variables
factorcols <- function(d, ...) lapply(d, function(x) factor(x, ...))
## Returns a logical vector TRUE for elements of X not in Y
"%nin%" <- function(x, y) !(x %in% y)
## Returns names(df) in single column, numbered matrix format.
n <- function(df) matrix(names(df))
## Single character shortcuts for summary() and head().
s <- base::summary
h <- utils::head
## ht==headtail, i.e., show the first and last 10 items of an object
ht <- function(d) rbind(head(d,10),tail(d,10))
## Show the first 5 rows and first 5 columns of a data frame or matrix
hh <- function(d) d[1:5,1:5]
## Read data on clipboard.
read.cb <- function(...) {
ismac <- Sys.info()[1]=="Darwin"
if (!ismac) read.table(file="clipboard", ...)
else read.table(pipe("pbpaste"), ...)
}
## Open current directory on mac
macopen <- function(...) system("open .")
## name("test.png") results in "C:/R/2010-04-20-test.png" if running this in C:/R on April 20 2010.
name <- function(filename="filename") paste(getwd(),"/",Sys.Date(),"-",filename,sep="")
## Takes a dataframe and a column name, and moves that column to the front of the DF.
moveColFront <- function(d=dataframe, colname="colname") {
index <- match(colname, names(d))
cbind(d[index],d[-index])
}
## Permutes a column in a data.frame, sets seed optionally
permute <- function (dataframe, columnToPermute="column", seed=NULL) {
if (!is.null(seed)) set.seed(seed)
colindex <- which(names(dataframe)==columnToPermute)
permutedcol <- dataframe[ ,colindex][sample(1:nrow(dataframe))]
dataframe[colindex] <- permutedcol
return(dataframe)
}
## Summarize missing data in a data frame. Return a list (lpropmiss) or data frame (propmiss)
lpropmiss <- function(dataframe) lapply(dataframe,function(x) data.frame(nmiss=sum(is.na(x)), n=length(x), propmiss=sum(is.na(x))/length(x)))
propmiss <- function(dataframe) {
m <- sapply(dataframe, function(x) {
data.frame(
nmiss=sum(is.na(x)),
n=length(x),
propmiss=sum(is.na(x))/length(x)
)
})
d <- data.frame(t(m))
d <- sapply(d, unlist)
d <- as.data.frame(d)
d$variable <- row.names(d)
row.names(d) <- NULL
d <- cbind(d[ncol(d)],d[-ncol(d)])
return(d[order(d$propmiss), ])
}
## Make a pretty QQ plot of p-values
qq = function(pvector, ...) {
if (!is.numeric(pvector)) stop("D'oh! P value vector is not numeric.")
pvector <- pvector[!is.na(pvector) & pvector<1 & pvector>0]
o = -log10(sort(pvector,decreasing=F))
#e = -log10( 1:length(o)/length(o) )
e = -log10( ppoints(length(pvector) ))
plot(e,o,pch=19,cex=1, xlab=expression(Expected~~-log[10](italic(p))), ylab=expression(Observed~~-log[10](italic(p))), xlim=c(0,max(e)), ylim=c(0,max(o)), ...)
abline(0,1,col="red")
}
## Draw a histogram with normal overlay (From http://www.statmethods.net/graphs/density.html)
histnormal <- function(d, main=NULL, xlab=NULL, breaks="FD", ...) {
if (any(is.na(d))) warning(paste(sum(is.na(d)), "missing values")); d <- na.omit(d)
h <- hist(d, plot=FALSE, breaks=breaks, ...)
x <- seq(min(d), max(d), length=40)
y <- dnorm(x, mean=mean(d), sd=sd(d))
y <- y*diff(h$mids[1:2])*length(d)
hist(d, col="gray50", main=main, xlab=xlab, ylim=c(0,max(y)), breaks=breaks,...)
lines(x,y, col="blue", lwd=2)
rug(x)
}
## Draw a histogram with density overlay
histdensity <- function(x, main=NULL, breaks="FD", ...) {
if (any(is.na(x))) warning(paste(sum(is.na(x)), "missing values")); x <- na.omit(x)
hist(x, col="gray50", probability=TRUE, breaks=breaks, main=main, ...)
lines(density(x, na.rm = TRUE), col = "blue", lwd=2)
rug(x)
}
## Plot scatterplot with trendline and confidence interval (From http://tinyurl.com/3bvrth7)
scatterci <- function(x, y, ...) {
plot(x, y, ...)
mylm <- lm(y~x)
abline(mylm, col="blue")
x=sort(x)
prd<-predict(mylm,newdata=data.frame(x=x),interval = c("confidence"), level = 0.95)
lines(x,prd[,2],col="blue",lty=3)
lines(x,prd[,3],col="blue",lty=3)
}
## Get the proportion variation explained. See this website for more details: http://goo.gl/jte8X
rsq <- function(predicted, actual) 1-sum((actual-predicted)^2)/sum((actual-mean(actual))^2)
## Correlation matrix with p-values. See http://goo.gl/nahmV for documentation of this function
cor.prob <- function(X, dfr = nrow(X) - 2) {
R <- cor(X)
above <- row(R) < col(R)
r2 <- R[above]^2
Fstat <- r2 * dfr / (1 - r2)
R[above] <- 1 - pf(Fstat, 1, dfr)
R[row(R)==col(R)]<-NA
R
}
## This function accepts a GLM object and does a LR chi-square test on the fit.
lrt <- function (modelobject) {
lrtest.chi2 <- model$null.deviance - model$deviance # Difference in deviance between model with intercept only and full model. This is the likelihood ratio test statistic (-2(log(L))).
lrtest.df <- model$df.null - model$df.residual # Difference in DF. Make sure this equals the number of predictors in the model!
fitpval <- 1-pchisq(lrtest.chi2,lrtest.df)
cat("Likelihood ratio test on model fit:\n\n")
data.frame(lrtest.chi2=lrtest.chi2,lrtest.df=lrtest.df,fitpval=fitpval) #Output gives you the chisquare, df, and p-value.
}
## This function does the same thing as lrtest in the Design package, but doesn't do as much checking.
## Remember, the lrt has to test the same model (model fit on same observations)
## Also the drop1(fullmodel,test="Chisq") does something similar.
lrt2 <- function (full,reduced) {
if (reduced$deviance<=full$deviance) stop ("Reduced model not worse than full.")
if (reduced$df.residual<=full$df.residual) stop ("Reduced model doesn't have more degrees of freedom.")
lrtest.chi2 <- reduced$deviance-full$deviance
lrtest.df <- reduced$df.residual - full$df.residual
fitpval <- 1-pchisq(lrtest.chi2,lrtest.df)
cat("Likelihood ratio test on two models:\n\n")
data.frame(lrtest.chi2=lrtest.chi2,lrtest.df=lrtest.df,fitpval=fitpval)
}
## This gets the overall anova p-value out of a linear model object
lmp <- function (modelobject) {
if (class(modelobject) != "lm") stop("Not an object of class 'lm' ")
f <- summary(modelobject)$fstatistic
p <- pf(f[1],f[2],f[3],lower.tail=F)
attributes(p) <- NULL
return(p)
}
## Function for arranging ggplots. use png(); arrange(p1, p2, ncol=1); dev.off() to save.
vp.layout <- function(x, y) viewport(layout.pos.row=x, layout.pos.col=y)
arrange_ggplot2 <- function(..., nrow=NULL, ncol=NULL, as.table=FALSE) {
dots <- list(...)
n <- length(dots)
if(is.null(nrow) & is.null(ncol)) { nrow = floor(n/2) ; ncol = ceiling(n/nrow)}
if(is.null(nrow)) { nrow = ceiling(n/ncol)}
if(is.null(ncol)) { ncol = ceiling(n/nrow)}
## NOTE see n2mfrow in grDevices for possible alternative
grid.newpage()
pushViewport(viewport(layout=grid.layout(nrow,ncol) ) )
ii.p <- 1
for(ii.row in seq(1, nrow)){
ii.table.row <- ii.row
if(as.table) {ii.table.row <- nrow - ii.table.row + 1}
for(ii.col in seq(1, ncol)){
ii.table <- ii.p
if(ii.p > n) break
print(dots[[ii.table]], vp=vp.layout(ii.table.row, ii.col))
ii.p <- ii.p + 1
}
}
}
## Imputes the median value of a vector, matrix, or data frame.
## Stolen from na.roughfix function in the randomForest package.
na.roughfix <- function (object=NULL, ...) {
if (class(object) == "data.frame") {
isfac <- sapply(object, is.factor)
isnum <- sapply(object, is.numeric)
if (any(!(isfac | isnum)))
stop("dfMedianImpute only works for numeric or factor")
roughfix <- function(x) {
if (any(is.na(x))) {
if (is.factor(x)) {
freq <- table(x)
x[is.na(x)] <- names(freq)[which.max(freq)]
}
else {
x[is.na(x)] <- median(x, na.rm = TRUE)
}
}
x
}
object[] <- lapply(object, roughfix)
return(object)
}
else if(is.atomic(object)) {
d <- dim(object)
if (length(d) > 2)
stop("vectorMedianImpute can't handle objects with more than two dimensions")
if (all(!is.na(object)))
return(object)
if (!is.numeric(object))
stop("vectorMedianImpute can only deal with numeric data.")
if (length(d) == 2) {
hasNA <- which(apply(object, 2, function(x) any(is.na(x))))
for (j in hasNA) object[is.na(object[, j]), j] <- median(object[,
j], na.rm = TRUE)
}
else {
object[is.na(object)] <- median(object, na.rm = TRUE)
}
return(object)
}
else stop("Object is not a data frame or atomic vector")
}
## Makes a better scatterplot matrix.
## Stolen from the PerformanceAnalytics package: http://cran.r-project.org/web/packages/PerformanceAnalytics/index.html
## Also see http://moderntoolmaking.blogspot.com/2011/08/graphically-analyzing-variable.html
## To color code points based on levels of a factor, use these args:
## pairs.perfan(d, bg=c("red","blue")[d$factor], pch=21)
betterpairs <- function (R, histogram = TRUE, ...)
{
x=as.matrix(R) # in PerformanceAnalytics: x = checkData(R, method = "matrix")
if (mode(x)!="numeric") stop("Must pass in only numeric values")
panel.cor <- function(x, y, digits = 2, prefix = "", use = "pairwise.complete.obs", cex.cor, ...) {
usr <- par("usr")
on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y, use = use))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste(prefix, txt, sep = "")
if (missing(cex.cor)) cex <- 0.8/strwidth(txt)
test <- cor.test(x, y)
Signif <- symnum(test$p.value, corr = FALSE, na = FALSE, cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1), symbols = c("***", "**", "*", ".", " "))
text(0.5, 0.5, txt, cex = cex * r)
text(0.8, 0.8, Signif, cex = cex, col = 2)
}
f <- function(t) dnorm(t, mean = mean(x), sd = sd(x))
# Useful function for histogram showing density overlay and rug
hist.panel = function(x, ...) {
par(new = TRUE)
hist(x, col = "light gray", probability = TRUE, axes = FALSE, main = "", breaks = "FD")
lines(density(x, na.rm = TRUE), col = "red", lwd = 1)
rug(x)
}
if (histogram) pairs(x, gap = 0, lower.panel = panel.smooth, upper.panel = panel.cor, diag.panel = hist.panel, ...)
else pairs(x, gap = 0, lower.panel = panel.smooth, upper.panel = panel.cor, ...)
}
# Did you make it this far?
message("\n******************************\nSuccessfully loaded Rprofile.r\n******************************")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment