# Author: Stephen Turner (stephenturner)

## qqbase.r
# Originally posted at http://gettinggeneticsdone.blogspot.com/2010/07/qq-plots-of-p-values-in-r-using-base.html

# Define the function
# Draw a QQ plot of observed versus expected -log10(p) with base graphics.
#   pvector: numeric vector of p-values.
#   main:    plot title, handed to plot().
#   ...:     any further arguments, forwarded to plot().
# NOTE(review): no closing brace for this function appears before the next
# section header, so the definition looks truncated here -- confirm against
# the original script.
ggd.qqplot = function(pvector, main=NULL, ...) {
    # observed values: p-values sorted ascending, then -log10 transformed
    o = -log10(sort(pvector,decreasing=F))
    # expected values: -log10 of the evenly spaced quantiles i/n, i = 1..n
    e = -log10( 1:length(o)/length(o) )
    # both axes start at 0 and extend to the largest value on that axis
    plot(e,o,pch=19,cex=1, main=main, ...,
        xlab=expression(Expected~~-log[10](italic(p))),
        ylab=expression(Observed~~-log[10](italic(p))),
        xlim=c(0,max(e)), ylim=c(0,max(o)))

## logisticcurve.r
# Plot the inverse-logit (logistic) curve over [-5, 5] with reference lines
# at x = 0 and y = 0.5.
x <- seq(-5, 5, 0.01)

# Inverse logit: maps any real number into [0, 1].
# plogis() is used instead of exp(x) / (1 + exp(x)) because the hand-written
# form evaluates to Inf / Inf = NaN once exp(x) overflows for large x.
invlogit <- function(x) plogis(x)

y <- invlogit(x)
plot(x, y, pch = 16, ylab = expression(paste(logit^{-1}, (x))))
abline(v = 0)
abline(h = 0.5)
# label placed just above and to the right of the curve's midpoint (0, 0.5)
text(0.55, 0.55, expression(paste("Slope is ", beta / 4)), adj = c(0, 0))

## pvalue-from-lm-object.r
# Function to extract the overall ANOVA p-value out of a linear model object
# Extract the overall F-test p-value from a fitted linear model.
#
# Args:
#   modelobject: an object inheriting from class "lm".
# Returns:
#   A single numeric p-value with all attributes stripped. If summary() of the
#   model carries no F statistic, NA_real_ is returned with a warning.
# Raises:
#   An error when modelobject does not inherit from "lm".
lmp <- function (modelobject) {
	# inherits() copes with class vectors of length > 1; comparing class()
	# with != would hand if() a multi-element condition for such objects.
	if (!inherits(modelobject, "lm")) stop("Not an object of class 'lm' ")
	f <- summary(modelobject)$fstatistic
	if (is.null(f)) {
		warning("No F statistic available for this model; returning NA", call. = FALSE)
		return(NA_real_)
	}
	# f holds the F value followed by numerator and denominator df
	p <- pf(f[1], f[2], f[3], lower.tail = FALSE)
	# drop the name inherited from f so callers get a bare number
	attributes(p) <- NULL
	p
}

# simulate some data

## forestplot.r
# d is a data frame with 4 columns
# d$x gives variable names
# d$y gives center point
# d$ylo gives lower limits
# d$yhi gives upper limits
# Build a horizontal point-range (forest) plot with ggplot2.
#   d:    data frame with columns x, y, ylo and yhi (see the notes above).
#   xlab: default "Odds Ratio"; not used in the lines visible here.
#   ylab: default "Study"; not used in the lines visible here.
forestplot <- function(d, xlab="Odds Ratio", ylab="Study"){
    require(ggplot2)
    # point at y with a range from ylo to yhi for every x
    p <- ggplot(d, aes(x=x, y=y, ymin=ylo, ymax=yhi)) +
		geom_pointrange() +
		# flip the axes so the ranges are drawn horizontally
		coord_flip() +
		# NOTE(review): the expression ends on a dangling `+` and the function is
		# never closed in this file; the rest of the definition is missing.

## randomforestdemo.r
# The workspace is deliberately NOT cleared here: rm(list = ls(all = TRUE))
# would delete every function and object defined earlier in this file.
library(randomForest)

############### Classification ################
data(iris)
head(iris)

# Full model: Species predicted from every other column.
iris.rf <- randomForest(Species ~ ., data = iris,
                        importance = TRUE, proximity = TRUE)

# Same formula, fitted on columns 1-3 and 5 of iris only.
iris.rf.subset <- randomForest(Species ~ ., data = iris[c(1:3, 5)],
                               importance = TRUE, proximity = TRUE)

# Full data, with both Petal.* predictors removed through the formula.
iris.rf.subset2 <- randomForest(Species ~ . - Petal.Length - Petal.Width,
                                data = iris,
                                importance = TRUE, proximity = TRUE)
print(iris.rf)

## 2011-02-15 rf adiposity.r
library(randomForest)

###############################################################################
############################## load functions #################################
###############################################################################

# need to document this!
# Works on the printed summary of a model object (named randomForestModel).
# NOTE(review): only the first two statements are visible; the function has no
# return value or closing brace in this file, so its result cannot be
# documented from here.
rfr2 = function(randomForestModel) {
	# capture what print() writes for the model, one string per output line
	printoutput = capture.output(print(randomForestModel))
	# keep only the line(s) of that output containing the word "explained"
	varline = grep("explained",printoutput,value=TRUE)

## permute_column.r
# permutes a column in a data.frame, sets seed optionally
# Randomly reorder the values of one column of a data frame.
#
# Args:
#   dataframe:       the data frame to modify (a modified copy is returned).
#   columnToPermute: name of the column whose values are shuffled.
#   seed:            optional seed passed to set.seed() for reproducibility.
# Returns:
#   The data frame with the named column permuted; other columns untouched.
# Raises:
#   An error unless exactly one column is named columnToPermute.
permute <- function (dataframe, columnToPermute="column", seed=NULL) {
	if (!is.null(seed)) set.seed(seed)
	colindex <- which(names(dataframe) == columnToPermute)
	# a missing or duplicated name would otherwise be mishandled silently
	if (length(colindex) != 1L) {
		stop("Expected exactly one column named '", columnToPermute,
			"', found ", length(colindex), call. = FALSE)
	}
	# seq_len() is safe for zero rows, where 1:nrow() would give c(1, 0)
	neworder <- sample(seq_len(nrow(dataframe)))
	# [[ always extracts the column itself, whatever the data frame flavour
	dataframe[[colindex]] <- dataframe[[colindex]][neworder]
	dataframe
}

## ggd_rf_example.r
# Load the built-in iris data set into the workspace.
data(iris)

# Report the number of rows (150 for this data set).
nrow(iris)

# Show the first few rows.
head(iris)

# splitdf function will return a list of training and testing sets

## splitdf.randomize.r
#splitdf splits a data frame into a training and testing set.
#returns a list of two data frames: trainset and testset.
#you can optionally apply a random seed.
#   dataframe: data frame to split.
#   seed:      optional seed passed to set.seed().
#   trainfrac: fraction of rows for the training set; must lie strictly
#              between 0 and 1.
# NOTE(review): the function has no return statement or closing brace in this
# file; the definition appears truncated.
splitdf <- function(dataframe, seed=NULL, trainfrac=0.5) {
	if (trainfrac<=0 | trainfrac>=1) stop("Training fraction must be between 0 and 1, not inclusive")
	if (!is.null(seed)) set.seed(seed)
	index <- 1:nrow(dataframe)
	# draw trunc(n * trainfrac) row numbers without replacement
	trainindex <- sample(index, trunc(length(index)/(1/trainfrac)))
	trainset <- dataframe[trainindex, ]
	# every row not drawn for training goes to the test set
	testset <- dataframe[-trainindex, ]

## propmiss.r
# Summarise missing values per column of a data frame.
# NOTE(review): this function is never closed in the file and its return value
# is not visible; only the steps shown below are documented.
propmiss <- function(dataframe) {
	# for each column: count of NAs, total length, and their ratio
	m <- sapply(dataframe, function(x) {
		data.frame(
			nmiss=sum(is.na(x)),
			n=length(x),
			propmiss=sum(is.na(x))/length(x)
		)
	})
	# transpose the sapply() result, then wrap it in a data frame
	d <- data.frame(t(m))
	# unlist each column of d
	d <- sapply(d, unlist)
	# Originally posted at http://gettinggeneticsdone.blogspot.com/2010/07/qq-plots-of-p-values-in-r-using-base.html

	# Define the function
	ggd.qqplot = function(pvector, main=NULL, ...) {
	o = -log10(sort(pvector,decreasing=F))
	e = -log10( 1:length(o)/length(o) )
	plot(e,o,pch=19,cex=1, main=main, ...,
	xlab=expression(Expected~~-log[10](italic(p))),
	ylab=expression(Observed~~-log[10](italic(p))),
	xlim=c(0,max(e)), ylim=c(0,max(o)))
	x=seq(-5,5,.01)
	invlogit=function(x) exp(x)/(1+exp(x))
	y=invlogit(x)
	plot(x,y,pch=16,ylab=expression(paste(logit^{-1},(x))))
	abline(v=0)
	abline(h=.5)
	text(.55,.55,expression(paste("Slope is ",beta/4)),adj=c(0,0))
	# Function to extract the overall ANOVA p-value out of a linear model object
	# Returns the overall F-test p-value of a fitted "lm" object as a bare
	# number; NA_real_ (with a warning) if summary() carries no F statistic.
	lmp <- function (modelobject) {
		# inherits() copes with class vectors of length > 1; comparing class()
		# with != would hand if() a multi-element condition for such objects.
		if (!inherits(modelobject, "lm")) stop("Not an object of class 'lm' ")
		f <- summary(modelobject)$fstatistic
		if (is.null(f)) {
			warning("No F statistic available for this model; returning NA", call. = FALSE)
			return(NA_real_)
		}
		# f holds the F value followed by numerator and denominator df
		p <- pf(f[1], f[2], f[3], lower.tail = FALSE)
		# drop the name inherited from f so callers get a bare number
		attributes(p) <- NULL
		p
	}

	# simulate some data
	# d is a data frame with 4 columns
	# d$x gives variable names
	# d$y gives center point
	# d$ylo gives lower limits
	# d$yhi gives upper limits
	forestplot <- function(d, xlab="Odds Ratio", ylab="Study"){
	require(ggplot2)
	p <- ggplot(d, aes(x=x, y=y, ymin=ylo, ymax=yhi)) +
	geom_pointrange() +
	coord_flip() +
	rm(list=ls(all=TRUE))
	library(randomForest)

	###############Classification################
	data(iris)
	head(iris)
	iris.rf <- randomForest(Species~., data=iris, importance=T, proximity=T)
	iris.rf.subset <- randomForest(Species~., data=iris[c(1:3,5)], importance=T, proximity=T)
	iris.rf.subset2 <- randomForest(Species~. -Petal.Length -Petal.Width, data=iris, importance=T, proximity=T)
	print(iris.rf)
	library(randomForest)

	###############################################################################
	############################## load functions #################################
	###############################################################################

	# need to document this!
	rfr2 = function(randomForestModel) {
	printoutput = capture.output(print(randomForestModel))
	varline = grep("explained",printoutput,value=TRUE)
	# permutes a column in a data.frame, sets seed optionally
	# Returns the data frame with the values of the column named
	# columnToPermute randomly reordered; errors unless exactly one column
	# carries that name. seed, if given, is passed to set.seed().
	permute <- function (dataframe, columnToPermute="column", seed=NULL) {
		if (!is.null(seed)) set.seed(seed)
		colindex <- which(names(dataframe) == columnToPermute)
		# a missing or duplicated name would otherwise be mishandled silently
		if (length(colindex) != 1L) {
			stop("Expected exactly one column named '", columnToPermute,
				"', found ", length(colindex), call. = FALSE)
		}
		# seq_len() is safe for zero rows, where 1:nrow() would give c(1, 0)
		neworder <- sample(seq_len(nrow(dataframe)))
		# [[ always extracts the column itself, whatever the data frame flavour
		dataframe[[colindex]] <- dataframe[[colindex]][neworder]
		dataframe
	}
	#load the iris data
	data(iris)

	# this data has 150 rows
	nrow(iris)

	# look at the first few
	head(iris)

	# splitdf function will return a list of training and testing sets
	#splitdf splits a data frame into a training and testing set.
	#returns a list of two data frames: trainset and testset.
	#you can optionally apply a random seed.
	# NOTE(review): this copy of splitdf has no return statement or closing
	# brace, and the condition on the next line contains `\|`, which is not
	# valid R (it looks like an escaped `|` left over from markdown) -- compare
	# with the earlier copy of this function, which uses a plain `|`.
	splitdf <- function(dataframe, seed=NULL, trainfrac=0.5) {
	if (trainfrac<=0 \| trainfrac>=1) stop("Training fraction must be between 0 and 1, not inclusive")
	if (!is.null(seed)) set.seed(seed)
	index <- 1:nrow(dataframe)
	# draw trunc(n * trainfrac) row numbers without replacement
	trainindex <- sample(index, trunc(length(index)/(1/trainfrac)))
	trainset <- dataframe[trainindex, ]
	# every row not drawn for training goes to the test set
	testset <- dataframe[-trainindex, ]
	propmiss <- function(dataframe) {
	m <- sapply(dataframe, function(x) {
	data.frame(
	nmiss=sum(is.na(x)),
	n=length(x),
	propmiss=sum(is.na(x))/length(x)
	)
	})
	d <- data.frame(t(m))
	d <- sapply(d, unlist)