Stephen Turner stephenturner

## plink_commands.txt
plink --bfile basename --read-freq mymafs.frq --genome --min 0.05 --genome-lists tmp.list001 tmp.list001 --out data.sub.1.1
plink --bfile basename --read-freq mymafs.frq --genome --min 0.05 --genome-lists tmp.list001 tmp.list002 --out data.sub.1.2
plink --bfile basename --read-freq mymafs.frq --genome --min 0.05 --genome-lists tmp.list001 tmp.list003 --out data.sub.1.3
plink --bfile basename --read-freq mymafs.frq --genome --min 0.05 --genome-lists tmp.list001 tmp.list004 --out data.sub.1.4
plink --bfile basename --read-freq mymafs.frq --genome --min 0.05 --genome-lists tmp.list002 tmp.list002 --out data.sub.2.2
plink --bfile basename --read-freq mymafs.frq --genome --min 0.05 --genome-lists tmp.list002 tmp.list003 --out data.sub.2.3
plink --bfile basename --read-freq mymafs.frq --genome --min 0.05 --genome-lists tmp.list002 tmp.list004 --out data.sub.2.4
plink --bfile basename --read-freq mymafs.frq --genome --min 0.05 --genome-lists tmp.list003 tmp.list003 --out data.sub.3.3
plink --bfile basename --read-fr

## parallelize_plink_ibd.pl
#!/usr/bin/perl
#
#Parses a .fam file and generates N number of sample list files depending on 2nd argument
#Generates PBS cluster submission files
#Justin Giles, Vanderbilt University, 2010
#
use strict;

if(scalar(@ARGV) != 3){
	print "\nPLEASE NOTE: EDIT THIS SCRIPT IN THE PBS AREA TO MODIFY IT TO YOUR NEEDS\n";

## submit_ibd_pbs.pl
#!/usr/bin/perl

use strict;

if(scalar(@ARGV) != 1){
	print "submit_ibd_pbs.pl <num_pbs_to_submit>\n";
	exit;
}

my $LIMIT = $ARGV[0];

## qqbase.r
# Originally posted at http://gettinggeneticsdone.blogspot.com/2010/07/qq-plots-of-p-values-in-r-using-base.html

# Define the function
ggd.qqplot = function(pvector, main=NULL, ...) {
    o = -log10(sort(pvector,decreasing=F))
    e = -log10( 1:length(o)/length(o) )
    plot(e,o,pch=19,cex=1, main=main, ...,
        xlab=expression(Expected~~-log[10](italic(p))),
        ylab=expression(Observed~~-log[10](italic(p))),
        xlim=c(0,max(e)), ylim=c(0,max(o)))

## logisticcurve.r
x=seq(-5,5,.01)
invlogit=function(x) exp(x)/(1+exp(x))
y=invlogit(x)
plot(x,y,pch=16,ylab=expression(paste(logit^{-1},(x))))
abline(v=0)
abline(h=.5)
text(.55,.55,expression(paste("Slope is ",beta/4)),adj=c(0,0))

## randomforestdemo.r
rm(list=ls(all=TRUE))
library(randomForest)

###############Classification################
data(iris)
head(iris)
iris.rf <- randomForest(Species~., data=iris, importance=T, proximity=T)
iris.rf.subset <- randomForest(Species~., data=iris[c(1:3,5)], importance=T, proximity=T)
iris.rf.subset2 <- randomForest(Species~. -Petal.Length -Petal.Width, data=iris, importance=T, proximity=T)
print(iris.rf)

## 2011-02-15 rf adiposity.r
library(randomForest)

###############################################################################
############################## load functions #################################
###############################################################################

# need to document this!
rfr2 = function(randomForestModel) {
	printoutput = capture.output(print(randomForestModel))
	varline = grep("explained",printoutput,value=TRUE)

## permute_column.r
# permutes a column in a data.frame, sets seed optionally
permute <- function (dataframe, columnToPermute="column", seed=NULL) {
	if (!is.null(seed)) set.seed(seed)
	colindex <- which(names(dataframe)==columnToPermute)
	permutedcol <- dataframe[ ,colindex][sample(1:nrow(dataframe))]
	dataframe[colindex] <- permutedcol
	return(dataframe)
}

## splitdf.randomize.r
#splitdf splits a data frame into a training and testing set.
#returns a list of two data frames: trainset and testset.
#you can optionally apply a random seed.
splitdf <- function(dataframe, seed=NULL, trainfrac=0.5) {
	if (trainfrac<=0 | trainfrac>=1) stop("Training fraction must be between 0 and 1, not inclusive")
	if (!is.null(seed)) set.seed(seed)
	index <- 1:nrow(dataframe)
	trainindex <- sample(index, trunc(length(index)/(1/trainfrac)))
	trainset <- dataframe[trainindex, ]
	testset <- dataframe[-trainindex, ]

## RF vs lm and FS 1.r
> #First define the function
> rsq <- function (yhat, y) 1 - sum((y - yhat)^2)/sum((y - mean(y))^2)
>
> # first, fit a stepwise model and test it on new data
> totfat1.intonly <- lm(CRC_FAT_TOT~1, data=totfat1)
> totfat1.full <- lm(CRC_FAT_TOT~., data=totfat1)
> totfat1.step <- step(totfat1.intonly, scope=list(upper=totfat1.full), trace=0)
> rsq(predict(totfat1.step,totfat2), totfat2$CRC_FAT_TOT)
[1] 0.6991431
>
	plink --bfile basename --read-freq mymafs.frq --genome --min 0.05 --genome-lists tmp.list001 tmp.list001 --out data.sub.1.1
	plink --bfile basename --read-freq mymafs.frq --genome --min 0.05 --genome-lists tmp.list001 tmp.list002 --out data.sub.1.2
	plink --bfile basename --read-freq mymafs.frq --genome --min 0.05 --genome-lists tmp.list001 tmp.list003 --out data.sub.1.3
	plink --bfile basename --read-freq mymafs.frq --genome --min 0.05 --genome-lists tmp.list001 tmp.list004 --out data.sub.1.4
	plink --bfile basename --read-freq mymafs.frq --genome --min 0.05 --genome-lists tmp.list002 tmp.list002 --out data.sub.2.2
	plink --bfile basename --read-freq mymafs.frq --genome --min 0.05 --genome-lists tmp.list002 tmp.list003 --out data.sub.2.3
	plink --bfile basename --read-freq mymafs.frq --genome --min 0.05 --genome-lists tmp.list002 tmp.list004 --out data.sub.2.4
	plink --bfile basename --read-freq mymafs.frq --genome --min 0.05 --genome-lists tmp.list003 tmp.list003 --out data.sub.3.3
	plink --bfile basename --read-fr
	#!/usr/bin/perl
	#
	#Parses a .fam file and generates N number of sample list files depending on 2nd argument
	#Generates PBS cluster submission files
	#Justin Giles, Vanderbilt University, 2010
	#
	use strict;

	if(scalar(@ARGV) != 3){
	print "\nPLEASE NOTE: EDIT THIS SCRIPT IN THE PBS AREA TO MODIFY IT TO YOUR NEEDS\n";
	#!/usr/bin/perl

	use strict;

	if(scalar(@ARGV) != 1){
	print "submit_ibd_pbs.pl <num_pbs_to_submit>\n";
	exit;
	}

	my $LIMIT = $ARGV[0];
	# Originally posted at http://gettinggeneticsdone.blogspot.com/2010/07/qq-plots-of-p-values-in-r-using-base.html

	# Define the function
	ggd.qqplot = function(pvector, main=NULL, ...) {
	o = -log10(sort(pvector,decreasing=F))
	e = -log10( 1:length(o)/length(o) )
	plot(e,o,pch=19,cex=1, main=main, ...,
	xlab=expression(Expected~~-log[10](italic(p))),
	ylab=expression(Observed~~-log[10](italic(p))),
	xlim=c(0,max(e)), ylim=c(0,max(o)))
	x=seq(-5,5,.01)
	invlogit=function(x) exp(x)/(1+exp(x))
	y=invlogit(x)
	plot(x,y,pch=16,ylab=expression(paste(logit^{-1},(x))))
	abline(v=0)
	abline(h=.5)
	text(.55,.55,expression(paste("Slope is ",beta/4)),adj=c(0,0))
	rm(list=ls(all=TRUE))
	library(randomForest)

	###############Classification################
	data(iris)
	head(iris)
	iris.rf <- randomForest(Species~., data=iris, importance=T, proximity=T)
	iris.rf.subset <- randomForest(Species~., data=iris[c(1:3,5)], importance=T, proximity=T)
	iris.rf.subset2 <- randomForest(Species~. -Petal.Length -Petal.Width, data=iris, importance=T, proximity=T)
	print(iris.rf)
	library(randomForest)

	###############################################################################
	############################## load functions #################################
	###############################################################################

	# need to document this!
	rfr2 = function(randomForestModel) {
	printoutput = capture.output(print(randomForestModel))
	varline = grep("explained",printoutput,value=TRUE)
	# permutes a column in a data.frame, sets seed optionally
	permute <- function (dataframe, columnToPermute="column", seed=NULL) {
	if (!is.null(seed)) set.seed(seed)
	colindex <- which(names(dataframe)==columnToPermute)
	permutedcol <- dataframe[ ,colindex][sample(1:nrow(dataframe))]
	dataframe[colindex] <- permutedcol
	return(dataframe)
	}
	#splitdf splits a data frame into a training and testing set.
	#returns a list of two data frames: trainset and testset.
	#you can optionally apply a random seed.
	splitdf <- function(dataframe, seed=NULL, trainfrac=0.5) {
	if (trainfrac<=0 \| trainfrac>=1) stop("Training fraction must be between 0 and 1, not inclusive")
	if (!is.null(seed)) set.seed(seed)
	index <- 1:nrow(dataframe)
	trainindex <- sample(index, trunc(length(index)/(1/trainfrac)))
	trainset <- dataframe[trainindex, ]
	testset <- dataframe[-trainindex, ]
	> #First define the function
	> rsq <- function (yhat, y) 1 - sum((y - yhat)^2)/sum((y - mean(y))^2)
	>
	> # first, fit a stepwise model and test it on new data
	> totfat1.intonly <- lm(CRC_FAT_TOT~1, data=totfat1)
	> totfat1.full <- lm(CRC_FAT_TOT~., data=totfat1)
	> totfat1.step <- step(totfat1.intonly, scope=list(upper=totfat1.full), trace=0)
	> rsq(predict(totfat1.step,totfat2), totfat2$CRC_FAT_TOT)
	[1] 0.6991431
	>