jnhutchinson

## custom_gsea.R
# run clusterprofiler
all.results.gsea <- all.results[[1]]$stats.eset # these are all the statistical results from running DESeq2
lfcs <- all.results.gsea$log2FoldChange # the log fold changes from the comparison
names(lfcs) <- all.results.gsea$entrezID # I previously annotated the stats results with the entrezids using biomart
lfcs <- sort(lfcs, decreasing=TRUE)
results = GSEA(lfcs, TERM2GENE=set2gene, minGSSize=200, maxGSSize=1700)  # my set2gene was just a two column dataframe with the geneset id in the first column and the entrez gene ids in the second

## RUVseq.Rmd
# Batch Correction

There is a batch effect, which we will need to eliminate as it is confounded with the sample class. Ideally, all the A samples would cluster together.

Try using [RUVseq](http://www.nature.com/nbt/journal/v32/n9/full/nbt.2931.html) to remove variation associated wtih batch.

Leverage the two technical replicates we have that span the batches.

```{r ruvseqs}
counts <- counts(dds, normalized=TRUE) # count matrix from the DESeq2 object

## hypergeometric.R
#basic hypergeometic overlap
dds #DESeq2 object
totalassayed = 30000 # this is the total number of genes you assayed in common in both comparisons
totalsigA = 300 # this is the total number of genes that were significant in comparison A
totalsigB = 350 # this is the total number oF genes that were significant in comparison B
overlapAB = 100 # this is the number of genes that were significant in BOTH comparisons A and B

prob.overlap  <- 1 - phyper(overlapAB - 1, totalsigA, totalassayed - totalsigA, totalsigB) # probability you would see an overlap this large at random


## bobR.sh
#!/bin/bash
# Wrapper script to run Vim-R-plugin R sessions as interactive jobs on FAS RC Odyssey SLURM platform.
# To use it, edit ~/.vimrc and add 'let vimrplugin_r_path = <path_to_script>'

echo How much memory do you want?
echo "1) 4GB"
echo "2) 8GB"
echo "3) 16GB"
echo "4) 32GB"
echo "5) 64GB"

## PCAplot.r
## colors in colorpalette should be in hexadecimal format, without alpha shading i.e. #FF0000 (red)
## my favorite palette (color blind accessible)
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#000000")
## try to make the colorpalette at least as long as the number of categories
## alpha is a numerical value from 0 to 1

PCAplot <- function(eset=NULL, categories=NULL, title=NULL, colorpalette=NULL, alpha=1){
  # get metadata
  pd <- pData(eset)
# adjust alpha to hexadecimal format

## same.order.func.r
sameorder = function(x,y) {
	all(diff(match(x, y))==1)
	}

## pheatmap.r
# annotations
heatmap.annots <- pData(mic.norm.eset)[,c("ID", "study", "stage", "gender")]
heatmap.annots <- as.data.frame(apply(heatmap.annots, 2, unlist))
row.names(heatmap.annots) <- heatmap.annots$ID
heatmap.annots$ID <- NULL
# annotation colors
study_colors <- c("#FF0000","#00FF00", "#0000FF", cbPalette )
names(study_colors) <- unique(unlist(pd$study))
stage_colors <- c("white", "darkgrey", "black")
names(stage_colors) <- unique(unlist(pd$stage))

## STEM.r
## Rory's kickass annotated dataframe function
annotate_df = function(d) {
  require(biomaRt)
  ensembl = useMart('ensembl', dataset = ensembl_gene)
	a = getBM(attributes=c(filter_type, gene_symbol, "description"),
		filters=c(filter_type), values=d[, 'id'],
		mart=ensembl)
	m = merge(d, a, by.x='id', by.y=filter_type)
	return(m)
}

## lsf.wait.sh
wait.on.lsf() { ## wait on jobs{
    n=`bjobs -P $projectID | awk 'NR>1' | wc -l`
    while [ $n -ne 0 ]
    do
        n=`bjobs -P $projectID | awk 'NR>1' | wc -l`
        # number of running
        echo "Running: "`bjobs -P $projectID | grep RUN | wc -l`
        # number of pending
        echo "Pending: "`bjobs -P $projectID | grep PEND | wc -l`
        sleep 60 ##raising this to 120 will cause LSF to drop the script

## knitr.rmd
### KNITR SETUP

```{r setup, echo=FALSE}
opts_chunk$set(tidy=TRUE, cache=FALSE, highlight=TRUE, figalign="center", warning=FALSE,  error=FALSE, message=FALSE, fig.height=11, fig.width=11)
```

### EXAMPLE CHUNK
```{r libraries}
library(ggplot2)
library(xtable)
	# run clusterprofiler
	all.results.gsea <- all.results[[1]]$stats.eset # these are all the statistical results from running DESeq2
	lfcs <- all.results.gsea$log2FoldChange # the log fold changes from the comparison
	names(lfcs) <- all.results.gsea$entrezID # I previously annotated the stats results with the entrezids using biomart
	lfcs <- sort(lfcs, decreasing=TRUE)
	results = GSEA(lfcs, TERM2GENE=set2gene, minGSSize=200, maxGSSize=1700) # my set2gene was just a two column dataframe with the geneset id in the first column and the entrez gene ids in the second
	# Batch Correction

	There is a batch effect, which we will need to eliminate as it is confounded with the sample class. Ideally, all the A samples would cluster together.

	Try using [RUVseq](http://www.nature.com/nbt/journal/v32/n9/full/nbt.2931.html) to remove variation associated wtih batch.

	Leverage the two technical replicates we have that span the batches.

	```{r ruvseqs}
	counts <- counts(dds, normalized=TRUE) # count matrix from the DESeq2 object
	#basic hypergeometic overlap
	dds #DESeq2 object
	totalassayed = 30000 # this is the total number of genes you assayed in common in both comparisons
	totalsigA = 300 # this is the total number of genes that were significant in comparison A
	totalsigB = 350 # this is the total number oF genes that were significant in comparison B
	overlapAB = 100 # this is the number of genes that were significant in BOTH comparisons A and B

	prob.overlap <- 1 - phyper(overlapAB - 1, totalsigA, totalassayed - totalsigA, totalsigB) # probability you would see an overlap this large at random
	#!/bin/bash
	# Wrapper script to run Vim-R-plugin R sessions as interactive jobs on FAS RC Odyssey SLURM platform.
	# To use it, edit ~/.vimrc and add 'let vimrplugin_r_path = <path_to_script>'

	echo How much memory do you want?
	echo "1) 4GB"
	echo "2) 8GB"
	echo "3) 16GB"
	echo "4) 32GB"
	echo "5) 64GB"
	## colors in colorpalette should be in hexadecimal format, without alpha shading i.e. #FF0000 (red)
	## my favorite palette (color blind accessible)
	cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#000000")
	## try to make the colorpalette at least as long as the number of categories
	## alpha is a numerical value from 0 to 1

	PCAplot <- function(eset=NULL, categories=NULL, title=NULL, colorpalette=NULL, alpha=1){
	# get metadata
	pd <- pData(eset)
	# adjust alpha to hexadecimal format
	# annotations
	heatmap.annots <- pData(mic.norm.eset)[,c("ID", "study", "stage", "gender")]
	heatmap.annots <- as.data.frame(apply(heatmap.annots, 2, unlist))
	row.names(heatmap.annots) <- heatmap.annots$ID
	heatmap.annots$ID <- NULL
	# annotation colors
	study_colors <- c("#FF0000","#00FF00", "#0000FF", cbPalette )
	names(study_colors) <- unique(unlist(pd$study))
	stage_colors <- c("white", "darkgrey", "black")
	names(stage_colors) <- unique(unlist(pd$stage))
	## Rory's kickass annotated dataframe function
	annotate_df = function(d) {
	require(biomaRt)
	ensembl = useMart('ensembl', dataset = ensembl_gene)
	a = getBM(attributes=c(filter_type, gene_symbol, "description"),
	filters=c(filter_type), values=d[, 'id'],
	mart=ensembl)
	m = merge(d, a, by.x='id', by.y=filter_type)
	return(m)
	}
	wait.on.lsf() { ## wait on jobs{
	n=`bjobs -P $projectID \| awk 'NR>1' \| wc -l`
	while [ $n -ne 0 ]
	do
	n=`bjobs -P $projectID \| awk 'NR>1' \| wc -l`
	# number of running
	echo "Running: "`bjobs -P $projectID \| grep RUN \| wc -l`
	# number of pending
	echo "Pending: "`bjobs -P $projectID \| grep PEND \| wc -l`
	sleep 60 ##raising this to 120 will cause LSF to drop the script
	### KNITR SETUP

	```{r setup, echo=FALSE}
	opts_chunk$set(tidy=TRUE, cache=FALSE, highlight=TRUE, figalign="center", warning=FALSE, error=FALSE, message=FALSE, fig.height=11, fig.width=11)
	```

	### EXAMPLE CHUNK
	```{r libraries}
	library(ggplot2)
	library(xtable)