rtraborn/figure_3_graph.R

## figure_3_graph.R
# Generating the figure showing the distribution between identified TSRs and TATA motifs
# Written on 1/9/2015

#loading lattice
library(lattice)

#setting the working directory
setwd("/DATA/GROUP/rtraborn/tsrchitect_test/")

#reading the distribution file (created by closeestBed; see documentation) into R
TSR_genes_dist <- read.table(file="distribution_TSR_genes_output.txt",header=FALSE)

#applying column names to the imported data frame
names(TSR_genes_dist) <- c("Chr","start1","stop1","gene_name","TSS_count","strand","midpoint","chr2","start2","stop2","motif_name","motif_score","strand2","distance")

#making sure that everything is on the up and up with the file (for debugging purposes)
head(TSR_genes_dist)
summary(TSR_genes_dist$dist)
cat(paste("There are", nrow(TSR_genes_dist),"comparisons in this sample.","\n",sep=" "))
cat("\n")

#setting up plotting device
setwd("/home/rtraborn/manuscripts/figure_repos/Figure_4/")
png(file="figure_4b_TSR_genes_dist.png",width=960,height=960)
plot.new()
d <- density(TSR_genes_dist$distance)
plot(d,xlab="Position of genes relative to identified TSRs \n Genomic distance in basepairs (bp)",ylab="TSR Density",main="Distribution of genes relative to putative promoters identified using TSRchitect",col="darkgrey",lwd=3,xlim=c(-200,200))
dev.off()

#reading the distribution file (created by closeestBed; see documentation) into R
TSR_TATA_dist <- read.table(file="distribution_TSR_TATA_output.txt",header=FALSE)

#applying column names to the imported data frame
names(TSR_TATA_dist) <- c("Chr","start1","stop1","gene_name","TSS_count","strand","midpoint","chr2","start2","stop2","motif_name","motif_score","strand2","distance")

#removing rows that have distances that are greater than [-100,50]
distribution_index1 <- which(TSR_TATA_dist$distance > -100)
TSR_TATA_dist_mod <- TSR_TATA_dist[distribution_index1,]
distribution_index2 <- which(TSR_TATA_dist_mod$distance < 50)
TSR_TATA_dist_mod <- TSR_TATA_dist_mod[distribution_index2,]

#making sure that everything is on the up and up with the file (for debugging purposes)
head(TSR_TATA_dist_mod)
summary(TSR_TATA_dist_mod$dist)
cat(paste("There are", nrow(TSR_TATA_dist_mod),"TATA promoters in this sample.","\n",sep=" "))
cat("\n")

#setting up plotting device
setwd("/home/rtraborn/manuscripts/figure_repos/Figure_4/")
png(file="figure_4c_TSR_data_dist.png",width=960,height=960)
plot.new()
d <- density(TSR_TATA_dist_mod$distance)
plot(d,xlab="Position of TATA motif relative to identified TSR \n Genomic distance in basepairs (bp)",ylab="TSR Density",main="Distribution of TATA elements relative to putative promoters identified using TSRchitect",col="darkred",lwd=3)
dev.off()

#reading the distribution file (created by closeestBed; see documentation) into R
TSR_Inr_dist <- read.table(file="distribution_TSR_Inr_output.txt",header=FALSE)

#applying column names to the imported data frame
names(TSR_Inr_dist) <- c("Chr","start1","stop1","gene_name","TSS_count","strand","midpoint","chr2","start2","stop2","motif_name","motif_score","strand2","distance")

#removing rows that have distances that are greater than 25
distribution_index1 <- which(TSR_Inr_dist$distance > -25)
TSR_Inr_dist_mod <- TSR_Inr_dist[distribution_index1,]
distribution_index2 <- which(TSR_Inr_dist_mod$distance < 10)
TSR_Inr_dist_mod <- TSR_Inr_dist_mod[distribution_index2,]

#making sure that everything is on the up and up with the file (for debugging purposes)
head(TSR_Inr_dist_mod)
summary(TSR_Inr_dist_mod$distance)
cat(paste("There are", nrow(TSR_Inr_dist_mod),"Initiator promoters in this sample.","\n",sep=" "))
cat("\n")

#setting up plotting device- Inr
setwd("/home/rtraborn/manuscripts/figure_repos/Figure_4/")
png(file="figure_4d_TSR_data_dist.png",width=960,height=960)
plot.new()
d <- density(TSR_Inr_dist_mod$distance)
plot(d,xlab="Position of Inr motif relative to identified TSR \n Genomic distance in basepairs (bp)",ylab="TSR Density",main="Distribution of Initiator (Inr) elements relative to putative promoters identified using TSRchitect",col="darkgreen",lwd=3)
dev.off()

#reading the distribution file (created by closeestBed; see documentation) into R
TSR_YPatch_dist <- read.table(file="distribution_TSR_YPatch_output.txt",header=FALSE)

#applying column names to the imported data frame
names(TSR_YPatch_dist) <- c("Chr","start1","stop1","gene_name","TSS_count","strand","midpoint","chr2","start2","stop2","motif_name","motif_score","strand2","distance")

#removing rows that have distances that are greater than [-100,50]
distribution_index1 <- which(TSR_YPatch_dist$distance > -100)
TSR_YPatch_dist_mod <- TSR_YPatch_dist[distribution_index1,]
distribution_index2 <- which(TSR_YPatch_dist_mod$distance < 50)
TSR_YPatch_dist_mod <- TSR_YPatch_dist_mod[distribution_index2,]

#making sure that everything is on the up and up with the file (for debugging purposes)
head(TSR_YPatch_dist_mod)
summary(TSR_YPatch_dist_mod$distance)
cat(paste("There are", nrow(TSR_YPatch_dist_mod),"Y-patch promoters in this sample.","\n",sep=" "))
cat("\n")

#setting up plotting device- Y-Patch
setwd("/home/rtraborn/manuscripts/figure_repos/Figure_4/")
png(file="figure_4e_TSR_data_dist.png",width=960,height=960)
plot.new()
d <- density(TSR_YPatch_dist_mod$distance)
plot(d,xlab="Position of Y-Patch motif relative to identified TSR \n Genomic distance in basepairs (bp)",ylab="TSR Density",main="Distribution of Y-Patch (YY) elements relative to putative promoters identified using TSRchitect",col="gold",lwd=3)
dev.off()


## figure_3_workflow.R
# Code needed to generate Figure 3, which is a validation or correlation between annotated TSRs and elements consistent with promoter

#loading the dependencies
library('GenomicRanges')

#setting the working directory as appropriate
setwd("/DATA/GROUP/rtraborn/tsrchitect_test/")

#importing the appropriate data files
TATA_bed <- read.table(file="TATA_agris_tair10_flank3.bed",header=FALSE)

#naming the appropriate BED columns
names(TATA_bed) <- c("chr","source","id","start", "end","motif","score","strand")

#crating GenomicRanges object from the TATA_bed file
TATA_gr <- with(TATA_bed,GRanges(chr, IRanges(start, end), id=id))

#importing the putative promoter annotations that were made by TSRchitect
At_PEAT_TSRs <- read.table(file="atPEAT_TSRs_final3.bed",header=T)

#naming the columns from the above BED file
names(At_PEAT_TSRs) <- c("chr","start","end","id","Num_TSSs","strand","midpoint")

#creating a GenomicRanges object from the At_PEAT_TSRs bed file
TSRs_bed <- with(At_PEAT_TSRs, GRanges(chr, IRanges(start, end), strand, midpoint, id=id))

#finding the overlaps between TSRs and TATA elements
TSR_TATA_ol <- findOverlaps(TSRs_bed,TATA_gr)

#this should display the OL file
TSR_TATA_ol

## this works fine to this point 1/2/2015; flanking regions have been added to the TATA files

##need to write an anchorplot workflow so the distributions can be visualized

as.data.frame(TSR_TATA_ol) -> TSR_TATA_ol_df

print(head(TSR_TATA_ol_df)) #taking a look at what the data frame looks like
print(head(TSRs_bed)) #ibid
print(head(TATA_gr)) #ibid

#currently (1/6/15) at 4:50PM works without issue. Now need to write the distribution script, with midpoint vs TATA boxes

##invoking bedtools within R

#file names
a.file <- tempfile()
b.file <- tempfile()
out <- tempfile()
options(scipen=99) #turns off scientific notation

#optional flags for the bedtools command
opt.string <- c("-s -D a")

#writing bed-formatted dataframes to tempfiles
write.table("atPEAT_TSRs_final3.bed",sep="\t",file=a.file,quote=F,col.names=F,row.names=F)
write.table("TATA_agris_tair10_complete_final2.bed",sep="\t",file=b.file,quote=F,col.names=F,row.names=F)

#the actual bed command
bedtools_closest <- paste("closestBed","-a",a.file,"-b",b.file,opt.string,">",out,sep=" ")
cat(bedtools_closest,"\n")
try(system(bedtools_closest))

res <- read.table(out,header=F)
unlink(a.file);unlink(b.file);unlink(out)
print(head(res)) #let's see what the output of this looks like

##current issue when this script runs
#> try(system(bedtools_closest))
#It looks as though you have less than 3 columns at line: 1.  Are you sure your files are tab-delimited?
#>
#> res <- read.table(out,header=F)
#Error in read.table(out, header = F) : no lines available in input

#possible solution: https://groups.google.com/forum/#!topic/bedtools-discuss/XshPkgD-7Ig

## figure_3_workflow.sh
#/usr/bin/sh

cd /DATA/GROUP/rtraborn/tsrchitect_test

echo "Calculating the distance between each identified TSR and coding genes in A. thaliana"

closestBed -b TAIR10_genes.bed -a atPEAT_TSRs_mid.bed -s -D a > distribution_TSR_genes_output.txt

echo "Calculating the distance between each identified TSR and TATA motifs in A. thaliana"

closestBed -b TATA_agris_tair10.bed -a atPEAT_TSRs_mid.bed -s -D a > distribution_TSR_TATA_output.txt

mv distribution_TSR_TATA_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/

echo "Calculating the distance between each identified TSR and Initiator (Inr) motifs in A. thaliana"

closestBed -b Inr_pprom_tair10_final.bed  -a atPEAT_TSRs_mid.bed -s -D a > distribution_TSR_Inr_output.txt

mv distribution_TSR_Inr_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/

echo "Calculating the distance between each identified TSR and Y-Patch (YY) motifs in A. thalaina"

closestBed -b Y_patch_tair10_final.bed  -a atPEAT_TSRs_mid.bed -s -D a > distribution_TSR_YPatch_output.txt

mv distribution_TSR_YPatch_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/

echo "Now starting the analysis for Rice (Oryza sativa japonica)"

cd /DATA/GROUP/rtraborn/tsrchitect_test

echo "Calculating the distance between each identified TSR and coding genes in O. sativa"

closestBed -b o_sativa_genes.bed -a osTSRs_final2.bed -s -D a > os_distribution_TSR_genes_output.txt

mv os_distribution_TSR_genes_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/

echo "Calculating the distance between each identified TSR and TATA motifs in O. sativa"

closestBed -b os_TATA_agris_final.bed -a osTSRs_final2.bed -s -D a > os_distribution_TSR_TATA_output.txt

mv os_distribution_TSR_TATA_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/

echo "closestBed Analysis is complete"

echo "Generating Distribution Figures (Figure 3 parts b-d)"

Rscript /home/rtraborn/manuscripts/tsrchitect_figures/figure_repos/Figure_3/figure_3_graph.R

echo "Figure Completed"

echo "Success! Figure 3 workflow is complete"

## figure_3a_workflow.R
##creates a pileup heatmap of motifs observed in promoters
## Created on 11 March 2015
#Dependencies (uncomment when necessary)

#install.packages( c("data.table","plyr","reshape2","ggplot2","gridBase","devtools"))
#source("http://bioconductor.org/biocLite.R")
#biocLite(c("GenomicRanges","rtracklayer","impute","Rsamtools"))

#install the packages
library(devtools)
library(impute)
library(GenomicRanges)

#install the data package to be able to run examples in the vignette
#install_github("frenkiboy/genomationData",build_vignettes=FALSE)
library(genomation)

# create GRanges objects for TSR and TATA elements, respectively
#set working directory:
setwd("/DATA/GROUP/rtraborn/tsrchitect_test")

#importing the appropriate TATA bedfile
TATA_bed <- read.table(file="TATA_1k_window.bed",header=FALSE)

TATA_ind <- which(TATA_bed[,7] >= -200 & TATA_bed[,7]<= 50)
print(length(TATA_ind))
total_index <- 1:nrow(TATA_bed)
no.TATA <- total_index[-TATA_ind]
print(length(no.TATA))

#naming the appropriate BED columns
names(TATA_bed) <- c("chr","start", "end","id","score","strand","distance")

#importinug the appropriate Initiator (Inr) bedfile
Inr_bed <- read.table(file="Inr_1k_window.bed",header=FALSE)

Inr_ind <- which(Inr_bed[,7] >= -50 & Inr_bed[,7]<=10) #11 Mar 6:39PM these logicals need fixing- only returning a small fraction of promoters
print(length(Inr_ind))
print(head(Inr_ind))
total_index <- 1:nrow(Inr_bed)
no.Inr <- total_index[-Inr_ind]
print(length(no.Inr))
print(head(no.Inr))

#naming the appropriate BED columns
names(Inr_bed) <- c("chr","start", "end","id","score","strand","distance")

#importing the appropriate Y-patch bedfile
YY_bed <- read.table(file="YY_1k_window.bed",header=FALSE)

YY_ind <- which(YY_bed[,7]>= -250 & YY_bed[,7] <=50)
total_index <- 1:nrow(Inr_bed)
no.YY <- total_index[-YY_ind]

#Naming the appropriate BED columns
names(YY_bed) <- c("chr","start", "end","id","score","strand","distance")

#crating GenomicRanges object from the above motif bedfiles
TATA_gr <- with(TATA_bed,GRanges(chr, IRanges(start, end), strand=strand,id=id))
Inr_gr <- with(Inr_bed,GRanges(chr,IRanges(start, end), strand=strand,id=id))
YY_gr <- with(YY_bed,GRanges(chr,IRanges(start, end), strand=strand,id=id))

#importing the putative promoter annotations that were made by TSRchitect
At_PEAT_TSRs <- read.table(file="atPEAT_TSRs_mid_flank2.bed",header=T)

#naming the columns from the above BED file
names(At_PEAT_TSRs) <- c("chr","start","end","id","Num_TSSs","strand","midpoint")

#creating a GenomicRanges object from the At_PEAT_TSRs bed file
TSRs_bed <- with(At_PEAT_TSRs, GRanges(chr, IRanges(start, end), strand, midpoint, id=id))

#creating a scoring matrix between TSRs and TATA elements
#TSR_TATA <- ScoreMatrix(target=TATA_gr,windows=TSRs_bed,strand.aware=TRUE,bin.num=50)

#creating a scoring matrix between TSRs and Inr elements
#TSR_Inr <- ScoreMatrix(target=Inr_gr,windows=TSRs_bed,strand.aware=TRUE,bin.num=50)

#creating a scoring matrix between TSRs and Inr elements
#TSR_YY <- ScoreMatrix(target=YY_gr,windows=TSRs_bed,strand.aware=TRUE,bin.num=50)

#creating a list of targets
targets <- list(TATA=TATA_gr,Inr=Inr_gr,YY=YY_gr)
promoters <- TSRs_bed

#visualizing the overlaps using a heat map
sm1=ScoreMatrixList(target=targets,windows=promoters,strand.aware=TRUE)
sm2=ScoreMatrix(target=TATA_gr,windows=promoters,strand.aware=TRUE)
#sm3=ScoreMatrix(target=YY_gr,windows=promoters,strand.aware=TRUE)

png(file="pileup_heatmap.png")
#par(mfrow=c(1,3))
#multiHeatMatrix(sm1,xcoords=c(-1000,1000),order=TRUE)
heatMatrix(sm2,xcoords=c(-1000,1000),k=3,kmeans=TRUE)
#heatMatrix(sm3,xcoords=c(-1000,1000),order=TRUE)

png(file="meta_heatmap.png")
#par(mfrow=c(1,3))
#multiHeatMatrix(sml.scaled,kmeans=TRUE,k=8,cex.axis=0.8,xcoords=c(-1000,250),col=c("lightgray","blue"))
plotMeta(sm1,xcoords=c(-1000,1000))
#plotMeta(sm2,xcoords=c(-1000,1000))
#plotMeta(sm3,xcoords=c(-1000,1000))

dev.off()


## figure_3b_TSR_genes_dist.png

      
    Raw
  

              figure_3b_TSR_genes_dist.png
            
          
## figure_3c_TSR_data_dist.png

      
    Raw
  

              figure_3c_TSR_data_dist.png
            
          
## figure_3d_TSR_data_dist.png

      
    Raw
  

              figure_3d_TSR_data_dist.png
            
          
## figure_3e_TSR_data_dist.png

      
    Raw
  

              figure_3e_TSR_data_dist.png
            
          
## meta_heatmap.png

      
    Raw
  

              meta_heatmap.png
            
          
## pileup_heatmap.sh
#/usr/bin/sh

cd /DATA/GROUP/rtraborn/tsrchitect_test

echo "Finding the closest motifs to the TSR midpoint using closestBed"

closestBed -b TATA_agris_tair10_sorted.bed -a atPEAT_TSRs_mid.bed -s -D a -t first > TSR_TATA_1k_window.bed
closestBed -b Inr_pprom_tair10_final_sorted.bed -a atPEAT_TSRs_mid.bed -s -D a -t first > TSR_Inr_1k_window.bed
closestBed -b Y_patch_tair10_final_sorted.bed -a atPEAT_TSRs_mid.bed -s -D a -t first > TSR_YY_1k_window.bed

echo "Processing the output into motif-focused bed files"

cut -f 8-14 TSR_TATA_1k_window.bed > TATA_1k_window.bed
cut -f 8-14 TSR_Inr_1k_window.bed > Inr_1k_window.bed
cut -f 8-14 TSR_YY_1k_window.bed > YY_1k_window.bed

echo "Success! Workflow is complete"
	# Generating the figure showing the distribution between identified TSRs and TATA motifs
	# Written on 1/9/2015

	#loading lattice
	library(lattice)

	#setting the working directory
	setwd("/DATA/GROUP/rtraborn/tsrchitect_test/")

	#reading the distribution file (created by closeestBed; see documentation) into R
	TSR_genes_dist <- read.table(file="distribution_TSR_genes_output.txt",header=FALSE)

	#applying column names to the imported data frame
	names(TSR_genes_dist) <- c("Chr","start1","stop1","gene_name","TSS_count","strand","midpoint","chr2","start2","stop2","motif_name","motif_score","strand2","distance")

	#making sure that everything is on the up and up with the file (for debugging purposes)
	head(TSR_genes_dist)
	summary(TSR_genes_dist$dist)
	cat(paste("There are", nrow(TSR_genes_dist),"comparisons in this sample.","\n",sep=" "))
	cat("\n")

	#setting up plotting device
	setwd("/home/rtraborn/manuscripts/figure_repos/Figure_4/")
	png(file="figure_4b_TSR_genes_dist.png",width=960,height=960)
	plot.new()
	d <- density(TSR_genes_dist$distance)
	plot(d,xlab="Position of genes relative to identified TSRs \n Genomic distance in basepairs (bp)",ylab="TSR Density",main="Distribution of genes relative to putative promoters identified using TSRchitect",col="darkgrey",lwd=3,xlim=c(-200,200))
	dev.off()

	#reading the distribution file (created by closeestBed; see documentation) into R
	TSR_TATA_dist <- read.table(file="distribution_TSR_TATA_output.txt",header=FALSE)

	#applying column names to the imported data frame
	names(TSR_TATA_dist) <- c("Chr","start1","stop1","gene_name","TSS_count","strand","midpoint","chr2","start2","stop2","motif_name","motif_score","strand2","distance")

	#removing rows that have distances that are greater than [-100,50]
	distribution_index1 <- which(TSR_TATA_dist$distance > -100)
	TSR_TATA_dist_mod <- TSR_TATA_dist[distribution_index1,]
	distribution_index2 <- which(TSR_TATA_dist_mod$distance < 50)
	TSR_TATA_dist_mod <- TSR_TATA_dist_mod[distribution_index2,]

	#making sure that everything is on the up and up with the file (for debugging purposes)
	head(TSR_TATA_dist_mod)
	summary(TSR_TATA_dist_mod$dist)
	cat(paste("There are", nrow(TSR_TATA_dist_mod),"TATA promoters in this sample.","\n",sep=" "))
	cat("\n")

	#setting up plotting device
	setwd("/home/rtraborn/manuscripts/figure_repos/Figure_4/")
	png(file="figure_4c_TSR_data_dist.png",width=960,height=960)
	plot.new()
	d <- density(TSR_TATA_dist_mod$distance)
	plot(d,xlab="Position of TATA motif relative to identified TSR \n Genomic distance in basepairs (bp)",ylab="TSR Density",main="Distribution of TATA elements relative to putative promoters identified using TSRchitect",col="darkred",lwd=3)
	dev.off()

	#reading the distribution file (created by closeestBed; see documentation) into R
	TSR_Inr_dist <- read.table(file="distribution_TSR_Inr_output.txt",header=FALSE)

	#applying column names to the imported data frame
	names(TSR_Inr_dist) <- c("Chr","start1","stop1","gene_name","TSS_count","strand","midpoint","chr2","start2","stop2","motif_name","motif_score","strand2","distance")

	#removing rows that have distances that are greater than 25
	distribution_index1 <- which(TSR_Inr_dist$distance > -25)
	TSR_Inr_dist_mod <- TSR_Inr_dist[distribution_index1,]
	distribution_index2 <- which(TSR_Inr_dist_mod$distance < 10)
	TSR_Inr_dist_mod <- TSR_Inr_dist_mod[distribution_index2,]

	#making sure that everything is on the up and up with the file (for debugging purposes)
	head(TSR_Inr_dist_mod)
	summary(TSR_Inr_dist_mod$distance)
	cat(paste("There are", nrow(TSR_Inr_dist_mod),"Initiator promoters in this sample.","\n",sep=" "))
	cat("\n")

	#setting up plotting device- Inr
	setwd("/home/rtraborn/manuscripts/figure_repos/Figure_4/")
	png(file="figure_4d_TSR_data_dist.png",width=960,height=960)
	plot.new()
	d <- density(TSR_Inr_dist_mod$distance)
	plot(d,xlab="Position of Inr motif relative to identified TSR \n Genomic distance in basepairs (bp)",ylab="TSR Density",main="Distribution of Initiator (Inr) elements relative to putative promoters identified using TSRchitect",col="darkgreen",lwd=3)
	dev.off()

	#reading the distribution file (created by closeestBed; see documentation) into R
	TSR_YPatch_dist <- read.table(file="distribution_TSR_YPatch_output.txt",header=FALSE)

	#applying column names to the imported data frame
	names(TSR_YPatch_dist) <- c("Chr","start1","stop1","gene_name","TSS_count","strand","midpoint","chr2","start2","stop2","motif_name","motif_score","strand2","distance")

	#removing rows that have distances that are greater than [-100,50]
	distribution_index1 <- which(TSR_YPatch_dist$distance > -100)
	TSR_YPatch_dist_mod <- TSR_YPatch_dist[distribution_index1,]
	distribution_index2 <- which(TSR_YPatch_dist_mod$distance < 50)
	TSR_YPatch_dist_mod <- TSR_YPatch_dist_mod[distribution_index2,]

	#making sure that everything is on the up and up with the file (for debugging purposes)
	head(TSR_YPatch_dist_mod)
	summary(TSR_YPatch_dist_mod$distance)
	cat(paste("There are", nrow(TSR_YPatch_dist_mod),"Y-patch promoters in this sample.","\n",sep=" "))
	cat("\n")

	#setting up plotting device- Y-Patch
	setwd("/home/rtraborn/manuscripts/figure_repos/Figure_4/")
	png(file="figure_4e_TSR_data_dist.png",width=960,height=960)
	plot.new()
	d <- density(TSR_YPatch_dist_mod$distance)
	plot(d,xlab="Position of Y-Patch motif relative to identified TSR \n Genomic distance in basepairs (bp)",ylab="TSR Density",main="Distribution of Y-Patch (YY) elements relative to putative promoters identified using TSRchitect",col="gold",lwd=3)
	dev.off()
	# Code needed to generate Figure 3, which is a validation or correlation between annotated TSRs and elements consistent with promoter

	#loading the dependencies
	library('GenomicRanges')

	#setting the working directory as appropriate
	setwd("/DATA/GROUP/rtraborn/tsrchitect_test/")

	#importing the appropriate data files
	TATA_bed <- read.table(file="TATA_agris_tair10_flank3.bed",header=FALSE)

	#naming the appropriate BED columns
	names(TATA_bed) <- c("chr","source","id","start", "end","motif","score","strand")

	#crating GenomicRanges object from the TATA_bed file
	TATA_gr <- with(TATA_bed,GRanges(chr, IRanges(start, end), id=id))

	#importing the putative promoter annotations that were made by TSRchitect
	At_PEAT_TSRs <- read.table(file="atPEAT_TSRs_final3.bed",header=T)

	#naming the columns from the above BED file
	names(At_PEAT_TSRs) <- c("chr","start","end","id","Num_TSSs","strand","midpoint")

	#creating a GenomicRanges object from the At_PEAT_TSRs bed file
	TSRs_bed <- with(At_PEAT_TSRs, GRanges(chr, IRanges(start, end), strand, midpoint, id=id))

	#finding the overlaps between TSRs and TATA elements
	TSR_TATA_ol <- findOverlaps(TSRs_bed,TATA_gr)

	#this should display the OL file
	TSR_TATA_ol

	## this works fine to this point 1/2/2015; flanking regions have been added to the TATA files

	##need to write an anchorplot workflow so the distributions can be visualized

	as.data.frame(TSR_TATA_ol) -> TSR_TATA_ol_df

	print(head(TSR_TATA_ol_df)) #taking a look at what the data frame looks like
	print(head(TSRs_bed)) #ibid
	print(head(TATA_gr)) #ibid

	#currently (1/6/15) at 4:50PM works without issue. Now need to write the distribution script, with midpoint vs TATA boxes

	##invoking bedtools within R

	#file names
	a.file <- tempfile()
	b.file <- tempfile()
	out <- tempfile()
	options(scipen=99) #turns off scientific notation

	#optional flags for the bedtools command
	opt.string <- c("-s -D a")

	#writing bed-formatted dataframes to tempfiles
	write.table("atPEAT_TSRs_final3.bed",sep="\t",file=a.file,quote=F,col.names=F,row.names=F)
	write.table("TATA_agris_tair10_complete_final2.bed",sep="\t",file=b.file,quote=F,col.names=F,row.names=F)

	#the actual bed command
	bedtools_closest <- paste("closestBed","-a",a.file,"-b",b.file,opt.string,">",out,sep=" ")
	cat(bedtools_closest,"\n")
	try(system(bedtools_closest))

	res <- read.table(out,header=F)
	unlink(a.file);unlink(b.file);unlink(out)
	print(head(res)) #let's see what the output of this looks like

	##current issue when this script runs
	#> try(system(bedtools_closest))
	#It looks as though you have less than 3 columns at line: 1. Are you sure your files are tab-delimited?
	#>
	#> res <- read.table(out,header=F)
	#Error in read.table(out, header = F) : no lines available in input

	#possible solution: https://groups.google.com/forum/#!topic/bedtools-discuss/XshPkgD-7Ig
	#/usr/bin/sh

	cd /DATA/GROUP/rtraborn/tsrchitect_test

	echo "Calculating the distance between each identified TSR and coding genes in A. thaliana"

	closestBed -b TAIR10_genes.bed -a atPEAT_TSRs_mid.bed -s -D a > distribution_TSR_genes_output.txt

	echo "Calculating the distance between each identified TSR and TATA motifs in A. thaliana"

	closestBed -b TATA_agris_tair10.bed -a atPEAT_TSRs_mid.bed -s -D a > distribution_TSR_TATA_output.txt

	mv distribution_TSR_TATA_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/

	echo "Calculating the distance between each identified TSR and Initiator (Inr) motifs in A. thaliana"

	closestBed -b Inr_pprom_tair10_final.bed -a atPEAT_TSRs_mid.bed -s -D a > distribution_TSR_Inr_output.txt

	mv distribution_TSR_Inr_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/

	echo "Calculating the distance between each identified TSR and Y-Patch (YY) motifs in A. thalaina"

	closestBed -b Y_patch_tair10_final.bed -a atPEAT_TSRs_mid.bed -s -D a > distribution_TSR_YPatch_output.txt

	mv distribution_TSR_YPatch_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/

	echo "Now starting the analysis for Rice (Oryza sativa japonica)"

	cd /DATA/GROUP/rtraborn/tsrchitect_test

	echo "Calculating the distance between each identified TSR and coding genes in O. sativa"

	closestBed -b o_sativa_genes.bed -a osTSRs_final2.bed -s -D a > os_distribution_TSR_genes_output.txt

	mv os_distribution_TSR_genes_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/

	echo "Calculating the distance between each identified TSR and TATA motifs in O. sativa"

	closestBed -b os_TATA_agris_final.bed -a osTSRs_final2.bed -s -D a > os_distribution_TSR_TATA_output.txt

	mv os_distribution_TSR_TATA_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/

	echo "closestBed Analysis is complete"

	echo "Generating Distribution Figures (Figure 3 parts b-d)"

	Rscript /home/rtraborn/manuscripts/tsrchitect_figures/figure_repos/Figure_3/figure_3_graph.R

	echo "Figure Completed"

	echo "Success! Figure 3 workflow is complete"
	##creates a pileup heatmap of motifs observed in promoters
	## Created on 11 March 2015
	#Dependencies (uncomment when necessary)

	#install.packages( c("data.table","plyr","reshape2","ggplot2","gridBase","devtools"))
	#source("http://bioconductor.org/biocLite.R")
	#biocLite(c("GenomicRanges","rtracklayer","impute","Rsamtools"))

	#install the packages
	library(devtools)
	library(impute)
	library(GenomicRanges)

	#install the data package to be able to run examples in the vignette
	#install_github("frenkiboy/genomationData",build_vignettes=FALSE)
	library(genomation)

	# create GRanges objects for TSR and TATA elements, respectively
	#set working directory:
	setwd("/DATA/GROUP/rtraborn/tsrchitect_test")

	#importing the appropriate TATA bedfile
	TATA_bed <- read.table(file="TATA_1k_window.bed",header=FALSE)

	TATA_ind <- which(TATA_bed[,7] >= -200 & TATA_bed[,7]<= 50)
	print(length(TATA_ind))
	total_index <- 1:nrow(TATA_bed)
	no.TATA <- total_index[-TATA_ind]
	print(length(no.TATA))

	#naming the appropriate BED columns
	names(TATA_bed) <- c("chr","start", "end","id","score","strand","distance")

	#importinug the appropriate Initiator (Inr) bedfile
	Inr_bed <- read.table(file="Inr_1k_window.bed",header=FALSE)

	Inr_ind <- which(Inr_bed[,7] >= -50 & Inr_bed[,7]<=10) #11 Mar 6:39PM these logicals need fixing- only returning a small fraction of promoters
	print(length(Inr_ind))
	print(head(Inr_ind))
	total_index <- 1:nrow(Inr_bed)
	no.Inr <- total_index[-Inr_ind]
	print(length(no.Inr))
	print(head(no.Inr))

	#naming the appropriate BED columns
	names(Inr_bed) <- c("chr","start", "end","id","score","strand","distance")

	#importing the appropriate Y-patch bedfile
	YY_bed <- read.table(file="YY_1k_window.bed",header=FALSE)

	YY_ind <- which(YY_bed[,7]>= -250 & YY_bed[,7] <=50)
	total_index <- 1:nrow(Inr_bed)
	no.YY <- total_index[-YY_ind]

	#Naming the appropriate BED columns
	names(YY_bed) <- c("chr","start", "end","id","score","strand","distance")

	#crating GenomicRanges object from the above motif bedfiles
	TATA_gr <- with(TATA_bed,GRanges(chr, IRanges(start, end), strand=strand,id=id))
	Inr_gr <- with(Inr_bed,GRanges(chr,IRanges(start, end), strand=strand,id=id))
	YY_gr <- with(YY_bed,GRanges(chr,IRanges(start, end), strand=strand,id=id))

	#importing the putative promoter annotations that were made by TSRchitect
	At_PEAT_TSRs <- read.table(file="atPEAT_TSRs_mid_flank2.bed",header=T)

	#naming the columns from the above BED file
	names(At_PEAT_TSRs) <- c("chr","start","end","id","Num_TSSs","strand","midpoint")

	#creating a GenomicRanges object from the At_PEAT_TSRs bed file
	TSRs_bed <- with(At_PEAT_TSRs, GRanges(chr, IRanges(start, end), strand, midpoint, id=id))

	#creating a scoring matrix between TSRs and TATA elements
	#TSR_TATA <- ScoreMatrix(target=TATA_gr,windows=TSRs_bed,strand.aware=TRUE,bin.num=50)

	#creating a scoring matrix between TSRs and Inr elements
	#TSR_Inr <- ScoreMatrix(target=Inr_gr,windows=TSRs_bed,strand.aware=TRUE,bin.num=50)

	#creating a scoring matrix between TSRs and Inr elements
	#TSR_YY <- ScoreMatrix(target=YY_gr,windows=TSRs_bed,strand.aware=TRUE,bin.num=50)

	#creating a list of targets
	targets <- list(TATA=TATA_gr,Inr=Inr_gr,YY=YY_gr)
	promoters <- TSRs_bed

	#visualizing the overlaps using a heat map
	sm1=ScoreMatrixList(target=targets,windows=promoters,strand.aware=TRUE)
	sm2=ScoreMatrix(target=TATA_gr,windows=promoters,strand.aware=TRUE)
	#sm3=ScoreMatrix(target=YY_gr,windows=promoters,strand.aware=TRUE)

	png(file="pileup_heatmap.png")
	#par(mfrow=c(1,3))
	#multiHeatMatrix(sm1,xcoords=c(-1000,1000),order=TRUE)
	heatMatrix(sm2,xcoords=c(-1000,1000),k=3,kmeans=TRUE)
	#heatMatrix(sm3,xcoords=c(-1000,1000),order=TRUE)

	png(file="meta_heatmap.png")
	#par(mfrow=c(1,3))
	#multiHeatMatrix(sml.scaled,kmeans=TRUE,k=8,cex.axis=0.8,xcoords=c(-1000,250),col=c("lightgray","blue"))
	plotMeta(sm1,xcoords=c(-1000,1000))
	#plotMeta(sm2,xcoords=c(-1000,1000))
	#plotMeta(sm3,xcoords=c(-1000,1000))

	dev.off()
	#/usr/bin/sh

	cd /DATA/GROUP/rtraborn/tsrchitect_test

	echo "Finding the closest motifs to the TSR midpoint using closestBed"

	closestBed -b TATA_agris_tair10_sorted.bed -a atPEAT_TSRs_mid.bed -s -D a -t first > TSR_TATA_1k_window.bed
	closestBed -b Inr_pprom_tair10_final_sorted.bed -a atPEAT_TSRs_mid.bed -s -D a -t first > TSR_Inr_1k_window.bed
	closestBed -b Y_patch_tair10_final_sorted.bed -a atPEAT_TSRs_mid.bed -s -D a -t first > TSR_YY_1k_window.bed

	echo "Processing the output into motif-focused bed files"

	cut -f 8-14 TSR_TATA_1k_window.bed > TATA_1k_window.bed
	cut -f 8-14 TSR_Inr_1k_window.bed > Inr_1k_window.bed
	cut -f 8-14 TSR_YY_1k_window.bed > YY_1k_window.bed

	echo "Success! Workflow is complete"