Skip to content

Instantly share code, notes, and snippets.

@rtraborn
Last active August 29, 2015 14:11
Show Gist options
  • Save rtraborn/3be132fbd17aa4377288 to your computer and use it in GitHub Desktop.
Save rtraborn/3be132fbd17aa4377288 to your computer and use it in GitHub Desktop.
Figure 4: Validation of TSR Annotations
# Generating the figure showing the distribution between identified TSRs and TATA motifs
# Written on 1/9/2015
#loading lattice
library(lattice)
#setting the working directory
setwd("/DATA/GROUP/rtraborn/tsrchitect_test/")
#reading the distribution file (created by closeestBed; see documentation) into R
TSR_genes_dist <- read.table(file="distribution_TSR_genes_output.txt",header=FALSE)
#applying column names to the imported data frame
names(TSR_genes_dist) <- c("Chr","start1","stop1","gene_name","TSS_count","strand","midpoint","chr2","start2","stop2","motif_name","motif_score","strand2","distance")
#making sure that everything is on the up and up with the file (for debugging purposes)
head(TSR_genes_dist)
summary(TSR_genes_dist$dist)
cat(paste("There are", nrow(TSR_genes_dist),"comparisons in this sample.","\n",sep=" "))
cat("\n")
#setting up plotting device
setwd("/home/rtraborn/manuscripts/figure_repos/Figure_4/")
png(file="figure_4b_TSR_genes_dist.png",width=960,height=960)
plot.new()
d <- density(TSR_genes_dist$distance)
plot(d,xlab="Position of genes relative to identified TSRs \n Genomic distance in basepairs (bp)",ylab="TSR Density",main="Distribution of genes relative to putative promoters identified using TSRchitect",col="darkgrey",lwd=3,xlim=c(-200,200))
dev.off()
#reading the distribution file (created by closeestBed; see documentation) into R
TSR_TATA_dist <- read.table(file="distribution_TSR_TATA_output.txt",header=FALSE)
#applying column names to the imported data frame
names(TSR_TATA_dist) <- c("Chr","start1","stop1","gene_name","TSS_count","strand","midpoint","chr2","start2","stop2","motif_name","motif_score","strand2","distance")
#removing rows that have distances that are greater than [-100,50]
distribution_index1 <- which(TSR_TATA_dist$distance > -100)
TSR_TATA_dist_mod <- TSR_TATA_dist[distribution_index1,]
distribution_index2 <- which(TSR_TATA_dist_mod$distance < 50)
TSR_TATA_dist_mod <- TSR_TATA_dist_mod[distribution_index2,]
#making sure that everything is on the up and up with the file (for debugging purposes)
head(TSR_TATA_dist_mod)
summary(TSR_TATA_dist_mod$dist)
cat(paste("There are", nrow(TSR_TATA_dist_mod),"TATA promoters in this sample.","\n",sep=" "))
cat("\n")
#setting up plotting device
setwd("/home/rtraborn/manuscripts/figure_repos/Figure_4/")
png(file="figure_4c_TSR_data_dist.png",width=960,height=960)
plot.new()
d <- density(TSR_TATA_dist_mod$distance)
plot(d,xlab="Position of TATA motif relative to identified TSR \n Genomic distance in basepairs (bp)",ylab="TSR Density",main="Distribution of TATA elements relative to putative promoters identified using TSRchitect",col="darkred",lwd=3)
dev.off()
#reading the distribution file (created by closeestBed; see documentation) into R
TSR_Inr_dist <- read.table(file="distribution_TSR_Inr_output.txt",header=FALSE)
#applying column names to the imported data frame
names(TSR_Inr_dist) <- c("Chr","start1","stop1","gene_name","TSS_count","strand","midpoint","chr2","start2","stop2","motif_name","motif_score","strand2","distance")
#removing rows that have distances that are greater than 25
distribution_index1 <- which(TSR_Inr_dist$distance > -25)
TSR_Inr_dist_mod <- TSR_Inr_dist[distribution_index1,]
distribution_index2 <- which(TSR_Inr_dist_mod$distance < 10)
TSR_Inr_dist_mod <- TSR_Inr_dist_mod[distribution_index2,]
#making sure that everything is on the up and up with the file (for debugging purposes)
head(TSR_Inr_dist_mod)
summary(TSR_Inr_dist_mod$distance)
cat(paste("There are", nrow(TSR_Inr_dist_mod),"Initiator promoters in this sample.","\n",sep=" "))
cat("\n")
#setting up plotting device- Inr
setwd("/home/rtraborn/manuscripts/figure_repos/Figure_4/")
png(file="figure_4d_TSR_data_dist.png",width=960,height=960)
plot.new()
d <- density(TSR_Inr_dist_mod$distance)
plot(d,xlab="Position of Inr motif relative to identified TSR \n Genomic distance in basepairs (bp)",ylab="TSR Density",main="Distribution of Initiator (Inr) elements relative to putative promoters identified using TSRchitect",col="darkgreen",lwd=3)
dev.off()
#reading the distribution file (created by closeestBed; see documentation) into R
TSR_YPatch_dist <- read.table(file="distribution_TSR_YPatch_output.txt",header=FALSE)
#applying column names to the imported data frame
names(TSR_YPatch_dist) <- c("Chr","start1","stop1","gene_name","TSS_count","strand","midpoint","chr2","start2","stop2","motif_name","motif_score","strand2","distance")
#removing rows that have distances that are greater than [-100,50]
distribution_index1 <- which(TSR_YPatch_dist$distance > -100)
TSR_YPatch_dist_mod <- TSR_YPatch_dist[distribution_index1,]
distribution_index2 <- which(TSR_YPatch_dist_mod$distance < 50)
TSR_YPatch_dist_mod <- TSR_YPatch_dist_mod[distribution_index2,]
#making sure that everything is on the up and up with the file (for debugging purposes)
head(TSR_YPatch_dist_mod)
summary(TSR_YPatch_dist_mod$distance)
cat(paste("There are", nrow(TSR_YPatch_dist_mod),"Y-patch promoters in this sample.","\n",sep=" "))
cat("\n")
#setting up plotting device- Y-Patch
setwd("/home/rtraborn/manuscripts/figure_repos/Figure_4/")
png(file="figure_4e_TSR_data_dist.png",width=960,height=960)
plot.new()
d <- density(TSR_YPatch_dist_mod$distance)
plot(d,xlab="Position of Y-Patch motif relative to identified TSR \n Genomic distance in basepairs (bp)",ylab="TSR Density",main="Distribution of Y-Patch (YY) elements relative to putative promoters identified using TSRchitect",col="gold",lwd=3)
dev.off()
# Code needed to generate Figure 3, which is a validation or correlation between annotated TSRs and elements consistent with promoter
#loading the dependencies
library('GenomicRanges')
#setting the working directory as appropriate
setwd("/DATA/GROUP/rtraborn/tsrchitect_test/")
#importing the appropriate data files
TATA_bed <- read.table(file="TATA_agris_tair10_flank3.bed",header=FALSE)
#naming the appropriate BED columns
names(TATA_bed) <- c("chr","source","id","start", "end","motif","score","strand")
#crating GenomicRanges object from the TATA_bed file
TATA_gr <- with(TATA_bed,GRanges(chr, IRanges(start, end), id=id))
#importing the putative promoter annotations that were made by TSRchitect
At_PEAT_TSRs <- read.table(file="atPEAT_TSRs_final3.bed",header=T)
#naming the columns from the above BED file
names(At_PEAT_TSRs) <- c("chr","start","end","id","Num_TSSs","strand","midpoint")
#creating a GenomicRanges object from the At_PEAT_TSRs bed file
TSRs_bed <- with(At_PEAT_TSRs, GRanges(chr, IRanges(start, end), strand, midpoint, id=id))
#finding the overlaps between TSRs and TATA elements
TSR_TATA_ol <- findOverlaps(TSRs_bed,TATA_gr)
#this should display the OL file
TSR_TATA_ol
## this works fine to this point 1/2/2015; flanking regions have been added to the TATA files
##need to write an anchorplot workflow so the distributions can be visualized
as.data.frame(TSR_TATA_ol) -> TSR_TATA_ol_df
print(head(TSR_TATA_ol_df)) #taking a look at what the data frame looks like
print(head(TSRs_bed)) #ibid
print(head(TATA_gr)) #ibid
#currently (1/6/15) at 4:50PM works without issue. Now need to write the distribution script, with midpoint vs TATA boxes
##invoking bedtools within R
#file names
a.file <- tempfile()
b.file <- tempfile()
out <- tempfile()
options(scipen=99) #turns off scientific notation
#optional flags for the bedtools command
opt.string <- c("-s -D a")
#writing bed-formatted dataframes to tempfiles
write.table("atPEAT_TSRs_final3.bed",sep="\t",file=a.file,quote=F,col.names=F,row.names=F)
write.table("TATA_agris_tair10_complete_final2.bed",sep="\t",file=b.file,quote=F,col.names=F,row.names=F)
#the actual bed command
bedtools_closest <- paste("closestBed","-a",a.file,"-b",b.file,opt.string,">",out,sep=" ")
cat(bedtools_closest,"\n")
try(system(bedtools_closest))
res <- read.table(out,header=F)
unlink(a.file);unlink(b.file);unlink(out)
print(head(res)) #let's see what the output of this looks like
##current issue when this script runs
#> try(system(bedtools_closest))
#It looks as though you have less than 3 columns at line: 1. Are you sure your files are tab-delimited?
#>
#> res <- read.table(out,header=F)
#Error in read.table(out, header = F) : no lines available in input
#possible solution: https://groups.google.com/forum/#!topic/bedtools-discuss/XshPkgD-7Ig
#/usr/bin/sh
cd /DATA/GROUP/rtraborn/tsrchitect_test
echo "Calculating the distance between each identified TSR and coding genes in A. thaliana"
closestBed -b TAIR10_genes.bed -a atPEAT_TSRs_mid.bed -s -D a > distribution_TSR_genes_output.txt
echo "Calculating the distance between each identified TSR and TATA motifs in A. thaliana"
closestBed -b TATA_agris_tair10.bed -a atPEAT_TSRs_mid.bed -s -D a > distribution_TSR_TATA_output.txt
mv distribution_TSR_TATA_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/
echo "Calculating the distance between each identified TSR and Initiator (Inr) motifs in A. thaliana"
closestBed -b Inr_pprom_tair10_final.bed -a atPEAT_TSRs_mid.bed -s -D a > distribution_TSR_Inr_output.txt
mv distribution_TSR_Inr_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/
echo "Calculating the distance between each identified TSR and Y-Patch (YY) motifs in A. thalaina"
closestBed -b Y_patch_tair10_final.bed -a atPEAT_TSRs_mid.bed -s -D a > distribution_TSR_YPatch_output.txt
mv distribution_TSR_YPatch_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/
echo "Now starting the analysis for Rice (Oryza sativa japonica)"
cd /DATA/GROUP/rtraborn/tsrchitect_test
echo "Calculating the distance between each identified TSR and coding genes in O. sativa"
closestBed -b o_sativa_genes.bed -a osTSRs_final2.bed -s -D a > os_distribution_TSR_genes_output.txt
mv os_distribution_TSR_genes_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/
echo "Calculating the distance between each identified TSR and TATA motifs in O. sativa"
closestBed -b os_TATA_agris_final.bed -a osTSRs_final2.bed -s -D a > os_distribution_TSR_TATA_output.txt
mv os_distribution_TSR_TATA_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/
echo "closestBed Analysis is complete"
echo "Generating Distribution Figures (Figure 3 parts b-d)"
Rscript /home/rtraborn/manuscripts/tsrchitect_figures/figure_repos/Figure_3/figure_3_graph.R
echo "Figure Completed"
echo "Success! Figure 3 workflow is complete"
##creates a pileup heatmap of motifs observed in promoters
## Created on 11 March 2015
#Dependencies (uncomment when necessary)
#install.packages( c("data.table","plyr","reshape2","ggplot2","gridBase","devtools"))
#source("http://bioconductor.org/biocLite.R")
#biocLite(c("GenomicRanges","rtracklayer","impute","Rsamtools"))
#install the packages
library(devtools)
library(impute)
library(GenomicRanges)
#install the data package to be able to run examples in the vignette
#install_github("frenkiboy/genomationData",build_vignettes=FALSE)
library(genomation)
# create GRanges objects for TSR and TATA elements, respectively
#set working directory:
setwd("/DATA/GROUP/rtraborn/tsrchitect_test")
#importing the appropriate TATA bedfile
TATA_bed <- read.table(file="TATA_1k_window.bed",header=FALSE)
TATA_ind <- which(TATA_bed[,7] >= -200 & TATA_bed[,7]<= 50)
print(length(TATA_ind))
total_index <- 1:nrow(TATA_bed)
no.TATA <- total_index[-TATA_ind]
print(length(no.TATA))
#naming the appropriate BED columns
names(TATA_bed) <- c("chr","start", "end","id","score","strand","distance")
#importinug the appropriate Initiator (Inr) bedfile
Inr_bed <- read.table(file="Inr_1k_window.bed",header=FALSE)
Inr_ind <- which(Inr_bed[,7] >= -50 & Inr_bed[,7]<=10) #11 Mar 6:39PM these logicals need fixing- only returning a small fraction of promoters
print(length(Inr_ind))
print(head(Inr_ind))
total_index <- 1:nrow(Inr_bed)
no.Inr <- total_index[-Inr_ind]
print(length(no.Inr))
print(head(no.Inr))
#naming the appropriate BED columns
names(Inr_bed) <- c("chr","start", "end","id","score","strand","distance")
#importing the appropriate Y-patch bedfile
YY_bed <- read.table(file="YY_1k_window.bed",header=FALSE)
YY_ind <- which(YY_bed[,7]>= -250 & YY_bed[,7] <=50)
total_index <- 1:nrow(Inr_bed)
no.YY <- total_index[-YY_ind]
#Naming the appropriate BED columns
names(YY_bed) <- c("chr","start", "end","id","score","strand","distance")
#crating GenomicRanges object from the above motif bedfiles
TATA_gr <- with(TATA_bed,GRanges(chr, IRanges(start, end), strand=strand,id=id))
Inr_gr <- with(Inr_bed,GRanges(chr,IRanges(start, end), strand=strand,id=id))
YY_gr <- with(YY_bed,GRanges(chr,IRanges(start, end), strand=strand,id=id))
#importing the putative promoter annotations that were made by TSRchitect
At_PEAT_TSRs <- read.table(file="atPEAT_TSRs_mid_flank2.bed",header=T)
#naming the columns from the above BED file
names(At_PEAT_TSRs) <- c("chr","start","end","id","Num_TSSs","strand","midpoint")
#creating a GenomicRanges object from the At_PEAT_TSRs bed file
TSRs_bed <- with(At_PEAT_TSRs, GRanges(chr, IRanges(start, end), strand, midpoint, id=id))
#creating a scoring matrix between TSRs and TATA elements
#TSR_TATA <- ScoreMatrix(target=TATA_gr,windows=TSRs_bed,strand.aware=TRUE,bin.num=50)
#creating a scoring matrix between TSRs and Inr elements
#TSR_Inr <- ScoreMatrix(target=Inr_gr,windows=TSRs_bed,strand.aware=TRUE,bin.num=50)
#creating a scoring matrix between TSRs and Inr elements
#TSR_YY <- ScoreMatrix(target=YY_gr,windows=TSRs_bed,strand.aware=TRUE,bin.num=50)
#creating a list of targets
targets <- list(TATA=TATA_gr,Inr=Inr_gr,YY=YY_gr)
promoters <- TSRs_bed
#visualizing the overlaps using a heat map
sm1=ScoreMatrixList(target=targets,windows=promoters,strand.aware=TRUE)
sm2=ScoreMatrix(target=TATA_gr,windows=promoters,strand.aware=TRUE)
#sm3=ScoreMatrix(target=YY_gr,windows=promoters,strand.aware=TRUE)
png(file="pileup_heatmap.png")
#par(mfrow=c(1,3))
#multiHeatMatrix(sm1,xcoords=c(-1000,1000),order=TRUE)
heatMatrix(sm2,xcoords=c(-1000,1000),k=3,kmeans=TRUE)
#heatMatrix(sm3,xcoords=c(-1000,1000),order=TRUE)
png(file="meta_heatmap.png")
#par(mfrow=c(1,3))
#multiHeatMatrix(sml.scaled,kmeans=TRUE,k=8,cex.axis=0.8,xcoords=c(-1000,250),col=c("lightgray","blue"))
plotMeta(sm1,xcoords=c(-1000,1000))
#plotMeta(sm2,xcoords=c(-1000,1000))
#plotMeta(sm3,xcoords=c(-1000,1000))
dev.off()
#/usr/bin/sh
cd /DATA/GROUP/rtraborn/tsrchitect_test
echo "Finding the closest motifs to the TSR midpoint using closestBed"
closestBed -b TATA_agris_tair10_sorted.bed -a atPEAT_TSRs_mid.bed -s -D a -t first > TSR_TATA_1k_window.bed
closestBed -b Inr_pprom_tair10_final_sorted.bed -a atPEAT_TSRs_mid.bed -s -D a -t first > TSR_Inr_1k_window.bed
closestBed -b Y_patch_tair10_final_sorted.bed -a atPEAT_TSRs_mid.bed -s -D a -t first > TSR_YY_1k_window.bed
echo "Processing the output into motif-focused bed files"
cut -f 8-14 TSR_TATA_1k_window.bed > TATA_1k_window.bed
cut -f 8-14 TSR_Inr_1k_window.bed > Inr_1k_window.bed
cut -f 8-14 TSR_YY_1k_window.bed > YY_1k_window.bed
echo "Success! Workflow is complete"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment