Last active
August 29, 2015 14:11
-
-
Save rtraborn/3be132fbd17aa4377288 to your computer and use it in GitHub Desktop.
Figure 4: Validation of TSR Annotations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Generating the figure showing the distribution between identified TSRs and TATA motifs | |
# Written on 1/9/2015 | |
#loading lattice | |
library(lattice) | |
#setting the working directory | |
setwd("/DATA/GROUP/rtraborn/tsrchitect_test/") | |
#reading the distribution file (created by closeestBed; see documentation) into R | |
TSR_genes_dist <- read.table(file="distribution_TSR_genes_output.txt",header=FALSE) | |
#applying column names to the imported data frame | |
names(TSR_genes_dist) <- c("Chr","start1","stop1","gene_name","TSS_count","strand","midpoint","chr2","start2","stop2","motif_name","motif_score","strand2","distance") | |
#making sure that everything is on the up and up with the file (for debugging purposes) | |
head(TSR_genes_dist) | |
summary(TSR_genes_dist$dist) | |
cat(paste("There are", nrow(TSR_genes_dist),"comparisons in this sample.","\n",sep=" ")) | |
cat("\n") | |
#setting up plotting device | |
setwd("/home/rtraborn/manuscripts/figure_repos/Figure_4/") | |
png(file="figure_4b_TSR_genes_dist.png",width=960,height=960) | |
plot.new() | |
d <- density(TSR_genes_dist$distance) | |
plot(d,xlab="Position of genes relative to identified TSRs \n Genomic distance in basepairs (bp)",ylab="TSR Density",main="Distribution of genes relative to putative promoters identified using TSRchitect",col="darkgrey",lwd=3,xlim=c(-200,200)) | |
dev.off() | |
#reading the distribution file (created by closeestBed; see documentation) into R | |
TSR_TATA_dist <- read.table(file="distribution_TSR_TATA_output.txt",header=FALSE) | |
#applying column names to the imported data frame | |
names(TSR_TATA_dist) <- c("Chr","start1","stop1","gene_name","TSS_count","strand","midpoint","chr2","start2","stop2","motif_name","motif_score","strand2","distance") | |
#removing rows that have distances that are greater than [-100,50] | |
distribution_index1 <- which(TSR_TATA_dist$distance > -100) | |
TSR_TATA_dist_mod <- TSR_TATA_dist[distribution_index1,] | |
distribution_index2 <- which(TSR_TATA_dist_mod$distance < 50) | |
TSR_TATA_dist_mod <- TSR_TATA_dist_mod[distribution_index2,] | |
#making sure that everything is on the up and up with the file (for debugging purposes) | |
head(TSR_TATA_dist_mod) | |
summary(TSR_TATA_dist_mod$dist) | |
cat(paste("There are", nrow(TSR_TATA_dist_mod),"TATA promoters in this sample.","\n",sep=" ")) | |
cat("\n") | |
#setting up plotting device | |
setwd("/home/rtraborn/manuscripts/figure_repos/Figure_4/") | |
png(file="figure_4c_TSR_data_dist.png",width=960,height=960) | |
plot.new() | |
d <- density(TSR_TATA_dist_mod$distance) | |
plot(d,xlab="Position of TATA motif relative to identified TSR \n Genomic distance in basepairs (bp)",ylab="TSR Density",main="Distribution of TATA elements relative to putative promoters identified using TSRchitect",col="darkred",lwd=3) | |
dev.off() | |
#reading the distribution file (created by closeestBed; see documentation) into R | |
TSR_Inr_dist <- read.table(file="distribution_TSR_Inr_output.txt",header=FALSE) | |
#applying column names to the imported data frame | |
names(TSR_Inr_dist) <- c("Chr","start1","stop1","gene_name","TSS_count","strand","midpoint","chr2","start2","stop2","motif_name","motif_score","strand2","distance") | |
#removing rows that have distances that are greater than 25 | |
distribution_index1 <- which(TSR_Inr_dist$distance > -25) | |
TSR_Inr_dist_mod <- TSR_Inr_dist[distribution_index1,] | |
distribution_index2 <- which(TSR_Inr_dist_mod$distance < 10) | |
TSR_Inr_dist_mod <- TSR_Inr_dist_mod[distribution_index2,] | |
#making sure that everything is on the up and up with the file (for debugging purposes) | |
head(TSR_Inr_dist_mod) | |
summary(TSR_Inr_dist_mod$distance) | |
cat(paste("There are", nrow(TSR_Inr_dist_mod),"Initiator promoters in this sample.","\n",sep=" ")) | |
cat("\n") | |
#setting up plotting device- Inr | |
setwd("/home/rtraborn/manuscripts/figure_repos/Figure_4/") | |
png(file="figure_4d_TSR_data_dist.png",width=960,height=960) | |
plot.new() | |
d <- density(TSR_Inr_dist_mod$distance) | |
plot(d,xlab="Position of Inr motif relative to identified TSR \n Genomic distance in basepairs (bp)",ylab="TSR Density",main="Distribution of Initiator (Inr) elements relative to putative promoters identified using TSRchitect",col="darkgreen",lwd=3) | |
dev.off() | |
#reading the distribution file (created by closeestBed; see documentation) into R | |
TSR_YPatch_dist <- read.table(file="distribution_TSR_YPatch_output.txt",header=FALSE) | |
#applying column names to the imported data frame | |
names(TSR_YPatch_dist) <- c("Chr","start1","stop1","gene_name","TSS_count","strand","midpoint","chr2","start2","stop2","motif_name","motif_score","strand2","distance") | |
#removing rows that have distances that are greater than [-100,50] | |
distribution_index1 <- which(TSR_YPatch_dist$distance > -100) | |
TSR_YPatch_dist_mod <- TSR_YPatch_dist[distribution_index1,] | |
distribution_index2 <- which(TSR_YPatch_dist_mod$distance < 50) | |
TSR_YPatch_dist_mod <- TSR_YPatch_dist_mod[distribution_index2,] | |
#making sure that everything is on the up and up with the file (for debugging purposes) | |
head(TSR_YPatch_dist_mod) | |
summary(TSR_YPatch_dist_mod$distance) | |
cat(paste("There are", nrow(TSR_YPatch_dist_mod),"Y-patch promoters in this sample.","\n",sep=" ")) | |
cat("\n") | |
#setting up plotting device- Y-Patch | |
setwd("/home/rtraborn/manuscripts/figure_repos/Figure_4/") | |
png(file="figure_4e_TSR_data_dist.png",width=960,height=960) | |
plot.new() | |
d <- density(TSR_YPatch_dist_mod$distance) | |
plot(d,xlab="Position of Y-Patch motif relative to identified TSR \n Genomic distance in basepairs (bp)",ylab="TSR Density",main="Distribution of Y-Patch (YY) elements relative to putative promoters identified using TSRchitect",col="gold",lwd=3) | |
dev.off() | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Code needed to generate Figure 3, which is a validation or correlation between annotated TSRs and elements consistent with promoter | |
#loading the dependencies | |
library('GenomicRanges') | |
#setting the working directory as appropriate | |
setwd("/DATA/GROUP/rtraborn/tsrchitect_test/") | |
#importing the appropriate data files | |
TATA_bed <- read.table(file="TATA_agris_tair10_flank3.bed",header=FALSE) | |
#naming the appropriate BED columns | |
names(TATA_bed) <- c("chr","source","id","start", "end","motif","score","strand") | |
#crating GenomicRanges object from the TATA_bed file | |
TATA_gr <- with(TATA_bed,GRanges(chr, IRanges(start, end), id=id)) | |
#importing the putative promoter annotations that were made by TSRchitect | |
At_PEAT_TSRs <- read.table(file="atPEAT_TSRs_final3.bed",header=T) | |
#naming the columns from the above BED file | |
names(At_PEAT_TSRs) <- c("chr","start","end","id","Num_TSSs","strand","midpoint") | |
#creating a GenomicRanges object from the At_PEAT_TSRs bed file | |
TSRs_bed <- with(At_PEAT_TSRs, GRanges(chr, IRanges(start, end), strand, midpoint, id=id)) | |
#finding the overlaps between TSRs and TATA elements | |
TSR_TATA_ol <- findOverlaps(TSRs_bed,TATA_gr) | |
#this should display the OL file | |
TSR_TATA_ol | |
## this works fine to this point 1/2/2015; flanking regions have been added to the TATA files | |
##need to write an anchorplot workflow so the distributions can be visualized | |
as.data.frame(TSR_TATA_ol) -> TSR_TATA_ol_df | |
print(head(TSR_TATA_ol_df)) #taking a look at what the data frame looks like | |
print(head(TSRs_bed)) #ibid | |
print(head(TATA_gr)) #ibid | |
#currently (1/6/15) at 4:50PM works without issue. Now need to write the distribution script, with midpoint vs TATA boxes | |
##invoking bedtools within R | |
#file names | |
a.file <- tempfile() | |
b.file <- tempfile() | |
out <- tempfile() | |
options(scipen=99) #turns off scientific notation | |
#optional flags for the bedtools command | |
opt.string <- c("-s -D a") | |
#writing bed-formatted dataframes to tempfiles | |
write.table("atPEAT_TSRs_final3.bed",sep="\t",file=a.file,quote=F,col.names=F,row.names=F) | |
write.table("TATA_agris_tair10_complete_final2.bed",sep="\t",file=b.file,quote=F,col.names=F,row.names=F) | |
#the actual bed command | |
bedtools_closest <- paste("closestBed","-a",a.file,"-b",b.file,opt.string,">",out,sep=" ") | |
cat(bedtools_closest,"\n") | |
try(system(bedtools_closest)) | |
res <- read.table(out,header=F) | |
unlink(a.file);unlink(b.file);unlink(out) | |
print(head(res)) #let's see what the output of this looks like | |
##current issue when this script runs | |
#> try(system(bedtools_closest)) | |
#It looks as though you have less than 3 columns at line: 1. Are you sure your files are tab-delimited? | |
#> | |
#> res <- read.table(out,header=F) | |
#Error in read.table(out, header = F) : no lines available in input | |
#possible solution: https://groups.google.com/forum/#!topic/bedtools-discuss/XshPkgD-7Ig |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#/usr/bin/sh | |
cd /DATA/GROUP/rtraborn/tsrchitect_test | |
echo "Calculating the distance between each identified TSR and coding genes in A. thaliana" | |
closestBed -b TAIR10_genes.bed -a atPEAT_TSRs_mid.bed -s -D a > distribution_TSR_genes_output.txt | |
echo "Calculating the distance between each identified TSR and TATA motifs in A. thaliana" | |
closestBed -b TATA_agris_tair10.bed -a atPEAT_TSRs_mid.bed -s -D a > distribution_TSR_TATA_output.txt | |
mv distribution_TSR_TATA_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/ | |
echo "Calculating the distance between each identified TSR and Initiator (Inr) motifs in A. thaliana" | |
closestBed -b Inr_pprom_tair10_final.bed -a atPEAT_TSRs_mid.bed -s -D a > distribution_TSR_Inr_output.txt | |
mv distribution_TSR_Inr_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/ | |
echo "Calculating the distance between each identified TSR and Y-Patch (YY) motifs in A. thalaina" | |
closestBed -b Y_patch_tair10_final.bed -a atPEAT_TSRs_mid.bed -s -D a > distribution_TSR_YPatch_output.txt | |
mv distribution_TSR_YPatch_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/ | |
echo "Now starting the analysis for Rice (Oryza sativa japonica)" | |
cd /DATA/GROUP/rtraborn/tsrchitect_test | |
echo "Calculating the distance between each identified TSR and coding genes in O. sativa" | |
closestBed -b o_sativa_genes.bed -a osTSRs_final2.bed -s -D a > os_distribution_TSR_genes_output.txt | |
mv os_distribution_TSR_genes_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/ | |
echo "Calculating the distance between each identified TSR and TATA motifs in O. sativa" | |
closestBed -b os_TATA_agris_final.bed -a osTSRs_final2.bed -s -D a > os_distribution_TSR_TATA_output.txt | |
mv os_distribution_TSR_TATA_output.txt /home/rtraborn/manuscripts/figure_repos/tsrchitect_figures/Figure_3/ | |
echo "closestBed Analysis is complete" | |
echo "Generating Distribution Figures (Figure 3 parts b-d)" | |
Rscript /home/rtraborn/manuscripts/tsrchitect_figures/figure_repos/Figure_3/figure_3_graph.R | |
echo "Figure Completed" | |
echo "Success! Figure 3 workflow is complete" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##creates a pileup heatmap of motifs observed in promoters | |
## Created on 11 March 2015 | |
#Dependencies (uncomment when necessary) | |
#install.packages( c("data.table","plyr","reshape2","ggplot2","gridBase","devtools")) | |
#source("http://bioconductor.org/biocLite.R") | |
#biocLite(c("GenomicRanges","rtracklayer","impute","Rsamtools")) | |
#install the packages | |
library(devtools) | |
library(impute) | |
library(GenomicRanges) | |
#install the data package to be able to run examples in the vignette | |
#install_github("frenkiboy/genomationData",build_vignettes=FALSE) | |
library(genomation) | |
# create GRanges objects for TSR and TATA elements, respectively | |
#set working directory: | |
setwd("/DATA/GROUP/rtraborn/tsrchitect_test") | |
#importing the appropriate TATA bedfile | |
TATA_bed <- read.table(file="TATA_1k_window.bed",header=FALSE) | |
TATA_ind <- which(TATA_bed[,7] >= -200 & TATA_bed[,7]<= 50) | |
print(length(TATA_ind)) | |
total_index <- 1:nrow(TATA_bed) | |
no.TATA <- total_index[-TATA_ind] | |
print(length(no.TATA)) | |
#naming the appropriate BED columns | |
names(TATA_bed) <- c("chr","start", "end","id","score","strand","distance") | |
#importinug the appropriate Initiator (Inr) bedfile | |
Inr_bed <- read.table(file="Inr_1k_window.bed",header=FALSE) | |
Inr_ind <- which(Inr_bed[,7] >= -50 & Inr_bed[,7]<=10) #11 Mar 6:39PM these logicals need fixing- only returning a small fraction of promoters | |
print(length(Inr_ind)) | |
print(head(Inr_ind)) | |
total_index <- 1:nrow(Inr_bed) | |
no.Inr <- total_index[-Inr_ind] | |
print(length(no.Inr)) | |
print(head(no.Inr)) | |
#naming the appropriate BED columns | |
names(Inr_bed) <- c("chr","start", "end","id","score","strand","distance") | |
#importing the appropriate Y-patch bedfile | |
YY_bed <- read.table(file="YY_1k_window.bed",header=FALSE) | |
YY_ind <- which(YY_bed[,7]>= -250 & YY_bed[,7] <=50) | |
total_index <- 1:nrow(Inr_bed) | |
no.YY <- total_index[-YY_ind] | |
#Naming the appropriate BED columns | |
names(YY_bed) <- c("chr","start", "end","id","score","strand","distance") | |
#crating GenomicRanges object from the above motif bedfiles | |
TATA_gr <- with(TATA_bed,GRanges(chr, IRanges(start, end), strand=strand,id=id)) | |
Inr_gr <- with(Inr_bed,GRanges(chr,IRanges(start, end), strand=strand,id=id)) | |
YY_gr <- with(YY_bed,GRanges(chr,IRanges(start, end), strand=strand,id=id)) | |
#importing the putative promoter annotations that were made by TSRchitect | |
At_PEAT_TSRs <- read.table(file="atPEAT_TSRs_mid_flank2.bed",header=T) | |
#naming the columns from the above BED file | |
names(At_PEAT_TSRs) <- c("chr","start","end","id","Num_TSSs","strand","midpoint") | |
#creating a GenomicRanges object from the At_PEAT_TSRs bed file | |
TSRs_bed <- with(At_PEAT_TSRs, GRanges(chr, IRanges(start, end), strand, midpoint, id=id)) | |
#creating a scoring matrix between TSRs and TATA elements | |
#TSR_TATA <- ScoreMatrix(target=TATA_gr,windows=TSRs_bed,strand.aware=TRUE,bin.num=50) | |
#creating a scoring matrix between TSRs and Inr elements | |
#TSR_Inr <- ScoreMatrix(target=Inr_gr,windows=TSRs_bed,strand.aware=TRUE,bin.num=50) | |
#creating a scoring matrix between TSRs and Inr elements | |
#TSR_YY <- ScoreMatrix(target=YY_gr,windows=TSRs_bed,strand.aware=TRUE,bin.num=50) | |
#creating a list of targets | |
targets <- list(TATA=TATA_gr,Inr=Inr_gr,YY=YY_gr) | |
promoters <- TSRs_bed | |
#visualizing the overlaps using a heat map | |
sm1=ScoreMatrixList(target=targets,windows=promoters,strand.aware=TRUE) | |
sm2=ScoreMatrix(target=TATA_gr,windows=promoters,strand.aware=TRUE) | |
#sm3=ScoreMatrix(target=YY_gr,windows=promoters,strand.aware=TRUE) | |
png(file="pileup_heatmap.png") | |
#par(mfrow=c(1,3)) | |
#multiHeatMatrix(sm1,xcoords=c(-1000,1000),order=TRUE) | |
heatMatrix(sm2,xcoords=c(-1000,1000),k=3,kmeans=TRUE) | |
#heatMatrix(sm3,xcoords=c(-1000,1000),order=TRUE) | |
png(file="meta_heatmap.png") | |
#par(mfrow=c(1,3)) | |
#multiHeatMatrix(sml.scaled,kmeans=TRUE,k=8,cex.axis=0.8,xcoords=c(-1000,250),col=c("lightgray","blue")) | |
plotMeta(sm1,xcoords=c(-1000,1000)) | |
#plotMeta(sm2,xcoords=c(-1000,1000)) | |
#plotMeta(sm3,xcoords=c(-1000,1000)) | |
dev.off() | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#/usr/bin/sh | |
cd /DATA/GROUP/rtraborn/tsrchitect_test | |
echo "Finding the closest motifs to the TSR midpoint using closestBed" | |
closestBed -b TATA_agris_tair10_sorted.bed -a atPEAT_TSRs_mid.bed -s -D a -t first > TSR_TATA_1k_window.bed | |
closestBed -b Inr_pprom_tair10_final_sorted.bed -a atPEAT_TSRs_mid.bed -s -D a -t first > TSR_Inr_1k_window.bed | |
closestBed -b Y_patch_tair10_final_sorted.bed -a atPEAT_TSRs_mid.bed -s -D a -t first > TSR_YY_1k_window.bed | |
echo "Processing the output into motif-focused bed files" | |
cut -f 8-14 TSR_TATA_1k_window.bed > TATA_1k_window.bed | |
cut -f 8-14 TSR_Inr_1k_window.bed > Inr_1k_window.bed | |
cut -f 8-14 TSR_YY_1k_window.bed > YY_1k_window.bed | |
echo "Success! Workflow is complete" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment