rtraborn/Figure_S2_plot.png

## Figure_S2_plot.png

      
    Raw
  

              Figure_S2_plot.png
            
          
## figure_S2_workflow.R
## Code to generate Figure 2 ##  current code generates a two-panel figure; the lower panel (SD) is problematic at this point (12/12/14)

#loading appropriate packages
library('GenomicRanges')

#loading R script
source("tsr_overlaps.R")

#setting the working directory
setwd("/DATA/GROUP/rtraborn/tsrchitect_test")

#removing scientific notation
options(scipen=999)

#importing the mapped, PEAT-derived TSSs from cagep
read.table(file="A_thaliana_PEAT.cagep",header=F) -> At_PEAT_TSSs

#importing the putative promoter annotations from TSRchitect
read.table(file="atpeatbed.bed",header=F) -> At_PEAT_TSRs

#naming the BED columns from the above
c("chr","start","end","id","Num_TSSs","strand") -> names(At_PEAT_TSRs)

#creating a GenomicRanges object from the At_PEAT_TSRs bed file
TSRs_bed <- with(At_PEAT_TSRs, GRanges(chr, IRanges(start, end), strand, Num_TSSs, id=id))

#will print top of object #works in previous (12/8) commit
TSRs_bed

#naming the columns from the cagep file with TSSs
c("chr","id","strand","start","end") -> names(At_PEAT_TSSs)

#creating a GenomicRanges objects from the At_PEAT_TSSs file
TSSs_PEAT <- with(At_PEAT_TSSs, GRanges(chr, IRanges(start,end), strand,id=id))

#this will print top of TSSs_PEAT object #worked without issue 12/9
TSSs_PEAT

#finding the overlaps between TSRs and TSSs
findOverlaps(TSRs_bed,TSSs_PEAT) -> TSR_ol

#running tsr_overlaps
tsr_overlaps(TSR_ol,At_PEAT_TSSs,TSRs_bed) -> complete_df

#printing the output
print(head(complete_df))

#setting working directory to our git repo
setwd("/home/rtraborn/manuscripts/figure_repos/Figure_2")

#preparing the figure
png(file="Figure_2_plot.png",height=1800,width=900)

par(mfrow=c(2, 1)) #setting up two figures, above and below

na.omit(complete_df$entropy) -> entropy_vec
na.omit(complete_df$var) -> var_vec

plot(density(entropy_vec), main = "Distribution of Promoter Entropy within promoters",xlab="Shannon Entropy of TSSs in Putative Promoters (TSRs)\n Most uniform -> Most Diverse",ylab="Density",col="red2")
plot(density(sd_vec), main = "Distribution of the variances of TSSs within promoters", xlab="Variances of TSSs in Putative Promoters (TSRs)\n (n=5097 genes) ", ylab="Density",col="blue2",xlim=c(0,100000))

dev.off()


## tsr_overlaps.R
###iterates over a findOverlaps() GRanges object and calculates the shannon entropy score and standard deviation for each TSR

tsr_overlaps <- function(overlaps,cagepGR,TSR) {
library('entropy')

overlaps_df <- as.data.frame(overlaps)
TSS_df <- as.data.frame(cagepGR)
TSR_df <- as.data.frame(TSR)

final_df <- array(NA,c(nrow(TSR_df),4))

       for (i in unique(overlaps_df[,1])) {
           i -> this_TSR
           print(this_TSR)
           which(overlaps_df[,1]==this_TSR) -> TSS_index
           TSS_df[TSS_index,] -> TSS_slice
           if (TSS_slice$strand[1]=="+") { #for positive strand
               TSS_slice$start -> TSS_list
               }
           if (TSS_slice$strand[1]=="-") { #for negative strand
               TSS_slice$end -> TSS_list
               }

TSR_df$id[1] -> this_gene
as.numeric(TSS_list) -> TSS_list
var(TSS_list) -> var_TSS
#print(var_TSS) #for debugging
entropy(TSS_list,method="ML",unit="log2") -> shan_ent_TSS
#print(shan_ent_TSS) #for debugging
length(TSS_index) -> num_TSS

final_df[i,] <- c(this_gene,var_TSS,shan_ent_TSS,num_TSS)
       }

final_df <- as.data.frame(final_df)
names(final_df) <- c("gene","var","entropy","nTSS") #adding the names to the data_frame

return(final_df)
}
	## Code to generate Figure 2 ## current code generates a two-panel figure; the lower panel (SD) is problematic at this point (12/12/14)

	#loading appropriate packages
	library('GenomicRanges')

	#loading R script
	source("tsr_overlaps.R")

	#setting the working directory
	setwd("/DATA/GROUP/rtraborn/tsrchitect_test")

	#removing scientific notation
	options(scipen=999)

	#importing the mapped, PEAT-derived TSSs from cagep
	read.table(file="A_thaliana_PEAT.cagep",header=F) -> At_PEAT_TSSs

	#importing the putative promoter annotations from TSRchitect
	read.table(file="atpeatbed.bed",header=F) -> At_PEAT_TSRs

	#naming the BED columns from the above
	c("chr","start","end","id","Num_TSSs","strand") -> names(At_PEAT_TSRs)

	#creating a GenomicRanges object from the At_PEAT_TSRs bed file
	TSRs_bed <- with(At_PEAT_TSRs, GRanges(chr, IRanges(start, end), strand, Num_TSSs, id=id))

	#will print top of object #works in previous (12/8) commit
	TSRs_bed

	#naming the columns from the cagep file with TSSs
	c("chr","id","strand","start","end") -> names(At_PEAT_TSSs)

	#creating a GenomicRanges objects from the At_PEAT_TSSs file
	TSSs_PEAT <- with(At_PEAT_TSSs, GRanges(chr, IRanges(start,end), strand,id=id))

	#this will print top of TSSs_PEAT object #worked without issue 12/9
	TSSs_PEAT

	#finding the overlaps between TSRs and TSSs
	findOverlaps(TSRs_bed,TSSs_PEAT) -> TSR_ol

	#running tsr_overlaps
	tsr_overlaps(TSR_ol,At_PEAT_TSSs,TSRs_bed) -> complete_df

	#printing the output
	print(head(complete_df))

	#setting working directory to our git repo
	setwd("/home/rtraborn/manuscripts/figure_repos/Figure_2")

	#preparing the figure
	png(file="Figure_2_plot.png",height=1800,width=900)

	par(mfrow=c(2, 1)) #setting up two figures, above and below

	na.omit(complete_df$entropy) -> entropy_vec
	na.omit(complete_df$var) -> var_vec

	plot(density(entropy_vec), main = "Distribution of Promoter Entropy within promoters",xlab="Shannon Entropy of TSSs in Putative Promoters (TSRs)\n Most uniform -> Most Diverse",ylab="Density",col="red2")
	plot(density(sd_vec), main = "Distribution of the variances of TSSs within promoters", xlab="Variances of TSSs in Putative Promoters (TSRs)\n (n=5097 genes) ", ylab="Density",col="blue2",xlim=c(0,100000))

	dev.off()
	###iterates over a findOverlaps() GRanges object and calculates the shannon entropy score and standard deviation for each TSR

	tsr_overlaps <- function(overlaps,cagepGR,TSR) {
	library('entropy')

	overlaps_df <- as.data.frame(overlaps)
	TSS_df <- as.data.frame(cagepGR)
	TSR_df <- as.data.frame(TSR)

	final_df <- array(NA,c(nrow(TSR_df),4))

	for (i in unique(overlaps_df[,1])) {
	i -> this_TSR
	print(this_TSR)
	which(overlaps_df[,1]==this_TSR) -> TSS_index
	TSS_df[TSS_index,] -> TSS_slice
	if (TSS_slice$strand[1]=="+") { #for positive strand
	TSS_slice$start -> TSS_list
	}
	if (TSS_slice$strand[1]=="-") { #for negative strand
	TSS_slice$end -> TSS_list
	}

	TSR_df$id[1] -> this_gene
	as.numeric(TSS_list) -> TSS_list
	var(TSS_list) -> var_TSS
	#print(var_TSS) #for debugging
	entropy(TSS_list,method="ML",unit="log2") -> shan_ent_TSS
	#print(shan_ent_TSS) #for debugging
	length(TSS_index) -> num_TSS

	final_df[i,] <- c(this_gene,var_TSS,shan_ent_TSS,num_TSS)
	}

	final_df <- as.data.frame(final_df)
	names(final_df) <- c("gene","var","entropy","nTSS") #adding the names to the data_frame

	return(final_df)
	}