Santhilal Subhash decodebiology

## baplot.R
# Bland-Altman plot R function.
# Author: jmmateos@mce.hggm.es
baplot <- function(m1, m2, ...) {
    # m1 and m2 are the measurements
    means <- (m1 + m2) / 2
    diffs <- m1 - m2
    mdiff <- mean(diffs)
    sddiff <- sd(diffs)

    # Compute the figure limits

## EnsemblGTFtoAnnotation.sh
echo `date`;
dir=(`dirname $1`)

base=(`basename $1`)


echo "1/7 Preparing files";
cat $1 | sed 's/ "/\t/g' | sed 's/"; /\t/g' | cut -f1,4,5,7,10,12,16,18,22 | sed 's/";//' | awk '!x[$6]++' | awk '{print $6"\t"$0;}' > $dir/enst_annotation.tmp;

## sample.GTF
chr1	processed_transcript	exon	11869	12227	.	+	.	gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; exon_number "1"; gene_name "DDX11L1"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; exon_id "ENSE00002234944";
chr1	processed_transcript	exon	12613	12721	.	+	.	gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; exon_number "2"; gene_name "DDX11L1"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; exon_id "ENSE00003582793";
chr1	processed_transcript	exon	13221	14409	.	+	.	gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; exon_number "3"; gene_name "DDX11L1"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; exon_id "ENSE00002312635";
chr1	transcribed_unprocessed_pseudogene	exon	11872	12227	.	+	.	gene_id "ENSG00000223972"; transcript_id "ENST00000515242"; exon_number "1"; gene_name "DDX11L1"; gene_biotype "pseudogene"; transcript_name "DDX11L1-201"; exon_id "ENSE00002234632";
chr1	transcribed_unprocessed_pseudogene	exon	12613	12721	.	+	.	gene_id "ENSG000002

## sample.GTF_ensg_annotation.txt
ENSG00000223972	chr1	11869	14412	+	DDX11L1	pseudogene
ENSG00000227232	chr1	14363	29806	-	WASH7P	pseudogene
ENSG00000243485	chr1	29554	31109	+	MIR1302-11	lincRNA

## sample.GTF_enst_annotation.txt
ENST00000423562	chr1	14363	29370	-	ENSG00000227232	WASH7P	pseudogene
ENST00000438504	chr1	14363	29370	-	ENSG00000227232	WASH7P	pseudogene
ENST00000450305	chr1	12010	13670	+	ENSG00000223972	DDX11L1	pseudogene
ENST00000456328	chr1	11869	14409	+	ENSG00000223972	DDX11L1	pseudogene
ENST00000469289	chr1	30267	31109	+	ENSG00000243485	MIR1302-11	lincRNA
ENST00000473358	chr1	29554	31097	+	ENSG00000243485	MIR1302-11	lincRNA
ENST00000488147	chr1	14404	29570	-	ENSG00000227232	WASH7P	pseudogene
ENST00000515242	chr1	11872	14412	+	ENSG00000223972	DDX11L1	pseudogene
ENST00000518655	chr1	11874	14409	+	ENSG00000223972	DDX11L1	pseudogene
ENST00000538476	chr1	14411	29806	-	ENSG00000227232	WASH7P	pseudogene

## heatmap_ChIP-seq.r
# This R script is to generate the TF or histone modification heatmap
# at certain genomic features (TSS, enhancers) from the ChIP-seq data
# the input matrix is got from Homer software. alternative to R, use cluster3 to cluster, and visualize by # java Treeviewer
# generate the matrix by Homer: annotatePeaks.pl peak_file.txt hg19 -size 6000 -hist 10  -ghist -d TF1/ # > outputfile_matrix.txt
# see several posts for heatmap:
# http://davetang.org/muse/2010/12/06/making-a-heatmap-with-r/
# http://www.r-bloggers.com/r-using-rcolorbrewer-to-colour-your-figures-in-r/
# 08/20/13 by Tommy Tang

# it is such a simple script but took me several days to get it work...I mean the desired

## heatmap_ChIP-seq_2.r
#The X axis is -3 kb to 3 kb around TSS.

#It turns out that the heatmap.2 function use default hclust ( Hierachical Clustering) to cluster the #matrix.
#alternatively, we can use K-means clustering to cluster the data and to see what's the pattern look like.

km<- kmeans(m1,2)   # determine how many cluster you want, I specify 2 here

m1.kmeans<- cbind(m1, km$cluster)   # combine the cluster with the matrix

dim(m1.kmeans)

## heatmap_ChIP-seq_3.r
m.row.sum<- cbind(m1, rowSums(m1))
o1<- rev(order(m.row.sum[,602]))

m.row.sum<- m.row.sum[o1,]

bk = unique(c(seq(-0.1,3, length=100),seq(3,10.35,length=100)))

hmcols<- colorRampPalette(c("white","red"))(length(bk)-1)

pheatmap( m.row.sum[,1:601], cluster_rows = F, cluster_cols = F, col= hmcols, breaks = bk, legend=FALSE, show_rownames=FALSE, show_colnames=FALSE)

## confidence_interval.R
#USAGE: CI_normal(matrix[,c(x:y)],97.5) - multiple column from a normal distribution
#USAGE: CI_tdist(matrix[,c(x:y)],97.5) - multiple column from a t distribution
#USAGE: CI_normal(matrix[,x],97.5) - single column from a normal distribution
#USAGE: CI_tdist(matrix[,x],97.5) - single column from a t distribution
#Created and updated by Santhilal Subhash on 2016/06/07
CI_normal <- function(li,stat)
{
	cat(paste0("CI","\t","column","\t","Lower.limit","\t","Upper.limit","\n"))
	if(length(colnames(li))>1)
	{

## CpG_feature_extraction.pl
#!/usr/bin/perl
$/=undef;
## Run script: perl CpG_feature_extraction.pl CpG_islands.bed shores/shelves (This gives warnings for 'rm' command, just ignore it)
# Reference for definition of shores and shelves from:
# http://www.mdpi.com/2073-4425/5/2/347/htm
# http://www.mdpi.com/genes/genes-05-00347/article_deploy/html/images/genes-05-00347-g001-1024.png
extend_coordinates($ARGV[0] ,$ARGV[1]);
sub extend_coordinates
{
	$type=$ARGV[1];
	# Bland-Altman plot R function.
	# Author: jmmateos@mce.hggm.es
	baplot <- function(m1, m2, ...) {
	# m1 and m2 are the measurements
	means <- (m1 + m2) / 2
	diffs <- m1 - m2
	mdiff <- mean(diffs)
	sddiff <- sd(diffs)

	# Compute the figure limits
	echo `date`;
	dir=(`dirname $1`)

	base=(`basename $1`)



	echo "1/7 Preparing files";
	cat $1 \| sed 's/ "/\t/g' \| sed 's/"; /\t/g' \| cut -f1,4,5,7,10,12,16,18,22 \| sed 's/";//' \| awk '!x[$6]++' \| awk '{print $6"\t"$0;}' > $dir/enst_annotation.tmp;
	chr1 processed_transcript exon 11869 12227 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; exon_number "1"; gene_name "DDX11L1"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; exon_id "ENSE00002234944";
	chr1 processed_transcript exon 12613 12721 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; exon_number "2"; gene_name "DDX11L1"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; exon_id "ENSE00003582793";
	chr1 processed_transcript exon 13221 14409 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; exon_number "3"; gene_name "DDX11L1"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; exon_id "ENSE00002312635";
	chr1 transcribed_unprocessed_pseudogene exon 11872 12227 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000515242"; exon_number "1"; gene_name "DDX11L1"; gene_biotype "pseudogene"; transcript_name "DDX11L1-201"; exon_id "ENSE00002234632";
	chr1 transcribed_unprocessed_pseudogene exon 12613 12721 . + . gene_id "ENSG000002
	ENSG00000223972 chr1 11869 14412 + DDX11L1 pseudogene
	ENSG00000227232 chr1 14363 29806 - WASH7P pseudogene
	ENSG00000243485 chr1 29554 31109 + MIR1302-11 lincRNA
	ENST00000423562 chr1 14363 29370 - ENSG00000227232 WASH7P pseudogene
	ENST00000438504 chr1 14363 29370 - ENSG00000227232 WASH7P pseudogene
	ENST00000450305 chr1 12010 13670 + ENSG00000223972 DDX11L1 pseudogene
	ENST00000456328 chr1 11869 14409 + ENSG00000223972 DDX11L1 pseudogene
	ENST00000469289 chr1 30267 31109 + ENSG00000243485 MIR1302-11 lincRNA
	ENST00000473358 chr1 29554 31097 + ENSG00000243485 MIR1302-11 lincRNA
	ENST00000488147 chr1 14404 29570 - ENSG00000227232 WASH7P pseudogene
	ENST00000515242 chr1 11872 14412 + ENSG00000223972 DDX11L1 pseudogene
	ENST00000518655 chr1 11874 14409 + ENSG00000223972 DDX11L1 pseudogene
	ENST00000538476 chr1 14411 29806 - ENSG00000227232 WASH7P pseudogene
	# This R script is to generate the TF or histone modification heatmap
	# at certain genomic features (TSS, enhancers) from the ChIP-seq data
	# the input matrix is got from Homer software. alternative to R, use cluster3 to cluster, and visualize by # java Treeviewer
	# generate the matrix by Homer: annotatePeaks.pl peak_file.txt hg19 -size 6000 -hist 10 -ghist -d TF1/ # > outputfile_matrix.txt
	# see several posts for heatmap:
	# http://davetang.org/muse/2010/12/06/making-a-heatmap-with-r/
	# http://www.r-bloggers.com/r-using-rcolorbrewer-to-colour-your-figures-in-r/
	# 08/20/13 by Tommy Tang

	# it is such a simple script but took me several days to get it work...I mean the desired
	#The X axis is -3 kb to 3 kb around TSS.

	#It turns out that the heatmap.2 function use default hclust ( Hierachical Clustering) to cluster the #matrix.
	#alternatively, we can use K-means clustering to cluster the data and to see what's the pattern look like.

	km<- kmeans(m1,2) # determine how many cluster you want, I specify 2 here

	m1.kmeans<- cbind(m1, km$cluster) # combine the cluster with the matrix

	dim(m1.kmeans)
	m.row.sum<- cbind(m1, rowSums(m1))
	o1<- rev(order(m.row.sum[,602]))

	m.row.sum<- m.row.sum[o1,]

	bk = unique(c(seq(-0.1,3, length=100),seq(3,10.35,length=100)))

	hmcols<- colorRampPalette(c("white","red"))(length(bk)-1)

	pheatmap( m.row.sum[,1:601], cluster_rows = F, cluster_cols = F, col= hmcols, breaks = bk, legend=FALSE, show_rownames=FALSE, show_colnames=FALSE)
	#USAGE: CI_normal(matrix[,c(x:y)],97.5) - multiple column from a normal distribution
	#USAGE: CI_tdist(matrix[,c(x:y)],97.5) - multiple column from a t distribution
	#USAGE: CI_normal(matrix[,x],97.5) - single column from a normal distribution
	#USAGE: CI_tdist(matrix[,x],97.5) - single column from a t distribution
	#Created and updated by Santhilal Subhash on 2016/06/07
	CI_normal <- function(li,stat)
	{
	cat(paste0("CI","\t","column","\t","Lower.limit","\t","Upper.limit","\n"))
	if(length(colnames(li))>1)
	{
	#!/usr/bin/perl
	$/=undef;
	## Run script: perl CpG_feature_extraction.pl CpG_islands.bed shores/shelves (This gives warnings for 'rm' command, just ignore it)
	# Reference for definition of shores and shelves from:
	# http://www.mdpi.com/2073-4425/5/2/347/htm
	# http://www.mdpi.com/genes/genes-05-00347/article_deploy/html/images/genes-05-00347-g001-1024.png
	extend_coordinates($ARGV[0] ,$ARGV[1]);
	sub extend_coordinates
	{
	$type=$ARGV[1];