Skip to content

Instantly share code, notes, and snippets.

@avilella
Created May 12, 2016 12:09
Show Gist options
  • Save avilella/869c1565c5ed8ad7d176c3dc1c0ba657 to your computer and use it in GitHub Desktop.
Save avilella/869c1565c5ed8ad7d176c3dc1c0ba657 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
my $dir;
my $debug; my $verbose; my $simulate;
my $cmd; my $ret;
my $self = bless {};
use File::Basename;
# ($name,$path,$suffix) = fileparse($inputfile,@suffixlist);
# $name = fileparse($fullname,@suffixlist);
# $basename = basename($fullname,@suffixlist);
# $dirname = dirname($fullname);
my $tag = 'multicorrelate_bed';
my $no_stats;
my $no_plots;
my $suffix = '.bed.gz';
my $bedtools = 'bedtools2';
my $sep = 'versus';
# See copy of make-matrix.py at the bottom of this script
my $make_matrix = "$ENV{HOME}/bin/make-matrix.py";
my $datamash = "$ENV{HOME}/datamash/bin/datamash";
GetOptions(
'i|d|dir:s' => \$dir,
'tag:s' => \$tag,
'suffix:s' => \$suffix,
'bedtools:s' => \$bedtools,
'datamash:s' => \$datamash,
'no_plots' => \$no_plots,
'debug' => \$debug,
'no_stats' => \$no_stats,
'verbose' => \$verbose,
'simulate' => \$simulate,
);
$cmd = "find -L $dir -name \"*$suffix\"";
print STDERR "# $cmd\n";
$ret = `$cmd`; chomp $ret;
my @beds = split("\n",$ret);
my $num_samples = scalar(@beds);
foreach my $inputfile1 (@beds) {
foreach my $inputfile2 (@beds) {
$self->bedtools_jaccard($inputfile1,$inputfile2);
}
}
my $basename = basename($dir);
my $outpairwise = "$dir/$basename.$tag.samples.$num_samples.txt";
# Pairwise Jaccard
$cmd = "find $dir -name \"*$sep*.jaccard.txt\" " . q{ | grep jaccard | xargs grep "" | perl -lne 's|}.$dir."/".q{||g; print' | perl -pi -e "s/\.}.$sep.q{\./\t/" | perl -pi -e "s/.jaccard.txt:/\t/" } . " > $outpairwise";
print STDERR "# $cmd\n";
$ret = `$cmd`; chomp $ret;
# Matrix of Jaccard Statistics
my $outmatrix = "$dir/$basename.$tag.samples.$num_samples.mat";
# . q{ | awk '$1 ~ /^f/ && $2 ~ /^f/' } . "
$cmd = "awk 'NF==3' $outpairwise | python $make_matrix > $outmatrix";
print STDERR "# $cmd\n";
$ret = `$cmd`; chomp $ret;
my $outlabels = "$dir/$basename.$tag.samples.$num_samples.labels.txt";
$cmd = "cut -f 1 $outmatrix > $outlabels";
print STDERR "# $cmd\n";
$ret = `$cmd`; chomp $ret;
exit(0) if ($no_plots);
use Statistics::R;
my $R = Statistics::R->new();
$R->startR;
# matrixfile="basename.multicorrelate_bed.samples.14.mat"
# labelsfile="basename.multicorrelate_bed.samples.14.labels.txt"
$cmd = "matrixfile=\"$outmatrix\""; $R->send($cmd);
$cmd = "labelsfile=\"$outlabels\""; $R->send($cmd);
my $pcafile = "$dir/$basename.$tag.samples.$num_samples.pca.pdf";
$cmd = "pdf(\"$pcafile\",width=10,height=5)"; $R->send($cmd); print $R->read . "\n";
$cmd = <<EOF;
x <- read.table(matrixfile)
labels <- read.table(labelsfile)
library(ggplot2)
library(RColorBrewer)
blues <- colorRampPalette(c('dark blue', 'light blue'))
greens <- colorRampPalette(c('dark green', 'light green'))
reds <- colorRampPalette(c('pink', 'dark red'))
ngroups <- length(unique(labels))
pca <- princomp(x)
qplot(pca\$scores[,1], pca\$scores[,2], color=labels[,1], geom="point", size=1) + scale_color_manual(values = c(blues(4), greens(5), reds(5)))
dev.off()
EOF
$R->send($cmd);
print STDERR "pcafile:\n";
print "$pcafile\n";
$cmd = "matrixfile=\"$outmatrix\""; $R->send($cmd);
my $heatmapfile = "$dir/$basename.$tag.samples.$num_samples.heatmap.pdf";
$cmd = "pdf(\"$heatmapfile\",width=10,height=10)"; $R->send($cmd); print $R->read . "\n";
$cmd = <<EOF;
x <- read.table(matrixfile)
library(gplots)
library(RColorBrewer)
jaccard_table <- x[, -1]
jaccard_matrix <- as.matrix(jaccard_table)
heatmap.2(jaccard_matrix, col = brewer.pal(9,"Blues"), margins = c(14, 14), density.info = "none", lhei=c(2, 8), trace= "none")
dev.off()
EOF
$R->send($cmd);
$R->stopR();
print STDERR "heatmapfile:\n";
print "$heatmapfile\n";
$DB::single=1;1;
########################################
sub bedtools_jaccard {
my $self = shift;
my $inputfile1 = shift;
my $inputfile2 = shift;
my ($name1,$path1,$suffix1) = fileparse($inputfile1,$suffix);
my $root1 = $name1; $root1 =~ s/_S\d+_L00[1-8]_R[12]_001//;
my ($name2,$path2,$suffix2) = fileparse($inputfile2,$suffix);
my $root2 = $name2; $root2 =~ s/_S\d+_L00[1-8]_R[12]_001//;
my $outpref = "$dir/$root1.$sep.$root2";
my $outfile = "$outpref.jaccard.tsv";
$cmd = "$bedtools jaccard -a $inputfile1 -b $inputfile2 > $outfile";
print STDERR "# $cmd\n";
$ret = `$cmd`; chomp $ret;
$cmd = "cat $outfile | awk 'NR>1' | cut -f 3 > $outpref.jaccard.txt";
print STDERR "# $cmd\n";
$ret = `$cmd`; chomp $ret;
return undef;
}
my $outfile = $path . $name . ".$tag.bed.gz";
$cmd = "bedtools2 genomecov -ibam $inputfile -bga | grep -w " . q{ '0$' } . " | gzip -c > $outfile";
print STDERR "# $cmd\n";
$ret = `$cmd`;
print STDERR "outfile\n";
print "$outfile\n";
unless ($no_stats) {
my $statsfile = $path . $name . ".$tag.stats.txt";
$cmd = "gunzip -c $outfile | " . q{ awk '{$5 = $3-$2; print $5}' }. " | $datamash --header-out count 1 sum 1 mean 1 median 1 min 1 max 1 sstdev 1 q1 1 q3 1 iqr 1 pskew 1 sskew 1 pkurt 1 skurt 1 dpo 1 jarque 1 ". q{ | perl -lne 's/field-1/} . $root . q{/g; print' } . "| $datamash transpose";
print STDERR "# $cmd\n";
$ret = `$cmd`;
open STATS, ">$statsfile" or die $!;
print STATS $ret;
close STATS;
print STDERR "statsfile\n";
print "$statsfile\n";
}
$DB::single=1;1;
$DB::single=1;1;
# run_multicorrelate_bed.pl
#
# Cared for by Albert Vilella <avilella@gmail.com>
#
# Copyright Albert Vilella
#
# You may distribute this module under the same terms as perl itself
# POD documentation - main docs before the code
=head1 NAME
run_multicorrelate_bed.pl - DESCRIPTION
=head1 SYNOPSIS
See Description
=head1 DESCRIPTION
Look for bed files in a directory and run bedtools jaccard
all-against-all. Produce a matrix for PCA and heatmap plotting
Dependencies:
bedtools version 2.+ - https://github.com/arq5x/bedtools2 [required]
datamash - https://www.gnu.org/software/datamash/download [optional]
R + library(ggplot2) + library(gplots) + library(RColorBrewer) [optional]
Perl Options:
GetOptions(
'i|d|dir:s' => \$dir,
'tag:s' => \$tag,
'suffix:s' => \$suffix,
'datamash:s' => \$datamash,
'debug' => \$debug,
'no_stats' => \$no_stats,
'verbose' => \$verbose,
'simulate' => \$simulate,
);
=head1 AUTHOR - Albert Vilella
Email avilella@gmail.com
Describe contact details here
=head1 CONTRIBUTORS
Additional contributors names and emails here
=cut
__DATA__
# make-matrix.py
#!/usr/bin/python env
import sys
import collections
matrix = collections.defaultdict(dict)
for line in sys.stdin:
fields = line.strip().split()
matrix[fields[0]][fields[1]] = float(fields[2])
keys = sorted(matrix.keys())
sys.stdout.write("\t" + "\t".join(keys) + '\n')
for k in keys:
sys.stdout.write(k)
for j in keys:
sys.stdout.write('\t' + str(matrix[k][j]))
sys.stdout.write('\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment