Skip to content

Instantly share code, notes, and snippets.

@gingi
Created April 26, 2012 16:34
Show Gist options
  • Save gingi/2500840 to your computer and use it in GitHub Desktop.
Save gingi/2500840 to your computer and use it in GitHub Desktop.
A script for smoothing BED files.
#!/usr/local/bin/perl
use strict;
use warnings;
use Readonly;
use List::MoreUtils qw/any/;
use Getopt::Long;
use Pod::Usage;
Readonly my $CHR_COLUMN => 0;
Readonly my $START_COLUMN => 1;
Readonly my $END_COLUMN => 2;
Readonly my $SCORE_COLUMN => 3;
Readonly my $DEFAULT_WINDOW_SIZE => 5e3;
Readonly my $TALLY_COUNT => 1;
Readonly my $TALLY_ADD => 2;
MAIN: {
my ($window_size, $input_file, $output_file, $help, $man, $tally);
my ($infh, $outfh);
GetOptions(
"window|w=s" => \$window_size,
"input|i=s" => \$input_file,
"output|o=s" => \$output_file,
"help|h" => \$help,
"man|m" => \$man,
"count|c" => sub { $tally = $TALLY_COUNT },
"add|a" => sub { $tally = $TALLY_ADD }
) or pod2usage(-verbose => 1);
pod2usage(-verbose => 1) if ($help);
pod2usage(-verbose => 2) if ($man);
$window_size ||= $DEFAULT_WINDOW_SIZE;
$input_file ||= $ARGV[0];
$output_file ||= $ARGV[1];
if (defined($input_file)) {
die "Cannot find input file $input_file.\n" unless (-e $input_file);
die "Cannot open input file $input_file.\n" unless (-r $input_file);
open($infh, '<', $input_file)
or die "Can't open input file $input_file: $!\n";
} else {
$infh = \*STDIN;
}
if (defined($output_file)) {
open($outfh, '>', $output_file)
or die "Can't open output file $output_file: $!\n";
} else {
$outfh = \*STDOUT;
}
combine_intervals(
input => $infh,
output => $outfh,
window_size => $window_size,
tally => $tally
);
}
sub combine_intervals {
my %params = ref($_[0]) eq 'HASH' ? %{ $_[0] } : @_;
my $infh = $params{input};
my $current_chr = undef;
my $window
= window($params{output}, $params{window_size}, $params{tally});
while (my $line = <$infh>) {
chomp $line;
my @row = split(/\s+/, $line);
my $interval_size = $row[$END_COLUMN] - $row[$START_COLUMN] + 1;
if (!defined $current_chr || $row[$CHR_COLUMN] ne $current_chr) {
if ($window->buffer) {
$window->print;
}
$window->init($row[$CHR_COLUMN]);
$current_chr = $row[$CHR_COLUMN];
}
$window->add_value($row[$START_COLUMN], $row[$END_COLUMN],
$row[$SCORE_COLUMN]);
while ($window->end < $row[$END_COLUMN]) {
$window->print;
$window->shift;
$window->add_value($row[$START_COLUMN], $row[$END_COLUMN],
$row[$SCORE_COLUMN]);
}
}
if ($window->buffer) {
$window->print;
}
}
sub window {
my ($outfh, $size, $tally) = @_;
my $window_start = 0;
my $window_end = $size - 1;
my $score = 0;
my $chr = 5;
my $buffer = 0;
my $package = "Window";
no strict 'refs';
*{"${package}::init"} = sub {
my ($self, $c) = @_;
$chr = $c;
$window_start = 0;
$window_end = $size - 1;
$buffer = 1;
};
*{"${package}::score"} = sub {$score};
*{"${package}::start"} = sub {$window_start};
*{"${package}::end"} = sub {$window_end};
*{"${package}::shift"} = sub {
$window_start += $size;
$window_end += $size;
$score = 0;
$buffer = 1;
};
if ($tally) {
*{"${package}::print"} = sub {
printf $outfh "%-5s %9d %9d %6d\n", $chr, $window_start,
$window_end,
$score;
$buffer = 0;
};
my $func
= ($tally == $TALLY_COUNT)
? sub { $score++; }
: sub { $score += $_[0] };
*{"${package}::add_value"} = sub {
my ($self, $istart, $iend, $value) = @_;
$func->($value);
};
} else {
*{"${package}::print"} = sub {
printf $outfh "%-5s %9d %9d %6.5f\n", $chr, $window_start,
$window_end,
$score;
$buffer = 0;
};
*{"${package}::add_value"} = sub {
my ($self, $istart, $iend, $iscore) = @_;
if ($iscore =~ m/[^\d\.]/) {
$iscore = 0;
}
my $length = $iend - $istart + 1;
if ($window_end < $iend) {
$iscore *= ($window_end - $istart + 1) / $length;
} elsif ($window_start < $iend && $window_start > $iend) {
$iscore *= ($iend - $window_end + 1) / $length;
}
$score += $iscore;
$buffer = 1;
};
}
*{"${package}::buffer"} = sub {$buffer};
use strict 'refs';
return bless {}, $package;
}
=head1 NAME
smooth.pl - Smooths interval data for visualization.
=head1 SYNOPSIS
perl smooth.pl [options] input.bed output.bed
perl smooth.pl [options] < input.bed > output.bed
perl smooth.pl [options]
Options:
-h --help
-m --man
-w --window
-i --input
-o --output
-c --count
-a --add
=head1 OPTIONS
B<-h --help>
Print a brief help message and exit.
B<-m --man>
Print the man page and exit.
B<-w --window>
Specify the interval window size (Default: 5,000 bp).
B<-i --input>
An input BED file. If not specified, will use STDIN.
B<-o --output>
An output BED file. If not specified, will use STDOUT.
B<-c --count>
Count the intervals without using the value. This option is useful when the value is non-numeric and just the presence of intervals are meant to be significant (e.g., this region is a gene).
B<-a --add>
Use the numeric value to obtain the sum over a given window.
=head1 DESCRIPTION
B<smooth.pl>
Takes input (in BED format) of arbitrary values across dense genomic
intervals and generates new BED output of uniform length intervals wher the
values are smoothed using either a count (if the source value is boolean,
meaning it either exists or doesn't) or sum of numeric values.
=head1 AUTHOR
Shiran Pasternak (shiran@cshl.edu)
=head1 COPYRIGHT
Copyright (c) 2012 Cold Spring Harbor Laboratory.
=cut
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment