Skip to content

Instantly share code, notes, and snippets.

@ShujiaHuang
Last active February 26, 2016 01:36
Show Gist options
  • Save ShujiaHuang/0ffc43e06b525282b0a3 to your computer and use it in GitHub Desktop.
Save ShujiaHuang/0ffc43e06b525282b0a3 to your computer and use it in GitHub Desktop.
用于判定窗口长度,完成窗口定位和窗口内甲基化率计算,常用于甲基化Canonical分析
#Author : Shujia Huang
#Date : 2010/11/27
#!/usr/bin/perl -w
use strict;
use warnings;
my ( $file, $outfile_prefix, @bin_num ) = @ARGV;
my %region2num = ( "1000upstream" => 0, "first-exon" => 1, "intron" => 2,
"mid-exon" => 3, "last-exon" => 4, "1000downstream" => 5 );
my %final_data;
my @inf;
open ( I, $file ) or die "Cannot open : $file\n";
#1000upstream supercont2.12 + ureg_07933 1 737 0 67 CHH CAT 1.00 0 1 2
#1000upstream supercont2.12 + ureg_07933 1 737 0 100 CHH CAC 1.00 0 1 2
#1000upstream supercont2.12 + ureg_07933 1 737 0 102 CG CG 1.00 0 1 2
while ( <I> ) {
chomp;
@inf = split ( /\s+/, $_ );
$inf[0] = lc ( $inf[0] ); ##For any contingency
next if ( $inf[0] =~ /utr/i );
my @bin_length = decided_bin_length ( $inf[4], $inf[5], $bin_num[ $region2num{$inf[0]} ] );
my $win = decided_win ( $inf[4], $inf[5], $inf[7], \@bin_length); ## Window number counts from 0;
$win = ( $inf[2] eq '-') ? ( $bin_num[ $region2num{$inf[0]} ] - $win - 1 ) : $win;
my $region = $region2num{ $inf[0] };
my $pattern = join "",( (split (//, $inf[-5]))[0,1] );
$pattern = uc ($pattern);
$final_data{$pattern}{$inf[3]}{$region}[$win]->[0] += $inf[-3];
$final_data{$pattern}{$inf[3]}{$region}[$win]->[1] += $inf[-2];
$final_data{"all"}{$inf[3]}{$region}[$win]->[0] += $inf[-3];
$final_data{"all"}{$inf[3]}{$region}[$win]->[1] += $inf[-2];
}
close ( I );
#### Display
my %num2region = reverse %region2num;
for my $pattern ( keys %final_data ) {
my @gene_list = keys %{ $final_data{$pattern} };
my $site = 0;
open ( O, ">$outfile_prefix.$pattern" ) or die "Cannot write to file : $outfile_prefix.$pattern\n";
print O join "\t",( "region", "Window", @gene_list );print O "\n";
for my $region ( sort { $a<=> $b }keys %num2region ) {
for (my $i = 0; $i < $bin_num[$region]; ++$i ) {
++$site;
print O join "\t", ( $num2region{$region}, $site );
for my $gene ( @gene_list ) {
if ( !exists $final_data{$pattern}{$gene}{$region}[$i] ){
$final_data{$pattern}{$gene}{$region}[$i] = [0,0];
}
my $met = $final_data{$pattern}{$gene}{$region}[$i]->[0];
my $umet = $final_data{$pattern}{$gene}{$region}[$i]->[1];
my $met_level = ( $met + $umet ) ? $met / ( $met + $umet ) : -1;
print O "\t$met_level";
}
print O "\n";
}
}
close ( O );
}
#################################
######## Sub Function ###########
sub decided_bin_length {
my ( $start, $end, $bin_num ) = @_;
die "Region ERROR\n" if ( $end < $start );
my $length = $end - $start;
my $length_per_bin = int ( $length / $bin_num );
my $remainder = $length % $bin_num;
my @bin_length = ( $length_per_bin ) x $bin_num;
if ( $length < $bin_num ) {
@bin_length[0..($length-1)] = (1) x $length;
return @bin_length; ## Must return the array value now! Don't contiune or it'll get wrong result!
}
for ( my $i = 1; $i <= $remainder; ++$i ) {
$bin_length[$i - 1]++;
}
return @bin_length;
}
sub decided_win {
my ( $region_start, $region_end, $cout_site, $bin_length ) = @_;
my $delta = $cout_site - $region_start;
my $length = 0;
my $win;
my $i;
for ( $i = 0; $i < @$bin_length ; ++$i ) {
if ( $delta >= $length ) {
$length += $$bin_length[$i];
} else {
$win = ( $i > 0 ) ? $i - 1 : 0 if ( $delta >= 0);
last;
}
}
if ( $cout_site <= $region_end ) {
if ( !defined $win ) {
$win = $i - 1;
}
}
die "Out of region\nRegion: $region_start - $region_end\nCout file site: $cout_site\n" if ( !defined $win);
return $win;
}
@ShujiaHuang
Copy link
Author

Usage:perl pure_data.pl inputfile output 50 10 20 10 10 50

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment