Created
February 1, 2015 05:04
-
-
Save marioroy/29d4cd90a310d63ff219 to your computer and use it in GitHub Desktop.
Display bioID and recordLength (for FASTA files *.fa, *.fasta)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use strict; | |
use warnings; | |
## use threads; ## Uncomment if threads is desired | |
use MCE::Flow 1.600; ## Requires v1.600 to run | |
## | |
## time CKSIZE=1 DOSLRP=0 NPROCS=4 ./fasta_id.pl hg19.fa | wc -l | |
## time CKSIZE=64m DOSLRP=1 NPROCS=4 ./fasta_id.pl hg19.fa | wc -l | |
## | |
## time CKSIZE=4m DOSLRP=0 NPROCS=8 ./fasta_id.pl uniref.fa | wc -l | |
## time CKSIZE=4m DOSLRP=1 NPROCS=8 ./fasta_id.pl uniref.fa | wc -l | |
## | |
## Run-time settings are taken from the environment. A variable that is
## unset, empty, or '0' falls back to the default given here.
my ($use_slurpio, $chunk_size, $max_workers) = (
    ($ENV{DOSLRP} ? $ENV{DOSLRP} : '0'),
    ($ENV{CKSIZE} ? $ENV{CKSIZE} : '2'),
    ($ENV{NPROCS} ? $ENV{NPROCS} : '4'),
);
## Build a gather callback that writes chunk results to $output_fh in
## ascending chunk-id order, starting from chunk 1.
##
## The callback takes a single string of the form "<data>:<chunk_id>".
## The id rides on the end of the data so that only one argument is
## passed. The ":<chunk_id>" suffix is stripped from $_[0] in place,
## so the argument must be modifiable.
sub output_iter {
    my ($output_fh) = @_;

    my %pending;        ## chunk_id => data, held until its turn comes
    my $next_id = 1;    ## id of the next chunk allowed to be printed

    return sub {
        ## Everything after the last ':' is the chunk id.
        my $chunk_id = substr($_[0], rindex($_[0], ':') + 1);
        my $tail_len = length($chunk_id) + 1;
        substr($_[0], -$tail_len, $tail_len, '');

        ## Fast path: the chunk is in order and nothing is waiting.
        if ($chunk_id == $next_id && keys(%pending) == 0) {
            print {$output_fh} $_[0];
            $next_id++;
            return;
        }

        ## Otherwise park the chunk, then flush every consecutive chunk
        ## that is now ready.
        $pending{$chunk_id} = $_[0];
        while (exists $pending{$next_id}) {
            print {$output_fh} delete $pending{$next_id};
            $next_id++;
        }

        return;
    };
}
## Build the "<id> <length>" report for one slurped chunk and hand it to
## MCE->gather as a single string with ":<chunk_id>" appended.
##
##   $slurp_ref - reference to a scalar holding the raw chunk text
##   $chunk_id  - chunk number, appended after a ':' on the gathered string
##
## Records are read from an in-memory filehandle using "\n>" as the input
## record separator. Dies if that filehandle cannot be opened.
sub process_slurp {
    my ($slurp_ref, $chunk_id) = @_;
    my $first_rec = 1;
    my $output    = '';

    local $/ = "\n>";

    open my $mem_fh, '<', $slurp_ref
        or die "Cannot open in-memory filehandle for chunk $chunk_id: $!";

    while (my $rec = <$mem_fh>) {
        ## Drop the '>' only when it is part of a real "\n>" separator;
        ## that '>' belongs to the next record. A record that merely ends
        ## in '>' at end of input keeps it, so its length is not
        ## under-counted.
        chop $rec if length($rec) > 1 && substr($rec, -2) eq "\n>";

        my ($bio_id) = $rec =~ /^(\S+)/;

        if ($first_rec) {
            ## The first record is reported exactly as read.
            $first_rec = 0;
            $output .= "$bio_id " . length($rec) . "\n";
        }
        else {
            ## Later records had their leading '>' consumed by the
            ## separator; put it back in the id and in the length.
            $output .= ">$bio_id " . (length($rec) + 1) . "\n";
        }
    }

    close $mem_fh;

    MCE->gather($output . ':' . $chunk_id);
}
## Run the flow over the input. RS is "\n>", and the gather callback
## from output_iter prints each worker's report to STDOUT in chunk order.
## The input is the value returned by shift (at file scope, the first
## command-line argument) or STDIN when that is false.
mce_flow_f(
    {
        chunk_size  => $chunk_size,
        max_workers => $max_workers,
        RS          => "\n>",
        use_slurpio => $use_slurpio,
        gather      => output_iter(\*STDOUT),
    },
    sub {
        my (undef, $chunk_ref, $chunk_id) = @_;

        if (ref $chunk_ref eq 'SCALAR') {
            ## A scalar reference is passed on to process_slurp as is.
            process_slurp($chunk_ref, $chunk_id);
        }
        elsif (MCE->chunk_size == 1) {
            ## With a chunk size of 1 the single record is read from $_.
            my ($record_id) = ($_) =~ /^(\S+)/;
            MCE->gather("$record_id " . length($_) . "\n:$chunk_id");
        }
        else {
            ## Otherwise $chunk_ref is dereferenced as an array of
            ## records; report one line per record, then gather once.
            my $report = '';
            for my $record (@{ $chunk_ref }) {
                my ($record_id) = $record =~ /^(\S+)/;
                $report .= "$record_id " . length($record) . "\n";
            }
            MCE->gather($report . ':' . $chunk_id);
        }
    },
    shift || \*STDIN
);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment