Skip to content

Instantly share code, notes, and snippets.

@marioroy
Created February 1, 2015 05:04
Show Gist options
  • Save marioroy/29d4cd90a310d63ff219 to your computer and use it in GitHub Desktop.
Save marioroy/29d4cd90a310d63ff219 to your computer and use it in GitHub Desktop.
Display bioID and recordLength (for FASTA files *.fa, *.fasta)
#!/usr/bin/env perl
use strict;
use warnings;
## use threads; ## Uncomment if threads is desired
use MCE::Flow 1.600; ## Requires v1.600 to run
##
## time CKSIZE=1 DOSLRP=0 NPROCS=4 ./fasta_id.pl hg19.fa | wc -l
## time CKSIZE=64m DOSLRP=1 NPROCS=4 ./fasta_id.pl hg19.fa | wc -l
##
## time CKSIZE=4m DOSLRP=0 NPROCS=8 ./fasta_id.pl uniref.fa | wc -l
## time CKSIZE=4m DOSLRP=1 NPROCS=8 ./fasta_id.pl uniref.fa | wc -l
##
## Runtime settings, each taken from an environment variable. A variable
## that is unset, empty, or '0' selects the default given after '||'.
## The three values are handed to MCE as the use_slurpio, chunk_size and
## max_workers options respectively.
my ($use_slurpio, $chunk_size, $max_workers) = (
    $ENV{DOSLRP} || '0',
    $ENV{CKSIZE} || '2',
    $ENV{NPROCS} || '4',
);
## Build a gather callback that writes chunk results to $output_fh in
## ascending chunk-id order, starting at 1, whatever order they arrive in.
##
## The callback takes one string of the form "<payload>:<chunk_id>". The id
## travels inside the single string, rather than as a second argument,
## because MCE->freeze is called when more than one argument is sent.
## The ":<chunk_id>" suffix is removed from $_[0] in place, so the
## argument must be a modifiable scalar. The callback returns nothing.
sub output_iter {
    my ($output_fh) = @_;

    my %pending;        ## payloads that arrived early, keyed by chunk id
    my $next_id = 1;    ## id of the chunk that has to be written next

    return sub {
        ## Everything after the last ':' is the chunk id; cut it (and the
        ## colon) off the end of the payload.
        my $chunk_id = substr($_[0], rindex($_[0], ':') + 1);
        my $tail_len = length($chunk_id) + 1;
        substr($_[0], -$tail_len, $tail_len, '');

        if ($chunk_id != $next_id || keys(%pending) != 0) {
            ## Out of order, or older payloads are still waiting: park
            ## this one, then write out whatever is now contiguous.
            $pending{$chunk_id} = $_[0];
            while (exists $pending{$next_id}) {
                print {$output_fh} delete $pending{$next_id++};
            }
        }
        else {
            ## Fast path: the expected chunk arrived and nothing is queued.
            print {$output_fh} $_[0];
            $next_id++;
        }

        return;
    };
}
## Summarise every record of a slurped chunk and gather the result.
##
## Arguments:
##   $slurp_ref - reference to the scalar holding the raw chunk text
##   $chunk_id  - id of the chunk, appended to the gathered string
##
## The chunk is read through an in-memory filehandle using "\n>" as the
## record separator. One "<id> <length>\n" line is built per record, where
## <id> is the leading run of non-whitespace characters, and all lines are
## sent with a single MCE->gather call as "<lines>:<chunk_id>".
##
## Returns whatever MCE->gather returns.
## Dies if the in-memory filehandle cannot be opened; previously that
## failure was ignored and the chunk was reported as empty.
sub process_slurp {
    my ($slurp_ref, $chunk_id) = @_;

    my $first_rec = 1;
    my $output    = '';

    local $/ = "\n>";

    open my $mem_fh, '<', $slurp_ref
        or die "Cannot open in-memory handle for chunk $chunk_id: $!\n";

    while (my $rec = <$mem_fh>) {
        ## readline keeps the separator on the record; the trailing '>' is
        ## really the first character of the next record, so drop it.
        chop $rec if substr($rec, -1, 1) eq '>';

        my ($bio_id) = $rec =~ /^(\S+)/;
        $bio_id = '' unless defined $bio_id;    ## no id token: report it as empty

        if ($first_rec) {
            ## The first record is reported exactly as it was read.
            $first_rec = 0;
            $output .= "$bio_id " . length($rec) . "\n";
        }
        else {
            ## Later records lost their leading '>' to the separator, so
            ## put it back on the id and count it in the length.
            $output .= ">$bio_id " . (length($rec) + 1) . "\n";
        }
    }

    close $mem_fh;

    return MCE->gather($output . ':' . $chunk_id);
}
## Options for the flow: input is split on "\n>", and results are written
## to STDOUT in chunk order by the output_iter callback.
my %flow_options = (
    chunk_size  => $chunk_size,
    max_workers => $max_workers,
    RS          => "\n>",
    use_slurpio => $use_slurpio,
    gather      => output_iter(\*STDOUT),
);

## Worker body, called once per chunk with ($mce, $chunk_ref, $chunk_id).
## Each record becomes one "<id> <length>\n" line, and the lines are
## gathered as a single string ending in ":<chunk_id>" so that the gather
## callback can restore chunk order.
my $summarize_chunk = sub {
    my ($mce, $chunk_ref, $chunk_id) = @_;

    if (ref($chunk_ref) eq 'SCALAR') {
        ## A scalar ref means the chunk arrived as raw text.
        process_slurp($chunk_ref, $chunk_id);
    }
    elsif (MCE->chunk_size == 1) {
        ## Single-record chunks: the record is read from $_.
        my ($bio_id) = /^(\S+)/;
        MCE->gather("$bio_id " . length($_) . "\n:$chunk_id");
    }
    else {
        ## Otherwise $chunk_ref is an array ref holding the records.
        my $summary = '';
        for my $record (@{ $chunk_ref }) {
            my ($bio_id) = $record =~ /^(\S+)/;
            $summary .= "$bio_id " . length($record) . "\n";
        }
        MCE->gather($summary . ':' . $chunk_id);
    }
};

## Input is the first command-line argument, or STDIN when none is given.
mce_flow_f(\%flow_options, $summarize_chunk, shift || \*STDIN);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment