## marioroy/fasta_id.pl

## fasta_id.pl
#!/usr/bin/env perl

use strict;
use warnings;

## use threads;                ## Uncomment if threads is desired

use MCE::Flow 1.600;           ## Requires v1.600 to run

##
##  time CKSIZE=1   DOSLRP=0 NPROCS=4 ./fasta_id.pl hg19.fa   | wc -l
##  time CKSIZE=64m DOSLRP=1 NPROCS=4 ./fasta_id.pl hg19.fa   | wc -l
##
##  time CKSIZE=4m  DOSLRP=0 NPROCS=8 ./fasta_id.pl uniref.fa | wc -l
##  time CKSIZE=4m  DOSLRP=1 NPROCS=8 ./fasta_id.pl uniref.fa | wc -l
##

## Runtime settings, read from the environment with built-in defaults:
##   DOSLRP -> $use_slurpio (default '0')
##   CKSIZE -> $chunk_size  (default '2')
##   NPROCS -> $max_workers (default '4')
## An unset, empty, or '0' variable falls back to its default.
my ($use_slurpio, $chunk_size, $max_workers) = (
    $ENV{DOSLRP} || '0',
    $ENV{CKSIZE} || '2',
    $ENV{NPROCS} || '4',
);

## Build the gather callback that writes chunk output in chunk-id order.
##
## Arguments:
##   $output_fh - file handle the ordered output is printed to
##
## Returns a closure taking one string of the form "<data>:<chunk_id>".
## The id travels inside the single argument for performance (see the
## note below). The closure strips the ":<chunk_id>" suffix from its
## argument in place, so the argument must be a modifiable scalar.
sub output_iter {

    my ($output_fh) = @_;

    my $next_id = 1;    ## id of the chunk that must be written next
    my %pending;        ## data that arrived early, keyed by chunk id

    ## One can have this receive 2 arguments; $chunk_id and $chunk_data.
    ## However, MCE->freeze is called when more than 1 argument is sent.
    ## For performance, $chunk_id is attached to the end of $_[0].

    return sub {
        ## Everything after the last ':' is the chunk id.
        my $sep_pos  = rindex($_[0], ':');
        my $chunk_id = substr($_[0], $sep_pos + 1);
        my $tail_len = 1 + length $chunk_id;

        ## Remove ":<chunk_id>" from the end, leaving only the data.
        substr($_[0], -$tail_len, $tail_len, '');

        ## Fast path: chunk is the expected one and nothing is queued.
        if ($chunk_id == $next_id && !%pending) {
            print {$output_fh} $_[0];
            $next_id++;
            return;
        }

        ## Otherwise queue it, then write out every chunk that is now
        ## contiguous with the last one written.
        $pending{$chunk_id} = $_[0];

        while (exists $pending{$next_id}) {
            print {$output_fh} delete $pending{$next_id};
            $next_id++;
        }

        return;
    };
}

## Summarise every record of one slurped chunk and gather the result.
##
## Arguments:
##   $slurp_ref - reference to a scalar holding the raw chunk text
##   $chunk_id  - id of the chunk; appended to the gathered string
##
## The chunk is read record by record with "\n>" as the record separator.
## One "<id> <length>\n" line is built per record, where <id> is the first
## run of non-whitespace characters of the record. All lines are sent in a
## single MCE->gather call as "<lines>:<chunk_id>".
##
## Dies if the in-memory file handle cannot be opened.
sub process_slurp {

    my ($slurp_ref, $chunk_id) = @_;
    my $bio_id; my $first_rec = 1; my $output = '';
    local $/ = "\n>";

    ## A failed open would otherwise make the loop below read nothing and
    ## gather an empty result for this chunk, silently dropping records.
    open my $mem_fh, '<', $slurp_ref
        or die "Cannot open in-memory handle for chunk $chunk_id: $!";

    while (my $rec = <$mem_fh>) {
        ## The separator's trailing '>' belongs to the next record.
        chop $rec if (substr($rec,-1,1) eq '>');
        ($bio_id) = ($rec ) =~ /^(\S+)/;

        if ($first_rec) {
            ## The first record is reported exactly as read.
            ## NOTE(review): presumably it still starts with its own '>';
            ## confirm against how chunks are split.
            $first_rec--;

            $output .= "$bio_id " . length($rec) . "\n";
        }
        else {
            ## Later records lost their leading '>' to the previous
            ## record's separator, so put it back on the id and count it
            ## in the length.
            $output .= ">$bio_id " . (length($rec) + 1) . "\n";
        }
    }
    close $mem_fh;

    MCE->gather($output . ':' . $chunk_id);
}

## Run the flow over the file named by the first command-line argument,
## or over STDIN when no argument is given. Input is split on "\n>" and
## each chunk's "<id> <length>" lines are gathered through output_iter,
## which prints them to STDOUT in chunk order.
mce_flow_f {
    chunk_size  => $chunk_size,
    max_workers => $max_workers,
    RS          => "\n>",
    use_slurpio => $use_slurpio,
    gather      => output_iter(\*STDOUT),
},
sub {
    my ($mce, $chunk_ref, $chunk_id) = @_;

    if (ref $chunk_ref eq 'SCALAR') {
        ## The chunk came as a reference to one string (presumably the
        ## use_slurpio case - confirm); split it up in process_slurp.
        process_slurp($chunk_ref, $chunk_id);
    }
    elsif (MCE->chunk_size == 1) {
        ## With a chunk size of one the single record is taken from $_.
        my ($seq_id) = /^(\S+)/;
        MCE->gather($seq_id . ' ' . length($_) . "\n:$chunk_id");
    }
    else {
        ## Otherwise the chunk is an array of records: one line each.
        my @lines;

        for my $record (@{ $chunk_ref }) {
            my ($seq_id) = $record =~ /^(\S+)/;
            push @lines, "$seq_id " . length($record) . "\n";
        }

        MCE->gather(join('', @lines, ':', $chunk_id));
    }

}, shift || \*STDIN;
	#!/usr/bin/env perl

	use strict;
	use warnings;

	## use threads; ## Uncomment if threads is desired

	use MCE::Flow 1.600; ## Requires v1.600 to run

	##
	## time CKSIZE=1 DOSLRP=0 NPROCS=4 ./fasta_id.pl hg19.fa | wc -l
	## time CKSIZE=64m DOSLRP=1 NPROCS=4 ./fasta_id.pl hg19.fa | wc -l
	##
	## time CKSIZE=4m DOSLRP=0 NPROCS=8 ./fasta_id.pl uniref.fa | wc -l
	## time CKSIZE=4m DOSLRP=1 NPROCS=8 ./fasta_id.pl uniref.fa | wc -l
	##

	## Runtime settings, read from the environment with built-in defaults:
	##   DOSLRP -> $use_slurpio (default '0')
	##   CKSIZE -> $chunk_size  (default '2')
	##   NPROCS -> $max_workers (default '4')
	## An unset, empty, or '0' variable falls back to its default.
	## The logical-or operators are written as plain "||"; the escaped
	## form with backslashes does not compile.
	my $use_slurpio = $ENV{DOSLRP} || '0';
	my $chunk_size = $ENV{CKSIZE} || '2';
	my $max_workers = $ENV{NPROCS} || '4';

	## Build the gather callback that writes chunk output in chunk-id order.
	##
	## Arguments:
	##   $output_fh - file handle the ordered output is printed to
	##
	## Returns a closure taking one string of the form "<data>:<chunk_id>".
	## The closure strips the ":<chunk_id>" suffix from its argument in
	## place, so the argument must be a modifiable scalar.
	sub output_iter {

	    my ($output_fh) = @_;

	    my $next_id = 1;    ## id of the chunk that must be written next
	    my %pending;        ## data that arrived early, keyed by chunk id

	    ## One can have this receive 2 arguments; $chunk_id and $chunk_data.
	    ## However, MCE->freeze is called when more than 1 argument is sent.
	    ## For performance, $chunk_id is attached to the end of $_[0].

	    return sub {
	        ## Everything after the last ':' is the chunk id.
	        my $sep_pos  = rindex($_[0], ':');
	        my $chunk_id = substr($_[0], $sep_pos + 1);
	        my $tail_len = 1 + length $chunk_id;

	        ## Remove ":<chunk_id>" from the end, leaving only the data.
	        substr($_[0], -$tail_len, $tail_len, '');

	        ## Fast path: chunk is the expected one and nothing is queued.
	        if ($chunk_id == $next_id && !%pending) {
	            print {$output_fh} $_[0];
	            $next_id++;
	            return;
	        }

	        ## Otherwise queue it, then write out every chunk that is now
	        ## contiguous with the last one written.
	        $pending{$chunk_id} = $_[0];

	        while (exists $pending{$next_id}) {
	            print {$output_fh} delete $pending{$next_id};
	            $next_id++;
	        }

	        return;
	    };
	}

	## Summarise every record of one slurped chunk and gather the result.
	##
	## Arguments:
	##   $slurp_ref - reference to a scalar holding the raw chunk text
	##   $chunk_id  - id of the chunk; appended to the gathered string
	##
	## The chunk is read record by record with "\n>" as the record
	## separator. One "<id> <length>\n" line is built per record, where <id>
	## is the first run of non-whitespace characters of the record. All
	## lines are sent in a single MCE->gather call as "<lines>:<chunk_id>".
	##
	## Dies if the in-memory file handle cannot be opened.
	sub process_slurp {

	    my ($slurp_ref, $chunk_id) = @_;
	    my $bio_id; my $first_rec = 1; my $output = '';
	    local $/ = "\n>";

	    ## A failed open would otherwise make the loop below read nothing
	    ## and gather an empty result for this chunk.
	    open my $mem_fh, '<', $slurp_ref
	        or die "Cannot open in-memory handle for chunk $chunk_id: $!";

	    while (my $rec = <$mem_fh>) {
	        ## The separator's trailing '>' belongs to the next record.
	        chop $rec if (substr($rec,-1,1) eq '>');
	        ($bio_id) = ($rec ) =~ /^(\S+)/;

	        if ($first_rec) {
	            ## The first record is reported exactly as read.
	            ## NOTE(review): presumably it still starts with its own
	            ## '>'; confirm against how chunks are split.
	            $first_rec--;

	            $output .= "$bio_id " . length($rec) . "\n";
	        }
	        else {
	            ## Later records lost their leading '>' to the previous
	            ## record's separator, so put it back on the id and count
	            ## it in the length.
	            $output .= ">$bio_id " . (length($rec) + 1) . "\n";
	        }
	    }
	    close $mem_fh;

	    MCE->gather($output . ':' . $chunk_id);
	}

	## Run the flow over the file named by the first command-line argument,
	## or over STDIN when no argument is given. Input is split on "\n>" and
	## each chunk's "<id> <length>" lines are gathered through output_iter,
	## which prints them to STDOUT in chunk order.
	mce_flow_f {
	    chunk_size => $chunk_size, max_workers => $max_workers,
	    RS => "\n>", use_slurpio => $use_slurpio,
	    gather => output_iter(\*STDOUT),
	},
	sub {
	    my ($mce, $chunk_ref, $chunk_id) = @_;
	    my $bio_id;

	    if (ref $chunk_ref eq 'SCALAR') {
	        ## The chunk came as a reference to one string (presumably the
	        ## use_slurpio case - confirm); split it up in process_slurp.
	        process_slurp($chunk_ref, $chunk_id);
	    }
	    elsif (MCE->chunk_size == 1) {
	        ## With a chunk size of one the single record is taken from $_.
	        ($bio_id) = ($_) =~ /^(\S+)/;
	        MCE->gather("$bio_id " . length($_) . "\n:$chunk_id");
	    }
	    else {
	        ## Otherwise the chunk is an array of records: one line each.
	        my $output = '';

	        for my $record (@{ $chunk_ref }) {
	            ($bio_id) = $record =~ /^(\S+)/;
	            $output .= "$bio_id " . length($record) . "\n";
	        }

	        MCE->gather($output . ':' . $chunk_id);
	    }

	## The input falls back to STDIN via a plain "||"; the escaped form
	## with backslashes does not compile.
	}, shift || \*STDIN;