# laceysanderson/prefix_fasta.pl

## prefix_fasta.pl
#!/usr/bin/perl
use strict;
use warnings;

# Purpose: Prefix all record names in a FASTA file.
#
# Usage: prefix_fasta.pl <input FASTA> <output FASTA> <prefix>
#
# Arguments:
# 1. Input FASTA file
# 2. Output FASTA file
# 3. Prefix that you would like added to all the FASTA records.
#
# Every header line (">ID optional description") is rewritten as
# ">PREFIXID optional description"; every other line is copied unchanged.
# Progress is reported on STDOUT. As a sanity check, the line and header
# counts seen while rewriting are compared with counts obtained from wc(1)
# and grep(1); a mismatch is reported on STDERR and the script exits with
# status 1.

if (@ARGV < 3) {
    die "Usage: $0 <input FASTA> <output FASTA> <prefix>\n";
}
my ($filein, $fileout, $prefix) = @ARGV;

print "\nInput File: ".$filein."\n";
print "Output File: ".$fileout."\n";
print "Prefix: ".$prefix."\n";

# Run an external counting tool and return the first integer it prints
# (0 if it prints none).
# The command is passed as a list, so no shell is involved and file names
# containing spaces or shell metacharacters are handled safely.
sub count_with_tool {
    my @command = @_;

    open(my $pipe, '-|', @command)
        or die "Error: unable to run $command[0]: $!\n";
    my $output = <$pipe>;

    # The exit status is deliberately ignored: grep exits with 1 when nothing
    # matches, yet it still prints the count (0).
    close($pipe);

    # Allow leading blanks: BSD/macOS wc right-aligns the count.
    if (defined $output && $output =~ /^\s*(\d+)/) {
        return $1;
    }
    return 0;
}

# Open the input first so a missing or unreadable file is reported up front.
open(my $in, '<', $filein) or die "Error: unable to read $filein: $!\n";

# Refuse to write over the input: opening the output truncates it before a
# single line has been read. Device and inode are compared so that different
# paths to the same file are caught too.
my @in_id  = (stat $in)[0, 1];
my @out_id = (stat $fileout)[0, 1];    # empty when the output does not exist yet
if (@in_id && @out_id && $in_id[1] && $in_id[0] == $out_id[0] && $in_id[1] == $out_id[1]) {
    die "Error: input and output are the same file ($filein)\n";
}

# Determine filesize, etc. to help with reporting of progress.
# "--" stops option parsing so a file name starting with "-" is not taken as
# an option; "^>" counts only lines that start a FASTA record.
my $wclines = count_with_tool('wc', '-l', '--', $filein);
my $grepc   = count_with_tool('grep', '-c', '-e', '^>', '--', $filein);

print "\nExpected:\n  - $wclines lines\n  - $grepc headers\n";

open(my $out, '>', $fileout) or die "Error: unable to write $fileout: $!\n";

# For each line of the input file...
my $lines = 0;
my $headers = 0;
my $missing_final_newline = 0;
while (my $line = <$in>) {
    $lines++;

    # Only the value from the last line survives the loop.
    $missing_final_newline = ($line =~ /\n\z/) ? 0 : 1;

    if ($line =~ /^>(\S+)\s*(.*)/) {
        my ($id, $description) = ($1, $2);
        $headers++;

        # Append the description only when there is one, so IDs without a
        # description do not gain a trailing space.
        my $header = '>' . $prefix . $id;
        $header .= ' ' . $description if length $description;
        print {$out} $header, "\n";
    }
    # This is not a FASTA header so just print it out.
    else {
        print {$out} $line;
    }
}
close($in);

# Buffered write errors (e.g. a full disk) only surface when closing.
close($out) or die "Error: unable to finish writing $fileout: $!\n";

print "\nProcessed:\n  - $lines lines\n  - $headers headers\n";

# Basic error reporting.
my $problems = 0;

# wc -l counts newline characters, so a final line without a trailing newline
# is read here but not counted by wc; allow for that one line.
if ($wclines + $missing_final_newline != $lines) {
    warn "ERROR: the number of LINES processed differs from the number of lines expected ($lines versus $wclines)\n";
    $problems++;
}
# A mismatch here means some line starting with ">" has no sequence ID right
# after it and was therefore copied without a prefix.
if ($grepc != $headers) {
    warn "ERROR: the number of FASTA HEADERS processed differs from the number of headers expected ($headers versus $grepc)\n";
    $problems++;
}

exit($problems ? 1 : 0);
	# NOTE(review): everything from here on appears to be a second copy of the
	# script above. The first copy ends with exit(), so this code is never
	# executed, but it is still compiled; it is kept syntactically valid and is
	# a candidate for removal.
	#!/usr/bin/perl -w
	use strict;
	use warnings;

	# Purpose: Prefix all record names in a FASTA file.
	# Arguments:
	# 1. Input FASTA file
	# 2. Output FASTA File
	# 3. Prefix that you would like added to all the FASTA records.

	my $filein = $ARGV[0];
	my $fileout = $ARGV[1];
	my $prefix = $ARGV[2];

	print "\nInput File: ".$filein."\n";
	print "Output File: ".$fileout."\n";
	print "Prefix: ".$prefix."\n";

	# Determine filesize, etc. to help with reporting of progress.
	my $wclines = `wc -l $filein`;
	# Keep only the count: drop everything from the first space onwards.
	$wclines =~ s/ .*$//g;
	chomp($wclines);
	my $grepc = `grep -c ">" $filein`;
	# Trim leading and trailing whitespace (including the newline).
	$grepc =~ s/^\s+|\s+$//g;

	print "\nExpected:\n - $wclines lines\n - $grepc headers\n";

	# Open files.
	open(IN, '<', $filein) || die "Error: $!\n";
	open(OUT, '>', $fileout) || die "Error: $!\n";

	# For each line of the input file...
	my $lines = 0;
	my $headers = 0;
	while (<IN>) {
	    $lines++;
	    # Note: If $2 doesn't match anything (ie. no whitespace in seq IDs) then there will be
	    # a trailing whitespace after each seq ID in the new file.
	    if (/^>(\S+)\s*(.*)/) {
	        $headers++;
	        print OUT ">".$prefix.$1.' '.$2."\n";
	    }
	    # This is not a FASTA header so just print it out.
	    else {
	        print OUT $_;
	    }
	}

	print "\nProcessed:\n - $lines lines\n - $headers headers\n";

	# Basic error reporting.
	if ($wclines != $lines) {
	    print "ERROR: the number of LINES processed differs from the number of lines expected ($lines versus $wclines)\n";
	}
	if ($grepc != $headers) {
	    print "ERROR: the number of FASTA HEADERS processed differs from the number of headers expected ($headers versus $grepc)\n";
	}

	close(IN);
	close(OUT);
	exit();