slavailn/match_fasta_files.pl

## match_fasta_files.pl
#! /usr/bin/perl

# Match 2 fasta files by id or by sequence
# print out common and unique entries

use strict; use warnings;
use Getopt::Long;
use List::MoreUtils qw(any);

# Variables available via command line options
my $file1;
my $file2;
my $match_by_id = 0;
my $match_by_seq = 0;
my $help = 0;

GetOptions ("file1=s" => \$file1, # first fasta file
            "file2=s" => \$file2, # second fasta file
            "match_by_id" => \$match_by_id, # match fasta sequences by id
            "match_by_seq" => \$match_by_seq,
            "help|h" => \$help
           ) # match fasta sequences by id
           or die &print_help;

if ( $help == 1 )
{
    &print_help;
}

##############################################################################################################
# MAIN BODY

# Hashes to store fasta entries from both files
my %file1_hash = ();
my %file2_hash = ();

# open fasta file
open( my $fh1, "<", $file1 ) or die ("Cannot open one of the fasta files use --help to see help message: $!\n");
open( my $fh2, "<", $file2 ) or die ("Cannot open one of the fasta files use --help to see help message: $!\n");

# Move fasta sequences from the first file to a hash
my $fasta1_id;
my $seq1;
while( <$fh1> )
{
    chomp;
    if ( /^>.+/ )
    {
        $fasta1_id = $_;
        $file1_hash{$fasta1_id} = ();
    }
    elsif ( /\A[ATCGN]+\z/i )
    {
        $seq1 = $_;
        $file1_hash{$fasta1_id} = $seq1;
    }
    else
    {
        die ( "Malformed identifier or sequence contains forbidden characters: $_\n" );
    }
}

# Move fasta sequences from the second file to a hash
my $fasta2_id;
my $seq2;
while( <$fh2> )
{
    chomp;
    if ( /^>.+/ )
    {
        $fasta2_id = $_;
        $file2_hash{$fasta2_id} = ();
    }
    elsif ( /\A^[ATCGN]+\z/i )
    {
        $seq2 = $_;
        $file2_hash{$fasta2_id} = $seq2;
    }
    else
    {
        die ( "Malformed identifier or sequence contains forbidden characters: $_\n" );
    }
}

if ( $match_by_id == 1 && $match_by_seq == 0 )
{
    &run_id_match(\%file1_hash, \%file2_hash);
}
elsif ( $match_by_id == 0 && $match_by_seq == 1 )
{
    &run_seq_match(\%file1_hash, \%file2_hash);
}
else
{
    print "You have to specify one of the options, either --match_by_id or --match_by_seq\n";
    &print_help;
}


#################################################################################################################
# SUBROUTINES
sub print_help
{
    print "\n";
    print "USAGE: perl match_fasta_files --file1 <file1.fasta> --file2 <file2.fasta> --match_by_seq --match_by_id\n";
    print "NOTE: --match_by_seq and --match_by_id flags are mutually exclusive\n";
    print "OUTPUT: This script will compare two fasta files either by id or by sequence (these two options are mutually exclusive) and output 3 fasta files:\n";
    print "unique_to_file1.fasta - entries unique to file1\n";
    print "unique_to_file2.fasta - entries unique to file2\n";
    print "common.fasta - present in both files\n";
    print "HELP: use -h OR --help to print this message and exit\n";
    print "\n";
    exit;
}

sub run_id_match
# Iterate over both fasta hashes and
# and find entries (by ids only, sequences can be different) unique to first and second
# fasta file as well as entries common between files
{
    my $file1_hash_ref = shift;
    my $file2_hash_ref = shift;
    my $id;
    my $seq;
    for ( keys %$file1_hash_ref )
    {
        if ( exists $file2_hash_ref->{$_} )
        {
            my $id = $_;
            my $seq = $file1_hash_ref->{$_};
            &print_to_common($id, $seq);
        }
        else
        {
           my $id = $_;
           my $seq = $file1_hash_ref->{$_};
           &print_to_unique_file1($id, $seq);
        }
    }

    for ( keys %$file2_hash_ref )
    {
        if ( exists $file1_hash_ref->{$_} )
        {
            next;
        }
        else
        {
           my $id = $_;
           my $seq = $file2_hash_ref->{$_};
           &print_to_unique_file2($id, $seq);
        }
    }

}


sub run_seq_match
# Iterate over both fasta hashes and
# and find entries (by sequences only ids can be different) unique to first and second
# fasta file as well as entries common between files
# Common sequence entries will written to file will be taken from file1, only a single
# sequence occurence will printed out, we don't care which one.
{
    my $file1_hash_ref = shift;
    my $file2_hash_ref = shift;
    my $id;
    my $seq;
    for my $value ( values %$file1_hash_ref )
    {
        if ( any {$_ eq $value } values %$file2_hash_ref ) # check if any sequences from the file1 are also seen in file2
        {
            my @keys = grep { $file1_hash_ref->{$_} eq $value } keys %$file1_hash_ref;
            $id  = $keys[0];
            $seq = $value;
            &print_to_common($id, $seq);
        }
        else
        {
            my @keys = grep { $file1_hash_ref->{$_} eq $value } keys %$file1_hash_ref;
            $id = $keys[0];
            $seq = $value;
            &print_to_unique_file1($id, $seq);
        }

    }

    for my $value ( values %$file2_hash_ref )
    {
        if ( any {$_ eq $value } values %$file1_hash_ref ) # check if any sequences from the file2 are seen in file1
        {
            next; # skip if yes, we are only looking for unique sequences in file2
        }
        else
        {
            my @keys = grep { $file2_hash_ref->{$_} eq $value } keys %$file2_hash_ref;
            $id = $keys[0];
            $seq = $value;
            &print_to_unique_file2($id, $seq);
        }

    }

}


sub print_to_common
# Print entries common to both files
{
    my $id    = shift;
    my $seq   = shift;

    my $common_file_name = "common_to_both_files.fasta";
    open( my $common_fh, ">>", $common_file_name ) or die "Cannot create output file for common sequences! $!\n";
    print {$common_fh} "$id\n";
    print {$common_fh} "$seq\n";
}

sub print_to_unique_file1
# Print entries unique to file1
{
    my $id   = shift;
    my $seq  = shift;

    my $unique_to_file1_name = join("", "unique_to_", $file1);
    open( my $unique_to_file1_fh, ">>", $unique_to_file1_name ) or die "Cannot create output file for unique to file1 sequences! $!\n";
    print {$unique_to_file1_fh} "$id\n";
    print {$unique_to_file1_fh} "$seq\n";
}

sub print_to_unique_file2
# Print entries unique to file2
{
    my $id   = shift;
    my $seq  = shift;
    my $unique_to_file2_name = join("", "unique_to_", $file2);
    open( my $unique_to_file2_fh, ">>", $unique_to_file2_name ) or die "Cannot create output file for unique to file1 sequences! $!\n";
    print {$unique_to_file2_fh} "$id\n";
    print {$unique_to_file2_fh} "$seq\n";
}
	#! /usr/bin/perl

	# Match 2 fasta files by id or by sequence
	# print out common and unique entries

	use strict; use warnings;
	use Getopt::Long;
	use List::MoreUtils qw(any);

	# Variables available via command line options
	my $file1;
	my $file2;
	my $match_by_id = 0;
	my $match_by_seq = 0;
	my $help = 0;

	GetOptions ("file1=s" => \$file1, # first fasta file
	"file2=s" => \$file2, # second fasta file
	"match_by_id" => \$match_by_id, # match fasta sequences by id
	"match_by_seq" => \$match_by_seq,
	"help\|h" => \$help
	) # match fasta sequences by id
	or die &print_help;

	if ( $help == 1 )
	{
	&print_help;
	}

	##############################################################################################################
	# MAIN BODY

	# Hashes to store fasta entries from both files
	my %file1_hash = ();
	my %file2_hash = ();

	# open fasta file
	open( my $fh1, "<", $file1 ) or die ("Cannot open one of the fasta files use --help to see help message: $!\n");
	open( my $fh2, "<", $file2 ) or die ("Cannot open one of the fasta files use --help to see help message: $!\n");

	# Move fasta sequences from the first file to a hash
	my $fasta1_id;
	my $seq1;
	while( <$fh1> )
	{
	chomp;
	if ( /^>.+/ )
	{
	$fasta1_id = $_;
	$file1_hash{$fasta1_id} = ();
	}
	elsif ( /\A[ATCGN]+\z/i )
	{
	$seq1 = $_;
	$file1_hash{$fasta1_id} = $seq1;
	}
	else
	{
	die ( "Malformed identifier or sequence contains forbidden characters: $_\n" );
	}
	}

	# Move fasta sequences from the second file to a hash
	my $fasta2_id;
	my $seq2;
	while( <$fh2> )
	{
	chomp;
	if ( /^>.+/ )
	{
	$fasta2_id = $_;
	$file2_hash{$fasta2_id} = ();
	}
	elsif ( /\A^[ATCGN]+\z/i )
	{
	$seq2 = $_;
	$file2_hash{$fasta2_id} = $seq2;
	}
	else
	{
	die ( "Malformed identifier or sequence contains forbidden characters: $_\n" );
	}
	}

	if ( $match_by_id == 1 && $match_by_seq == 0 )
	{
	&run_id_match(\%file1_hash, \%file2_hash);
	}
	elsif ( $match_by_id == 0 && $match_by_seq == 1 )
	{
	&run_seq_match(\%file1_hash, \%file2_hash);
	}
	else
	{
	print "You have to specify one of the options, either --match_by_id or --match_by_seq\n";
	&print_help;
	}



	#################################################################################################################
	# SUBROUTINES
	sub print_help
	{
	print "\n";
	print "USAGE: perl match_fasta_files --file1 <file1.fasta> --file2 <file2.fasta> --match_by_seq --match_by_id\n";
	print "NOTE: --match_by_seq and --match_by_id flags are mutually exclusive\n";
	print "OUTPUT: This script will compare two fasta files either by id or by sequence (these two options are mutually exclusive) and output 3 fasta files:\n";
	print "unique_to_file1.fasta - entries unique to file1\n";
	print "unique_to_file2.fasta - entries unique to file2\n";
	print "common.fasta - present in both files\n";
	print "HELP: use -h OR --help to print this message and exit\n";
	print "\n";
	exit;
	}

	sub run_id_match
	# Iterate over both fasta hashes and
	# and find entries (by ids only, sequences can be different) unique to first and second
	# fasta file as well as entries common between files
	{
	my $file1_hash_ref = shift;
	my $file2_hash_ref = shift;
	my $id;
	my $seq;
	for ( keys %$file1_hash_ref )
	{
	if ( exists $file2_hash_ref->{$_} )
	{
	my $id = $_;
	my $seq = $file1_hash_ref->{$_};
	&print_to_common($id, $seq);
	}
	else
	{
	my $id = $_;
	my $seq = $file1_hash_ref->{$_};
	&print_to_unique_file1($id, $seq);
	}
	}

	for ( keys %$file2_hash_ref )
	{
	if ( exists $file1_hash_ref->{$_} )
	{
	next;
	}
	else
	{
	my $id = $_;
	my $seq = $file2_hash_ref->{$_};
	&print_to_unique_file2($id, $seq);
	}
	}

	}


	sub run_seq_match
	# Iterate over both fasta hashes and
	# and find entries (by sequences only ids can be different) unique to first and second
	# fasta file as well as entries common between files
	# Common sequence entries will written to file will be taken from file1, only a single
	# sequence occurence will printed out, we don't care which one.
	{
	my $file1_hash_ref = shift;
	my $file2_hash_ref = shift;
	my $id;
	my $seq;
	for my $value ( values %$file1_hash_ref )
	{
	if ( any {$_ eq $value } values %$file2_hash_ref ) # check if any sequences from the file1 are also seen in file2
	{
	my @keys = grep { $file1_hash_ref->{$_} eq $value } keys %$file1_hash_ref;
	$id = $keys[0];
	$seq = $value;
	&print_to_common($id, $seq);
	}
	else
	{
	my @keys = grep { $file1_hash_ref->{$_} eq $value } keys %$file1_hash_ref;
	$id = $keys[0];
	$seq = $value;
	&print_to_unique_file1($id, $seq);
	}

	}

	for my $value ( values %$file2_hash_ref )
	{
	if ( any {$_ eq $value } values %$file1_hash_ref ) # check if any sequences from the file2 are seen in file1
	{
	next; # skip if yes, we are only looking for unique sequences in file2
	}
	else
	{
	my @keys = grep { $file2_hash_ref->{$_} eq $value } keys %$file2_hash_ref;
	$id = $keys[0];
	$seq = $value;
	&print_to_unique_file2($id, $seq);
	}

	}

	}


	sub print_to_common
	# Print entries common to both files
	{
	my $id = shift;
	my $seq = shift;

	my $common_file_name = "common_to_both_files.fasta";
	open( my $common_fh, ">>", $common_file_name ) or die "Cannot create output file for common sequences! $!\n";
	print {$common_fh} "$id\n";
	print {$common_fh} "$seq\n";
	}

	sub print_to_unique_file1
	# Print entries unique to file1
	{
	my $id = shift;
	my $seq = shift;

	my $unique_to_file1_name = join("", "unique_to_", $file1);
	open( my $unique_to_file1_fh, ">>", $unique_to_file1_name ) or die "Cannot create output file for unique to file1 sequences! $!\n";
	print {$unique_to_file1_fh} "$id\n";
	print {$unique_to_file1_fh} "$seq\n";
	}

	sub print_to_unique_file2
	# Print entries unique to file2
	{
	my $id = shift;
	my $seq = shift;
	my $unique_to_file2_name = join("", "unique_to_", $file2);
	open( my $unique_to_file2_fh, ">>", $unique_to_file2_name ) or die "Cannot create output file for unique to file1 sequences! $!\n";
	print {$unique_to_file2_fh} "$id\n";
	print {$unique_to_file2_fh} "$seq\n";
	}