Skip to content

Instantly share code, notes, and snippets.

@slavailn
Created April 26, 2016 06:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save slavailn/0428a746b1cbe0cc8cd3a67c2746321b to your computer and use it in GitHub Desktop.
Save slavailn/0428a746b1cbe0cc8cd3a67c2746321b to your computer and use it in GitHub Desktop.
#! /usr/bin/perl
# Match 2 fasta files by id or by sequence
# print out common and unique entries
use strict; use warnings;
use Getopt::Long;
use List::MoreUtils qw(any);
# Variables available via command line options
my $file1;
my $file2;
my $match_by_id = 0;
my $match_by_seq = 0;
my $help = 0;
GetOptions ("file1=s" => \$file1, # first fasta file
"file2=s" => \$file2, # second fasta file
"match_by_id" => \$match_by_id, # match fasta sequences by id
"match_by_seq" => \$match_by_seq,
"help|h" => \$help
) # match fasta sequences by id
or die &print_help;
if ( $help == 1 )
{
&print_help;
}
##############################################################################################################
# MAIN BODY
# Hashes to store fasta entries from both files
my %file1_hash = ();
my %file2_hash = ();
# open fasta file
open( my $fh1, "<", $file1 ) or die ("Cannot open one of the fasta files use --help to see help message: $!\n");
open( my $fh2, "<", $file2 ) or die ("Cannot open one of the fasta files use --help to see help message: $!\n");
# Move fasta sequences from the first file to a hash
my $fasta1_id;
my $seq1;
while( <$fh1> )
{
chomp;
if ( /^>.+/ )
{
$fasta1_id = $_;
$file1_hash{$fasta1_id} = ();
}
elsif ( /\A[ATCGN]+\z/i )
{
$seq1 = $_;
$file1_hash{$fasta1_id} = $seq1;
}
else
{
die ( "Malformed identifier or sequence contains forbidden characters: $_\n" );
}
}
# Move fasta sequences from the second file to a hash
my $fasta2_id;
my $seq2;
while( <$fh2> )
{
chomp;
if ( /^>.+/ )
{
$fasta2_id = $_;
$file2_hash{$fasta2_id} = ();
}
elsif ( /\A^[ATCGN]+\z/i )
{
$seq2 = $_;
$file2_hash{$fasta2_id} = $seq2;
}
else
{
die ( "Malformed identifier or sequence contains forbidden characters: $_\n" );
}
}
if ( $match_by_id == 1 && $match_by_seq == 0 )
{
&run_id_match(\%file1_hash, \%file2_hash);
}
elsif ( $match_by_id == 0 && $match_by_seq == 1 )
{
&run_seq_match(\%file1_hash, \%file2_hash);
}
else
{
print "You have to specify one of the options, either --match_by_id or --match_by_seq\n";
&print_help;
}
#################################################################################################################
# SUBROUTINES
sub print_help
{
print "\n";
print "USAGE: perl match_fasta_files --file1 <file1.fasta> --file2 <file2.fasta> --match_by_seq --match_by_id\n";
print "NOTE: --match_by_seq and --match_by_id flags are mutually exclusive\n";
print "OUTPUT: This script will compare two fasta files either by id or by sequence (these two options are mutually exclusive) and output 3 fasta files:\n";
print "unique_to_file1.fasta - entries unique to file1\n";
print "unique_to_file2.fasta - entries unique to file2\n";
print "common.fasta - present in both files\n";
print "HELP: use -h OR --help to print this message and exit\n";
print "\n";
exit;
}
sub run_id_match
# Iterate over both fasta hashes and
# and find entries (by ids only, sequences can be different) unique to first and second
# fasta file as well as entries common between files
{
my $file1_hash_ref = shift;
my $file2_hash_ref = shift;
my $id;
my $seq;
for ( keys %$file1_hash_ref )
{
if ( exists $file2_hash_ref->{$_} )
{
my $id = $_;
my $seq = $file1_hash_ref->{$_};
&print_to_common($id, $seq);
}
else
{
my $id = $_;
my $seq = $file1_hash_ref->{$_};
&print_to_unique_file1($id, $seq);
}
}
for ( keys %$file2_hash_ref )
{
if ( exists $file1_hash_ref->{$_} )
{
next;
}
else
{
my $id = $_;
my $seq = $file2_hash_ref->{$_};
&print_to_unique_file2($id, $seq);
}
}
}
sub run_seq_match
# Iterate over both fasta hashes and
# and find entries (by sequences only ids can be different) unique to first and second
# fasta file as well as entries common between files
# Common sequence entries will written to file will be taken from file1, only a single
# sequence occurence will printed out, we don't care which one.
{
my $file1_hash_ref = shift;
my $file2_hash_ref = shift;
my $id;
my $seq;
for my $value ( values %$file1_hash_ref )
{
if ( any {$_ eq $value } values %$file2_hash_ref ) # check if any sequences from the file1 are also seen in file2
{
my @keys = grep { $file1_hash_ref->{$_} eq $value } keys %$file1_hash_ref;
$id = $keys[0];
$seq = $value;
&print_to_common($id, $seq);
}
else
{
my @keys = grep { $file1_hash_ref->{$_} eq $value } keys %$file1_hash_ref;
$id = $keys[0];
$seq = $value;
&print_to_unique_file1($id, $seq);
}
}
for my $value ( values %$file2_hash_ref )
{
if ( any {$_ eq $value } values %$file1_hash_ref ) # check if any sequences from the file2 are seen in file1
{
next; # skip if yes, we are only looking for unique sequences in file2
}
else
{
my @keys = grep { $file2_hash_ref->{$_} eq $value } keys %$file2_hash_ref;
$id = $keys[0];
$seq = $value;
&print_to_unique_file2($id, $seq);
}
}
}
sub print_to_common
# Print entries common to both files
{
my $id = shift;
my $seq = shift;
my $common_file_name = "common_to_both_files.fasta";
open( my $common_fh, ">>", $common_file_name ) or die "Cannot create output file for common sequences! $!\n";
print {$common_fh} "$id\n";
print {$common_fh} "$seq\n";
}
sub print_to_unique_file1
# Print entries unique to file1
{
my $id = shift;
my $seq = shift;
my $unique_to_file1_name = join("", "unique_to_", $file1);
open( my $unique_to_file1_fh, ">>", $unique_to_file1_name ) or die "Cannot create output file for unique to file1 sequences! $!\n";
print {$unique_to_file1_fh} "$id\n";
print {$unique_to_file1_fh} "$seq\n";
}
sub print_to_unique_file2
# Print entries unique to file2
{
my $id = shift;
my $seq = shift;
my $unique_to_file2_name = join("", "unique_to_", $file2);
open( my $unique_to_file2_fh, ">>", $unique_to_file2_name ) or die "Cannot create output file for unique to file1 sequences! $!\n";
print {$unique_to_file2_fh} "$id\n";
print {$unique_to_file2_fh} "$seq\n";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment