Created
April 26, 2016 06:30
-
-
Save slavailn/0428a746b1cbe0cc8cd3a67c2746321b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/perl | |
# Match 2 fasta files by id or by sequence | |
# print out common and unique entries | |
use strict; use warnings; | |
use Getopt::Long; | |
use List::MoreUtils qw(any); | |
# Variables available via command line options | |
my $file1; | |
my $file2; | |
my $match_by_id = 0; | |
my $match_by_seq = 0; | |
my $help = 0; | |
GetOptions ("file1=s" => \$file1, # first fasta file | |
"file2=s" => \$file2, # second fasta file | |
"match_by_id" => \$match_by_id, # match fasta sequences by id | |
"match_by_seq" => \$match_by_seq, | |
"help|h" => \$help | |
) # match fasta sequences by id | |
or die &print_help; | |
if ( $help == 1 ) | |
{ | |
&print_help; | |
} | |
############################################################################################################## | |
# MAIN BODY | |
# Hashes to store fasta entries from both files | |
my %file1_hash = (); | |
my %file2_hash = (); | |
# open fasta file | |
open( my $fh1, "<", $file1 ) or die ("Cannot open one of the fasta files use --help to see help message: $!\n"); | |
open( my $fh2, "<", $file2 ) or die ("Cannot open one of the fasta files use --help to see help message: $!\n"); | |
# Move fasta sequences from the first file to a hash | |
my $fasta1_id; | |
my $seq1; | |
while( <$fh1> ) | |
{ | |
chomp; | |
if ( /^>.+/ ) | |
{ | |
$fasta1_id = $_; | |
$file1_hash{$fasta1_id} = (); | |
} | |
elsif ( /\A[ATCGN]+\z/i ) | |
{ | |
$seq1 = $_; | |
$file1_hash{$fasta1_id} = $seq1; | |
} | |
else | |
{ | |
die ( "Malformed identifier or sequence contains forbidden characters: $_\n" ); | |
} | |
} | |
# Move fasta sequences from the second file to a hash | |
my $fasta2_id; | |
my $seq2; | |
while( <$fh2> ) | |
{ | |
chomp; | |
if ( /^>.+/ ) | |
{ | |
$fasta2_id = $_; | |
$file2_hash{$fasta2_id} = (); | |
} | |
elsif ( /\A^[ATCGN]+\z/i ) | |
{ | |
$seq2 = $_; | |
$file2_hash{$fasta2_id} = $seq2; | |
} | |
else | |
{ | |
die ( "Malformed identifier or sequence contains forbidden characters: $_\n" ); | |
} | |
} | |
if ( $match_by_id == 1 && $match_by_seq == 0 ) | |
{ | |
&run_id_match(\%file1_hash, \%file2_hash); | |
} | |
elsif ( $match_by_id == 0 && $match_by_seq == 1 ) | |
{ | |
&run_seq_match(\%file1_hash, \%file2_hash); | |
} | |
else | |
{ | |
print "You have to specify one of the options, either --match_by_id or --match_by_seq\n"; | |
&print_help; | |
} | |
################################################################################################################# | |
# SUBROUTINES | |
sub print_help | |
{ | |
print "\n"; | |
print "USAGE: perl match_fasta_files --file1 <file1.fasta> --file2 <file2.fasta> --match_by_seq --match_by_id\n"; | |
print "NOTE: --match_by_seq and --match_by_id flags are mutually exclusive\n"; | |
print "OUTPUT: This script will compare two fasta files either by id or by sequence (these two options are mutually exclusive) and output 3 fasta files:\n"; | |
print "unique_to_file1.fasta - entries unique to file1\n"; | |
print "unique_to_file2.fasta - entries unique to file2\n"; | |
print "common.fasta - present in both files\n"; | |
print "HELP: use -h OR --help to print this message and exit\n"; | |
print "\n"; | |
exit; | |
} | |
sub run_id_match | |
# Iterate over both fasta hashes and | |
# and find entries (by ids only, sequences can be different) unique to first and second | |
# fasta file as well as entries common between files | |
{ | |
my $file1_hash_ref = shift; | |
my $file2_hash_ref = shift; | |
my $id; | |
my $seq; | |
for ( keys %$file1_hash_ref ) | |
{ | |
if ( exists $file2_hash_ref->{$_} ) | |
{ | |
my $id = $_; | |
my $seq = $file1_hash_ref->{$_}; | |
&print_to_common($id, $seq); | |
} | |
else | |
{ | |
my $id = $_; | |
my $seq = $file1_hash_ref->{$_}; | |
&print_to_unique_file1($id, $seq); | |
} | |
} | |
for ( keys %$file2_hash_ref ) | |
{ | |
if ( exists $file1_hash_ref->{$_} ) | |
{ | |
next; | |
} | |
else | |
{ | |
my $id = $_; | |
my $seq = $file2_hash_ref->{$_}; | |
&print_to_unique_file2($id, $seq); | |
} | |
} | |
} | |
sub run_seq_match | |
# Iterate over both fasta hashes and | |
# and find entries (by sequences only ids can be different) unique to first and second | |
# fasta file as well as entries common between files | |
# Common sequence entries will written to file will be taken from file1, only a single | |
# sequence occurence will printed out, we don't care which one. | |
{ | |
my $file1_hash_ref = shift; | |
my $file2_hash_ref = shift; | |
my $id; | |
my $seq; | |
for my $value ( values %$file1_hash_ref ) | |
{ | |
if ( any {$_ eq $value } values %$file2_hash_ref ) # check if any sequences from the file1 are also seen in file2 | |
{ | |
my @keys = grep { $file1_hash_ref->{$_} eq $value } keys %$file1_hash_ref; | |
$id = $keys[0]; | |
$seq = $value; | |
&print_to_common($id, $seq); | |
} | |
else | |
{ | |
my @keys = grep { $file1_hash_ref->{$_} eq $value } keys %$file1_hash_ref; | |
$id = $keys[0]; | |
$seq = $value; | |
&print_to_unique_file1($id, $seq); | |
} | |
} | |
for my $value ( values %$file2_hash_ref ) | |
{ | |
if ( any {$_ eq $value } values %$file1_hash_ref ) # check if any sequences from the file2 are seen in file1 | |
{ | |
next; # skip if yes, we are only looking for unique sequences in file2 | |
} | |
else | |
{ | |
my @keys = grep { $file2_hash_ref->{$_} eq $value } keys %$file2_hash_ref; | |
$id = $keys[0]; | |
$seq = $value; | |
&print_to_unique_file2($id, $seq); | |
} | |
} | |
} | |
sub print_to_common | |
# Print entries common to both files | |
{ | |
my $id = shift; | |
my $seq = shift; | |
my $common_file_name = "common_to_both_files.fasta"; | |
open( my $common_fh, ">>", $common_file_name ) or die "Cannot create output file for common sequences! $!\n"; | |
print {$common_fh} "$id\n"; | |
print {$common_fh} "$seq\n"; | |
} | |
sub print_to_unique_file1 | |
# Print entries unique to file1 | |
{ | |
my $id = shift; | |
my $seq = shift; | |
my $unique_to_file1_name = join("", "unique_to_", $file1); | |
open( my $unique_to_file1_fh, ">>", $unique_to_file1_name ) or die "Cannot create output file for unique to file1 sequences! $!\n"; | |
print {$unique_to_file1_fh} "$id\n"; | |
print {$unique_to_file1_fh} "$seq\n"; | |
} | |
sub print_to_unique_file2 | |
# Print entries unique to file2 | |
{ | |
my $id = shift; | |
my $seq = shift; | |
my $unique_to_file2_name = join("", "unique_to_", $file2); | |
open( my $unique_to_file2_fh, ">>", $unique_to_file2_name ) or die "Cannot create output file for unique to file1 sequences! $!\n"; | |
print {$unique_to_file2_fh} "$id\n"; | |
print {$unique_to_file2_fh} "$seq\n"; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment