Skip to content

Instantly share code, notes, and snippets.

@standage
Last active June 8, 2018 15:15
Show Gist options
  • Save standage/5850270 to your computer and use it in GitHub Desktop.
Save standage/5850270 to your computer and use it in GitHub Desktop.
NCBI provides specifications for several Fasta defline identifier formats/conventions at ftp://ftp.ncbi.nih.gov/blast/documents/formatdb.html. This script will automatically detect the convention used sequence-by-sequence, and convert all deflines to the requested format.
#!/usr/bin/env perl
# Copyright (c) 2012-2013, Daniel S. Standage <daniel.standage@gmail.com>
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
use strict;
use Getopt::Long;
# Program usage statement
sub print_usage
{
my $OUT = shift(@_);
print $OUT "Usage: $0 [options] sequence.fa
Options:
-h|--help print this help message and exit
--outfile print sequences with converted deflines to the specified file;
default is the terminal (stdout)
--outfmt defline format to use when printing validated sequences; valid
options include the following: 'idonly', 'General', and
'Local'; default is 'idonly'
--db the 'General' defline format includes a database name in the
second field; specify the value to be used when printing
validated sequences in 'General' format
";
}
# NCBI BLAST provides the reference list of valid defline patterns: see "FASTA
# Defline Format" at ftp://ftp.ncbi.nih.gov/blast/documents/formatdb.html.
# We support a subset of these defline patterns.
my $deflines =
{
'gb\|([^\|]+)\|(\S*)' => "GenBank",
'emb\|([^\|]+)\|(\S*)' => "EMBL",
'dbj\|([^\|]+)\|(\S*)' => "DDBJ",
'sp\|([^\|]+)\|(\S*)' => "SWISS-PROT",
'pdb\|([^\|]+)\|(\S*)' => "PDB",
'gnl\|[^\|]+\|(\S+)' => "General",
'ref\|([^\|]+)\|(\S*)' => "RefSeq",
'lcl\|(\S+)' => "Local",
'([^\|\s]+)' => "idonly",
};
my $defline_outformats =
{
"General" => ">gnl|%s|%s\n",
"Local" => ">lcl|%s\n",
"idonly" => ">%s\n",
};
my $idsfound = {};
# Parse command line options
my $outfmt = "idonly";
my $db = "";
my $outfilename = "";
GetOptions
(
"h|help" => sub { print_usage(\*STDOUT); exit(0); },
"outfmt=s" => \$outfmt,
"db=s" => \$db,
"outfile=s" => \$outfilename,
);
# Create output stream
my $OUT = \*STDOUT;
if($outfilename ne "")
{
open($OUT, ">", $outfilename);
unless($OUT)
{
printf("error: unable to open output file '%s'\n", $outfilename);
exit(1);
}
}
# Verify output format
if(defined $defline_outformats->{$outfmt})
{
if($outfmt eq "General" and $db eq "")
{
printf(STDERR "error: must provide a db value when using output format ".
"'%s'\n", $outfmt);
print_usage(\*STDERR);
exit(1);
}
if($outfmt ne "General" and $db ne "")
{
printf(STDERR "warning: database '%s' ignored when using output format ".
"'%s'\n", $outfmt);
}
}
else
{
printf(STDERR "error: unsupported output format '%s'\n", $outfmt);
print_usage(\*STDERR);
exit(1);
}
# Verify input file
my $infile = shift(@ARGV);
unless($infile)
{
print(STDERR "error: must provide an input sequence file\n");
print_usage(\*STDERR);
exit(1);
}
open(my $IN, "<", $infile);
unless($IN)
{
printf(STDERR "error: could not open input sequence file '%s'\n", $infile);
exit(1);
}
# Validate deflines, and print if outfile is indicated
my $counts = {};
foreach my $format_name(keys(%$defline_outformats))
{
$counts->{ $format_name } = 0;
}
while(my $line = <$IN>)
{
chomp($line);
if($line =~ m/^>/)
{
my $gi;
my $idacc;
my $locus;
foreach my $defline_format(keys(%$deflines))
{
my $format_name = $deflines->{$defline_format};
if($line =~ m/^>$defline_format/)
{
$idacc = $1;
$locus = $2;
$counts->{ $format_name } += 1;
if($idsfound->{$idacc})
{
printf(STDERR "warning: found duplicated sequence ID '%s'\n", $idacc);
}
$idsfound->{$idacc} = 1;
last;
}
elsif($line =~ m/^>gi\|(\d+)\|$defline_format/)
{
$gi = $1;
$idacc = $2;
$locus = $3;
$counts->{ $format_name } += 1;
if($idsfound->{$gi})
{
printf(STDERR "warning: found duplicated sequence ID '%s'\n", $gi);
}
$idsfound->{$gi} = 1;
last;
}
}
if($idacc)
{
my $replace = $idacc;
$replace .= " locus=$locus" if($locus);
$replace = "$gi accession=$replace" if($gi);
$replace = "lcl|$replace" if($outfmt eq "Local");
$replace = "gnl|$db|$replace" if($outfmt eq "General");
$line =~ s/>\S+/>$replace/;
}
else
{
printf(STDERR "warning: could not validate defline format: '%s'\n",$line);
}
}
printf($OUT "%s\n", $line);
}
# Print report about validated deflines
while(my($format, $format_name) = each(%$deflines))
{
if($counts->{$format_name} > 0)
{
printf(STDERR "note: validated %d sequences with '%s' defline format\n",
$counts->{$format_name}, $format_name);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment