Skip to content

Instantly share code, notes, and snippets.

@meren
Last active July 21, 2020 12:58
Show Gist options
  • Save meren/82a044bee86b8c5568669695c12575cc to your computer and use it in GitHub Desktop.
Save meren/82a044bee86b8c5568669695c12575cc to your computer and use it in GitHub Desktop.
An update to Antti Karkman's GFF parser. Find the latest version here: https://github.com/karkman/gff_parser
#!/usr/bin/env python
## Antti Karkman
## University of Gothenburg
## antti.karkman@gmail.com
## 2017
import gffutils
import argparse
from collections import Counter
#parse the arguments
parser = argparse.ArgumentParser(description="""Parse Prokka annotated genome/metagenome to add external gene calls and functions to anvi'o.
Input annotation in GFF3 format, outputs are tab-delimited text files, one for gene calls and one for annotations""")
parser.add_argument('gff_file', metavar='GFF3', help='Annotation file from Prokka in GFF3 format')
parser.add_argument('--gene-calls', default='gene_calls.txt', help='Output: External gene calls (Default: gene_calls.txt)')
parser.add_argument('--annotation', default='gene_annot.txt', help='Output: Functional annotation for external gene calls (Default: gene_annot.txt)')
parser.add_argument('--process-all', default=False, action="store_true", help="Prodigal returns anything it finds, including tRNAs and\
other genetic structures that are not nessecarily translatable, and can cause downstream issues especially if you would like to\
use your annotations in contigs databases for pangenomic analyses. As a precation this script only recovers open reading frames\
identifie dby Prodigal. Using this flag, you can import everything.")
args = parser.parse_args()
#Input file and output files
GFF = args.gff_file
OUT_CDS = open(args.gene_calls, 'w')
OUT_ANNO = open(args.annotation, 'w')
#load in the GFF3 file
db = gffutils.create_db(GFF, ':memory:')
#Print headers for anvi'o
OUT_CDS.write("gene_callers_id\tcontig\tstart\tstop\tdirection\tpartial\tcall_type\tsource\tversion\n")
OUT_ANNO.write("gene_callers_id\tsource\taccession\tfunction\te_value\n")
#running gene ID and a trumped-up e-value for the gene calls.
gene_id = 1
e_value = "0"
# keping track of things we haven't processed
feature_types = Counter()
call_types = Counter()
total_num_features = 0
features_missing_product_or_note = 0
#parse the GFF3 file and write results to output files
for feature in db.all_features():
total_num_features += 1
# determine source
source, version = feature.source.split(':')
# move on if not Prodigal, unless the user wants it badly
if not args.process_all:
if source != 'Prodigal':
continue
start = feature.start - 1
stop = feature.stop
feature_types[feature.featuretype] += 1
if feature.featuretype == 'CDS':
call_type = 1
call_types['CDS'] += 1
elif 'RNA' in feature.featuretype:
call_type = 2
call_types['RNA'] += 1
else:
call_type = 3
call_types['unknown'] += 1
if (float(start - stop)/float(3)).is_integer() == True:
partial = str(0)
else:
partial = str(1)
try:
gene_acc = feature.attributes['gene'][0]
except KeyError:
gene_acc = ""
# if a feature is missing both, move on.
if 'product' not in feature.attributes.keys() and 'note' not in feature.attributes.keys():
features_missing_product_or_note += 1
continue
try:
product = feature.attributes['product'][0]
except KeyError:
product = feature.attributes['note'][0]
# skip if hypotethical proiten:
if product == 'hypothetical protein':
product = ""
gene_acc = ""
# determine direction
if feature.featuretype=='repeat_region':
direction='f'
else:
if feature.strand=='+':
direction='f'
else:
direction='r'
OUT_CDS.write('%d\t%s\t%d\t%d\t%s\t%s\t%d\t%s\t%s\n' % (gene_id, feature.seqid, start, stop, direction, partial, call_type, source, version))
OUT_ANNO.write('%d\t%s:%s\t%s\t%s\t%s\n' % (gene_id, 'Prokka', source, gene_acc, product, e_value))
gene_id = gene_id + 1
print(f"Done. All {total_num_features} have been processed succesfully. There were {call_types['CDS']} coding "
f"sequences, {call_types['RNA']} RNAs, and {call_types['unknown']} unknown features.")
if features_missing_product_or_note:
print()
print(f"Please note that we discarded {features_missing_product_or_note} features described in this file "
f"since they did not contain any products or notes :/")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment