Last active
July 21, 2020 12:58
-
-
Save meren/82a044bee86b8c5568669695c12575cc to your computer and use it in GitHub Desktop.
An update to Antti Karkman's GFF parser. Find the latest version here: https://github.com/karkman/gff_parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
## Antti Karkman | |
## University of Gothenburg | |
## antti.karkman@gmail.com | |
## 2017 | |
import gffutils | |
import argparse | |
from collections import Counter | |
#parse the arguments | |
parser = argparse.ArgumentParser(description="""Parse Prokka annotated genome/metagenome to add external gene calls and functions to anvi'o. | |
Input annotation in GFF3 format, outputs are tab-delimited text files, one for gene calls and one for annotations""") | |
parser.add_argument('gff_file', metavar='GFF3', help='Annotation file from Prokka in GFF3 format') | |
parser.add_argument('--gene-calls', default='gene_calls.txt', help='Output: External gene calls (Default: gene_calls.txt)') | |
parser.add_argument('--annotation', default='gene_annot.txt', help='Output: Functional annotation for external gene calls (Default: gene_annot.txt)') | |
parser.add_argument('--process-all', default=False, action="store_true", help="Prodigal returns anything it finds, including tRNAs and\ | |
other genetic structures that are not nessecarily translatable, and can cause downstream issues especially if you would like to\ | |
use your annotations in contigs databases for pangenomic analyses. As a precation this script only recovers open reading frames\ | |
identifie dby Prodigal. Using this flag, you can import everything.") | |
args = parser.parse_args() | |
#Input file and output files | |
GFF = args.gff_file | |
OUT_CDS = open(args.gene_calls, 'w') | |
OUT_ANNO = open(args.annotation, 'w') | |
#load in the GFF3 file | |
db = gffutils.create_db(GFF, ':memory:') | |
#Print headers for anvi'o | |
OUT_CDS.write("gene_callers_id\tcontig\tstart\tstop\tdirection\tpartial\tcall_type\tsource\tversion\n") | |
OUT_ANNO.write("gene_callers_id\tsource\taccession\tfunction\te_value\n") | |
#running gene ID and a trumped-up e-value for the gene calls. | |
gene_id = 1 | |
e_value = "0" | |
# keping track of things we haven't processed | |
feature_types = Counter() | |
call_types = Counter() | |
total_num_features = 0 | |
features_missing_product_or_note = 0 | |
#parse the GFF3 file and write results to output files | |
for feature in db.all_features(): | |
total_num_features += 1 | |
# determine source | |
source, version = feature.source.split(':') | |
# move on if not Prodigal, unless the user wants it badly | |
if not args.process_all: | |
if source != 'Prodigal': | |
continue | |
start = feature.start - 1 | |
stop = feature.stop | |
feature_types[feature.featuretype] += 1 | |
if feature.featuretype == 'CDS': | |
call_type = 1 | |
call_types['CDS'] += 1 | |
elif 'RNA' in feature.featuretype: | |
call_type = 2 | |
call_types['RNA'] += 1 | |
else: | |
call_type = 3 | |
call_types['unknown'] += 1 | |
if (float(start - stop)/float(3)).is_integer() == True: | |
partial = str(0) | |
else: | |
partial = str(1) | |
try: | |
gene_acc = feature.attributes['gene'][0] | |
except KeyError: | |
gene_acc = "" | |
# if a feature is missing both, move on. | |
if 'product' not in feature.attributes.keys() and 'note' not in feature.attributes.keys(): | |
features_missing_product_or_note += 1 | |
continue | |
try: | |
product = feature.attributes['product'][0] | |
except KeyError: | |
product = feature.attributes['note'][0] | |
# skip if hypotethical proiten: | |
if product == 'hypothetical protein': | |
product = "" | |
gene_acc = "" | |
# determine direction | |
if feature.featuretype=='repeat_region': | |
direction='f' | |
else: | |
if feature.strand=='+': | |
direction='f' | |
else: | |
direction='r' | |
OUT_CDS.write('%d\t%s\t%d\t%d\t%s\t%s\t%d\t%s\t%s\n' % (gene_id, feature.seqid, start, stop, direction, partial, call_type, source, version)) | |
OUT_ANNO.write('%d\t%s:%s\t%s\t%s\t%s\n' % (gene_id, 'Prokka', source, gene_acc, product, e_value)) | |
gene_id = gene_id + 1 | |
print(f"Done. All {total_num_features} have been processed succesfully. There were {call_types['CDS']} coding " | |
f"sequences, {call_types['RNA']} RNAs, and {call_types['unknown']} unknown features.") | |
if features_missing_product_or_note: | |
print() | |
print(f"Please note that we discarded {features_missing_product_or_note} features described in this file " | |
f"since they did not contain any products or notes :/") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment