Skip to content

Instantly share code, notes, and snippets.

@cbare
Created April 13, 2012 22:11
Show Gist options
  • Save cbare/2380465 to your computer and use it in GitHub Desktop.
Save cbare/2380465 to your computer and use it in GitHub Desktop.
Extract features from GFF
############################################################
## A hacky script to extract features from a GFF3 file ##
## and output them in a format compatible with the ##
## Gaggle genome browser. ##
## ##
## J. Christopher Bare, March 2012 ##
## ##
############################################################
# I got the gene annotations in GFF format from here:
# http://tritrypdb.org/common/downloads/Current_Release/Lbraziliensis/gff/
# http://tritrypdb.org/common/downloads/Current_Release/Lmajor/gff/
# http://tritrypdb.org/common/downloads/Current_Release/Linfantum/gff/
# Description of the GFF format:
# http://gmod.org/wiki/GFF
import sys
import re
filename = sys.argv[1]
counter = {}
features = {}
class Feature:
def __init__(self,sequence,strand,start,end,name,common_name,type,parent_id):
self.sequence=sequence
self.strand=strand
self.start=start
self.end=end
self.name=name
self.common_name=common_name
self.type=type
self.parent_id=parent_id
with open(filename, 'r') as f:
for line in f:
if line.startswith("##FASTA"):
break
if line.startswith('#'):
continue
# process features in gff format
fields = line.split("\t")
type = fields[2]
# count up how many of each type of feature we have
if counter.has_key(type):
counter[type] +=1
else:
counter[type] = 1
# don't need these, throw 'em out
if type=="supercontig":
continue
# parse out the name of the chromosomes
m = re.match("GeneDB\|(.*)",fields[0])
if m:
seq = m.group(1)
else:
raise Exception('Unrecognized sequence format')
start = fields[3]
end = fields[4]
strand = fields[6]
# People always screw up the attributes field which is suppose to look like this:
# key1=value1;key2=value2;key3=value3a,value3b,value3c
attrs_field = fields[8]
# clean up attr_field:
# if there's a semi-colon that doesn't look like it belongs there, remove it.
if ";," in attrs_field:
attrs_field = attrs_field.replace(";,",",")
#print attrs_field
attributes = { k:v for (k,v) in [kvp.split("=") for kvp in attrs_field.split(";")] }
id = attributes['ID']
name = attributes.get('locus_tag',attributes.get('Name',id))
parent_id = attributes.get('Parent')
# grab all the features in case we want to do some post-processing
# features[id] = Feature(seq,strand,start,end,name,None,type, parent_id)
# here we're just interested in the "gene" features - not the redundant stuff
# like rRNA, snRNA, tRNA, transcript, mRNA, CDS, which all have a corresponding
# parent feature which is of type "gene".
# We also ignore exon features, 'cause our gene model is just a block with a start
# and end.
if type=="gene":
if ".tRNA" in name:
type = "trna"
if ".rRNA" in name:
type = "rrna"
if ".ncRNA" in name:
type = "ncrna"
if ".snRNA" in name:
type = "snrna"
if ".snoRNA" in name:
type = "snorna"
if
print "\t".join( [seq,strand,start,end,name,"",type] )
# for key,value in counter.items():
# print key + " => " + str(value)
#
# print "features: " + str(len(features))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment