Skip to content

Instantly share code, notes, and snippets.

@pjbriggs
Created April 1, 2015 10:06
Show Gist options
  • Save pjbriggs/6229017c8dba994dd213 to your computer and use it in GitHub Desktop.
Save pjbriggs/6229017c8dba994dd213 to your computer and use it in GitHub Desktop.
Given a GTF file, check that for each exon entry the associated features 'CDS', 'stop_codon' and 'start_codon' are consistent and specify the same range of bases as # their "parent" exon
#!/bin/env python
#
# Check exon coverage in a GTF file
# Peter Briggs UoM, March 2015
#
# Purpose
# -------
# Given a GTF file, check that for each exon entry the
# associated features 'CDS', 'stop_codon' and 'start_codon'
# are consistent and specify the same range of bases as
# their "parent" exon. For example:
# DS499594 protein_coding exon 7780 9931 . - ...
# DS499594 protein_coding CDS 7780 9931 . - ...
# DS499594 protein_coding start_codon 9929 9931 . ...
#
# Assumptions
# -----------
# The program assumes:
# 1. An 'exon' entry always preceeds the entries for the
# associated features
# 2. Only 'protein_coding' exons are of interest
#
# Usage
# -----
# check_gtf_exon_coverage.py FILE.gtf
#
# Outputs
# -------
# For each exon encountered, report the following issues (if
# discovered):
#
# - Multiple features e.g. more than one CDS, stop_codon etc
# appears to be associated with the exon entry
# - Exons where the total range covered by the associated
# features is not the same as the extent of the exon
# - One or more features appear to be outside the exon
#
# If the exon and features are consistent then report nothing.
#
# Setup
# -----
# Needs the bcftbx library from
# https://github.com/fls-bioinformatics-core/genomics
# and the GTFFile and related modules from
# https://github.com/fls-bioinformatics-core/GFFUtils
#
# You can 'git clone' both modules and then add the resulting
# directories to your PYTHONPATH.
#
import sys
import optparse
import GTFFile
class GTFFeature:
# Class for storing information on a generic GTF
# feature
#
# Set the 'feature' property after instantiation
# e.g. f.feature = 'CDS'
def __init__(self,start,end,strand):
self.feature = 'feature'
self.start = start
self.end = end
self.strand = strand
def contains(self,start,end):
# Return True if feature completely contains
# the region specified by 'start' and 'end'
return self.start <= start and self.end >= end
def __repr__(self):
return "%s: %d-%d %s" % (self.feature.upper(),
self.start,
self.end,
self.strand)
class Exon(GTFFeature):
# Class for storing information on an exon
#
# This is a sublass of the GTFFeature class
def __init__(self,*args):
GTFFeature.__init__(self,*args)
self.feature = 'exon'
def check_exon(exon,exon_line,features):
# Perform consistency checks on an exon given a list
# of associated features
#
# Arguments:
# - exon : 'Exon' class instance
# - exon_line: line number in the input GTF where
# the exon is declared
# - features: list of 'GTFFeature' instances for
# each of the associated features
#
# Performs checks and reports to stdout if there are
# any issues.
#
bad_entry = False
# Check there are features
if len(features) == 0:
print "WARNING no features for exon"
bad_entry = True
# Check the features are all inside the exon
for ft in features:
if not exon.contains(ft.start,ft.end):
print "WARNING %s is not inside preceeding exon" % ft.feature
bad_entry = True
# Check for duplicated features
for ft_type in ('CDS','start_codon','stop_codon'):
count = 0
for ft in features:
if ft.feature == ft_type: count += 1
if count > 1:
print "WARNING multiple %ss" % ft_type
bad_entry = True
# Check that the features cover the exon
# Get min and max positions
min_pos = None
max_pos = None
for ft in features:
if min_pos is None:
min_pos = ft.start
if max_pos is None:
max_pos = ft.end
min_pos = min(min_pos,ft.start)
max_pos = max(max_pos,ft.end)
if min_pos != exon.start or max_pos != exon.end:
print "WARNING features don't cover the preceeding exon"
bad_entry = True
# Report "bad" entry
if bad_entry:
print "%s (L%d)" % (exon,exon_line)
if min_pos is not None and max_pos is not None:
print "Features cover %d-%d" % (min_pos,max_pos)
if len(features):
print "List of associated features:"
for ft in features:
print "- %s" % ft
print "-"*60
if __name__ == '__main__':
# Handle command line
p = optparse.OptionParser(usage="%prog GTFFILE",
description="Check CDS and start/stop_exon entries "
"span exon entries")
opts,args = p.parse_args()
if len(args) != 1:
p.error("Need to supply GTF file")
# Process GTF file
exon = None
exon_line = None
features = []
for line in GTFFile.GTFIterator(args[0]):
# For each line in file:
# - when a new exon is encountered
# * process any preceeding exon data and report, then discard
# * store the new exon data
# - for other features, add these to the list of features
# associated with that exon
#print str(line)
if line['source'] != 'protein_coding':
continue
if line['feature'] in ('CDS','start_codon','stop_codon'):
if exon is None:
raise Exception("No exon defined!")
ft = GTFFeature(line['start'],line['end'],line['strand'])
ft.feature = line['feature']
features.append(ft)
#print "- %s: %d-%d %s" % (f.feature,f.start,f.end,f.strand)
if line['feature'] == 'exon':
if exon is not None:
check_exon(exon,exon_line,features)
# Reset for next exon
exon = Exon(line['start'],line['end'],line['strand'])
exon_line = line.lineno()
features = []
#print exon
# Check last exon
check_exon(exon,exon_line,features)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment