Skip to content

Instantly share code, notes, and snippets.

@standage
Last active August 29, 2015 14:00
Show Gist options
  • Save standage/11377530 to your computer and use it in GitHub Desktop.
Save standage/11377530 to your computer and use it in GitHub Desktop.
Find overlapping genes in a GTF file
#!/usr/bin/env python
import re, sys
# vrlpr.py: find overlapping genes in a GTF file
# Usage: python vrlpr.py < genes.gtf > overlaps.txt
def overlap(range1, range2):
return range1[0] == range2[0] and range1[2] >= range2[1] and \
range1[1] <= range2[2]
genes = {}
for line in sys.stdin:
if line == "" or line.startswith("#"):
continue
line = line.rstrip()
fields = line.split("\t")
seqid = fields[0]
start = int(fields[3])
end = int(fields[4])
attributes = fields[8]
matches = re.match('gene_id "([^"]+)"', attributes)
if matches:
geneid = matches.group(1)
if geneid not in genes:
genes[geneid] = [seqid, start, end]
else:
assert seqid == genes[geneid][0]
if start < genes[geneid][1]:
genes[geneid][1] = start
if end > genes[geneid][2]:
genes[geneid][2] = end
for i in range(len(genes)):
for j in range(i+1, len(genes)):
geneid_i = genes.keys()[i]
geneid_j = genes.keys()[j]
range_i = genes[geneid_i]
range_j = genes[geneid_j]
if overlap(range_i, range_j):
print "%s\t%s" % (geneid_i, geneid_j)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment