Skip to content

Instantly share code, notes, and snippets.

@alexpreynolds
Last active August 29, 2015 13:56
Show Gist options
  • Save alexpreynolds/8945214 to your computer and use it in GitHub Desktop.
Save alexpreynolds/8945214 to your computer and use it in GitHub Desktop.
Consolidating genomic elements by ID relationship
#!/usr/bin/env python
import sys, copy
def sampleInput():
return '''gain_765 chr15 9681969 9685418 chr15 9660912 9712719 loss_1136
gain_766 chr15 9706682 9852347 chr15 9660912 9712719 loss_1136
gain_766 chr15 9706682 9852347 chr15 9765125 9863990 loss_765
gain_780 chr20 9706682 9852347 chr20 9765125 9863990 loss_769
gain_760 chr15 9706682 9852347 chr15 9660912 9712719 loss_1137
gain_760 chr15 9706682 9852347 chr15 9765125 9863990 loss_763'''
class Range:
"""Defines a generic genomic range"""
def __init__(self, chr, start, stop):
self.chr = str(chr)
self.start = int(start)
self.stop = int(stop)
def __str__(self):
return "[" + str(self.chr) + ":" + str(self.start) + "-" + str(self.stop) + "]"
class Entity:
"""Defines an entity"""
def __init__(self, name, range):
self.name = name
self.range = range
def __str__(self):
return self.name + " " + str(self.range)
def main():
ranges = dict()
rels = dict()
for line in sampleInput().split('\n'):
elems = line.split('\t')
enA = Entity(elems[0], Range(elems[1], elems[2], elems[3]))
enB = Entity(elems[7], Range(elems[4], elems[5], elems[6]))
ranges[enA.name] = enA.range
ranges[enB.name] = enB.range
if len(rels) > 0:
if enA.name in rels:
rels[enA.name].add(enB.name)
if enB.name in rels:
rels[enB.name].add(enA.name)
else:
rels[enB.name] = set([enA.name])
elif enB.name in rels:
rels[enB.name].add(enA.name)
if enA.name in rels:
rels[enA.name].add(enB.name)
else:
rels[enA.name] = set([enB.name])
else:
rels[enA.name] = set([enB.name])
rels[enB.name] = set([enA.name])
else:
rels[enA.name] = set([enB.name])
rels[enB.name] = set([enA.name])
for k in rels:
s = rels[k]
ns = copy.deepcopy(s)
for e in s:
if e in rels:
os = rels[e]
ns = ns.union(os)
rels[k] = ns
nrels = dict()
for k in rels:
s = rels[k]
for e in s:
if e != k:
rels[e] = set()
elif len(rels[e]) > 0:
nrels[e] = s
for nk in nrels:
s = nrels[nk]
chr = ""
min = sys.maxint
max = -sys.maxint - 1
for r in s:
chr = ranges[r].chr
if ranges[r].start < min:
min = ranges[r].start
if ranges[r].stop > max:
max = ranges[r].stop
print '\t'.join([chr, str(min), str(max), str(s)])
if __name__ == "__main__":
main()
@alexpreynolds
Copy link
Author

$ ./consolidate.py | sort-bed -
chr15   9660912 9863990 set(['gain_765', 'loss_1136', 'gain_766', 'loss_765'])
chr15   9660912 9863990 set(['loss_1137', 'loss_763', 'gain_760'])
chr20   9706682 9863990 set(['loss_769', 'gain_780'])

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment