Skip to content

Instantly share code, notes, and snippets.

@daler
Created November 1, 2016 18:55
Show Gist options
  • Save daler/24227fed6f5fbcfbdd29646a653a9cca to your computer and use it in GitHub Desktop.
Save daler/24227fed6f5fbcfbdd29646a653a9cca to your computer and use it in GitHub Desktop.
toy examples to show how multiintersect works
"""
Run the script to see example output.
Modify the features in a, b, and c to see how they affect the output
pybedtools.contrib.plotting.BinaryHeatmap uses the "cluster" method,
but unclustered and naive merge versions are shown as well.
"""
import numpy as np
import pybedtools
def ascii_bed(x, lim):
"""
Prints a single-line ASCII representation of a BED file, replacing the
coordinates with the one-character name of each feature.
"""
first = False
y = x.sort()
a = np.zeros((lim,), dtype=str)
a[:] = '.'
for i in y:
a[i.start:i.stop] = i.name
return ''.join(list(a))
# Play around with the coordinates of a, b, and c.
#
# The code() function below expects the names to be "a", "b", or "c".
a = pybedtools.BedTool(
"""
chr1 6 12 a
chr1 14 20 a
chr1 22 30 a
""", from_string=True)
b = pybedtools.BedTool(
"""
chr1 14 32 b
""", from_string=True)
c = pybedtools.BedTool(
"""
chr1 10 13 c
""", from_string=True)
unclustered = pybedtools.BedTool().multi_intersect(
i=[a.fn, b.fn, c.fn])
clustered = pybedtools.BedTool().multi_intersect(
i=[a.fn, b.fn, c.fn], cluster=True)
def code(n):
"""
Get a unique code for each combination of characters so the combinatorial
binding can be represented by a single character.
e.g.,
0 = no feature
1 = a
3 = b
4 = a+b
5 = c
6 = c+a
8 = b+c
9 = a+b+c
"""
i = 0
if 'a' in n:
i += 1
if 'b' in n:
i += 3
if 'c' in n:
i += 5
return i
def cluster_rename(x):
d = {'1': 'a', '2':'b', '3':'c'}
letters = [d[i] for i in x[4].split(',')]
x.name = str(code(letters))
return x
def fix(x):
"""
After merging distinct, split out the letters into a code and set that code
as the feature's name.
"""
n = x.name.split(',')
x.name = str(code(n))
return x
# An alternative strategy: concatenate all files, merge them together keeping
# track of the sets of names, and convert to codes (see the code() function).
f = a.cat(b, c, postmerge=False).sort().merge(c=4, o='distinct')
print("a:\n{}".format(a))
print("b:\n{}".format(b))
print("c:\n{}".format(c))
print("d (multiintersect; cluster=False):\n{}".format(unclustered))
print("e (multiintersect; cluster=True):\n{}".format(clustered))
print("f (cat-and-merge):\n{}".format(f))
number_line = '01234567890123456789012345678901234567890'
lim = max(max(i.stop for i in x) for x in [a,b,c])
print(number_line)
print('ASCII version of a, b, c:\n' + '\n'.join([ascii_bed(i, lim) for i in [a, b, c]]))
print("\nclustered:")
print(ascii_bed(clustered.each(cluster_rename).saveas(), lim))
print("\nunclustered:")
print(ascii_bed(unclustered.each(cluster_rename).saveas(), lim))
print('\nmerged:')
print(ascii_bed(f.each(fix).saveas(), lim))
print('key:')
print('\n'.join('{0:>5}: {1}'.format(i, code(i)) for i in ['a', 'b', 'c', 'ab', 'ac', 'bc', 'abc']))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment