Skip to content

Instantly share code, notes, and snippets.

@wckdouglas
Last active October 24, 2017 14:36
Show Gist options
  • Save wckdouglas/5d4322b8e779f8f55431bdd8372e36e8 to your computer and use it in GitHub Desktop.
Save wckdouglas/5d4322b8e779f8f55431bdd8372e36e8 to your computer and use it in GitHub Desktop.
chr1 566309 566361 - GTGATT
chr1 568081 568136 + TTGAAA,TTGAAA,TTGAAA
chr1 16840712 16840773 - TACTTA
chr1 17067025 17222642 + TGGCAG
chr1 45196732 45196802 - TCGCCT
chr1 161424755 161432231 - GCGTTG
chr1 161493635 161493697 - TGGTGG
chr1 173834762 173834818 - TTGTAA
chr1 173835773 173835846 - TCCACA
chr1 204475714 204475737 + TCAAGT
#!/usr/bin/env python
from __future__ import print_function
from itertools import combinations
from functools import partial
from collections import Counter
from networkx import Graph, connected_components
import sys
import argparse
from numba import jit
def get_opt():
help_message = 'Demultiplexing UMI bed file with the follwing columns:\n'+\
'1. chrom name\n2. start\n3. end\n4. strand\n5. collapsed barcodes delim by ","\n'+\
'This file can be generated by the command: \n'+\
' datamash -g 1,2,3,6 collapse 4 \n'+\
'with barcode at column $I of a $BED_FILE \n'+\
'The program internally used hamming distance matrix of the '+\
'barcodes to generate connected network and cluster them'
parser = argparse.ArgumentParser(description=help_message)
parser.add_argument('-i','--infile',
help ='input bedfile, can be "-" or "/dev/stdin" for stdin (default: -).' +\
'Format should be: ' + \
'\n1. chrom name\n2. start\n3. end\n4. strand\n5. collapsed barcodes delim by ","',
default = '-')
parser.add_argument('-t','--threshold',
help='How many error between barcodes can be tolerated? (default = 1)',
type=int, default = 1)
args = parser.parse_args()
return args
@jit(nopython=True)
def hamming_barcode(barcode_pair):
a, b = barcode_pair
assert len(a) == len(b), 'Wrong barcode extraction'
hd = 0
for i, j in zip(a, b):
if i != j:
hd += 1
return hd
@jit()
def make_graph(comparison, threshold):
'''
Using a graph to connect all umi with <= threshold mismatches
'''
G = Graph()
for pair in comparison:
if hamming_barcode(pair) <= threshold:
G.add_edge(pair[0],pair[1])
else:
G.add_node(pair[0])
G.add_node(pair[1])
return G
@jit()
def unique_barcode_from_graph(graph, barcodes):
'''
'''
unique_barcode = []
for subgraph in connected_components(graph):
subgraph = list(subgraph)
if len(subgraph) == 1:
barcode_id = subgraph[0]
member_counts = barcodes[barcode_id]
barcode_id = barcode_id + '_' + str(member_counts) + '_members'
else:
member_counts = sum(barcodes[bc] for bc in subgraph)
barcode_id = subgraph[0] + '_' + str(member_counts) + '_members'
unique_barcode.append(barcode_id)
return unique_barcode
def demultiplex(barcodes, threshold):
comparison = combinations(barcodes.keys(),r=2)
graph = make_graph(comparison, threshold)
unique_barcode = unique_barcode_from_graph(graph, barcodes)
#print 'In: %i, out: %i' %(len(barcodes), len(unique_barcode))
return unique_barcode
@jit()
def make_line(chrom, start, end, strand, barcode):
'''
Generate output bed line
'''
length = long(end) - long(start)
template = '{chrom}\t{start}\t{end}\t{barcode}\t{length}\t{strand}' \
.format(chrom = chrom,
start = start,
end = end,
barcode = barcode,
length = length,
strand= strand)
print(template, file=sys.stdout)
return 0
@jit()
def file_processing(file_handle, threshold):
'''
parsing bed line and extract distinct barcode, count the duplicates and out put bed line
'''
out_count = 0
for i, line in enumerate(file_handle):
line = line.rstrip('\n')
fields = line.split('\t')
barcodes = fields[-1]
template = partial(make_line, fields[0], fields[1], fields[2], fields[3])
if ',' not in barcodes:
barcode_id = barcodes + '_1_members'
template(barcode_id)
out_count += 1
else:
barcodes = barcodes.split(',')
barcodes_counter = Counter(barcodes)
if len(barcodes_counter.keys()) == 1:
barcode_id = barcodes[0] + '_' + str(barcodes_counter[barcodes[0]]) + '_members'
template(barcode_id)
out_count += 1
else:
barcode_ids = demultiplex(barcodes_counter, threshold)
for barcode_id in barcode_ids:
template(barcode_id)
out_count += 1
if i % 10000000 == 0:
print('Parsed %i lines' %(i), file=sys.stderr)
print('Ouput: %i lines' %(out_count), file=sys.stderr)
return 0
def main():
args = get_opt()
filename = args.infile
threshold = args.threshold
if filename != '-' and filename != '/dev/stdin':
file_handle = open(filename, 'r')
sys.stderr.write('Using file %s\n' %filename)
else:
file_handle = sys.stdin
print('Using stdin...', file=sys.stderr)
file_processing(file_handle, threshold)
return 0
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment