Skip to content

Instantly share code, notes, and snippets.

@ccwang002
Created February 13, 2019 14:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ccwang002/c6fca09bf010bbc20060c194a56697b7 to your computer and use it in GitHub Desktop.
Save ccwang002/c6fca09bf010bbc20060c194a56697b7 to your computer and use it in GitHub Desktop.
from itertools import combinations, product
def gen_pos_sets_to_sub(barcode, max_sub=1):
"""
Generate all the possible position combinations (sets) within
the given maximal number of substitutions.
Examples:
>>> list(gen_pos_sets_to_sub('ATG', 1))
[(0,), (1,), (2,)]
>>> list(gen_pos_sets_to_sub('ATG', 2))
[(0,), (1,), (2,), (0, 1), (0, 2), (1, 2)]
"""
assert max_sub >= 0
all_pos = range(len(barcode))
for num_sub in range(1, max_sub + 1):
yield from combinations(all_pos, num_sub)
def sub_barcode(barcode, pos_set):
"""
Generate all possible barcode sequences with the substitutions
at the given set of indices.
Examples:
>>> list(sub_barcode('AT', (0,)))
['TT', 'CT', 'GT']
>>> list(sub_barcode('AT', (0, 1)))
['TA', 'TC', 'TG', 'CA', 'CC', 'CG', 'GA', 'GC', 'GG']
"""
# Convert the barcode string into a list of nts
barcode = list(barcode)
# Make the possible nucleotide per position
possible_nts_per_pos = [
# Exclude the original nucleotide
[nt for nt in 'ATCG' if nt != barcode[pos]]
for pos in pos_set
]
# The first for loop assign the new nucleotide
# for each position to be replaced
for new_nts in product(*possible_nts_per_pos):
# Copy the original barcode (list of nts)
barcode_with_sub = barcode[:]
# Replace the barcode sequence with the new nucleotides
# at the given corresponding positions
for ix, nt in zip(pos_set, new_nts):
barcode_with_sub[ix] = nt
# Combine the new barcode into a single string
yield ''.join(barcode_with_sub)
def gen_all_barcodes_with_sub(barcode, max_sub=1):
"""
Generate all possible barcode sequences within the given number of
substitutions.
Examples:
>>> list(gen_all_barcodes_with_sub('AT', 1))
['TT', 'CT', 'GT', 'AA', 'AC', 'AG']
A more realistic example using a longer barcode:
>>> barcode = 'ATATCGATCG'; len(barcode)
10
Count the number of all possible barcodes with <= 2 substitutions
>>> len(list(gen_all_barcodes_with_sub(barcode, 2)))
435
Calculate the theoretical number of barcodes
>>> 10 * (10 - 1) // 2 * ((4 - 1)**2) + len(barcode) * (4 - 1)
435
>>> len(list(gen_all_barcodes_with_sub(barcode, len(barcode)))) == 4 ** len(barcode) - 1
True
"""
barcode = barcode.upper()
assert all(nt in 'ATCG' for nt in barcode)
for pos_set in gen_pos_sets_to_sub(barcode, max_sub):
yield from sub_barcode(barcode, pos_set)
if __name__ == '__main__':
# Run all the examples
import doctest
doctest.testmod(verbose=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment