Skip to content

Instantly share code, notes, and snippets.

@jayhesselberth
Last active August 29, 2015 13:56
Show Gist options
  • Save jayhesselberth/8830436 to your computer and use it in GitHub Desktop.
Save jayhesselberth/8830436 to your computer and use it in GitHub Desktop.
#! /usr/bin/env python
''' seqtools-design-barcodes: Design barcoded Illumina adaptors.
Constraints:
1. No homopolyeric runs
2. Minimum GC content is 40%
3. Hamming dist > 1 (for barcodes > 4 nt)
4. At least 3 bases in represented in each bar code
5. Relatively equal base representation at each position for bar
codes in a set
6. No bar codes are substrings of each other at their 5' ends
'''
import sys
import pdb
from itertools import product, combinations
from collections import Counter
from random import sample
__version__ = '$Revision: 357 $'
__author__ = 'Jay Hesselberth'
__contact__ = 'jay.hesselberth@gmail.com'
# barcode
SEQ_FOR_NAME_SPEC='Illumina-BC%d-%s-FOR'
SEQ_FOR_SPEC='ACACTCTTTCCCTACACGACGCTCTTCCGATCT%s*T'
# reverse comp
SEQ_REV_NAME_SPEC='Illumina-BC%d-%s-REV'
SEQ_REV_SPEC='/5phos/%sAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG'
# Truseq
TRUSEQ_FOR_NAME_SPEC='TRU-IDX%d-%s'
TRUSEQ_FOR_SPEC='/5phos/GATCGGAAGAGCACACGTCTGAACTCCAGTCAC%sATCTCGTATGCCGTCTTCTGCTTG'
def design_barcodes(num_barcodes, barcode_lens, min_hamming,
min_base_prop, truseq, verbose):
last_barcodes = []
cur_barcode_num = 1
for barcode_len in sorted(barcode_lens):
if verbose:
print >>sys.stderr, ">> Designing barcodes of length %d" % \
barcode_len
cur_barcodes = generate_barcodes(length=barcode_len)
cur_barcodes = validate_barcodes(cur_barcodes)
if last_barcodes:
cur_barcodes = remove_substring_barcodes(last_barcodes,
cur_barcodes)
satisfied = False
while not satisfied:
if verbose:
print >>sys.stderr, ">> selecting set..."
barcodes_sample = sample(cur_barcodes, num_barcodes)
if not optimal_cycle_content(barcodes_sample, barcode_len,
min_base_prop, verbose):
continue
if not optimal_hamming(barcodes_sample, min_hamming,
barcode_len, verbose):
continue
satisfied = True
barcodes_sample.sort()
for barcode in barcodes_sample:
if truseq:
print '%s\t%s' % (TRUSEQ_FOR_NAME_SPEC % (cur_barcode_num,
barcode),
TRUSEQ_FOR_SPEC % barcode)
else:
print '%s\t%s' % (SEQ_FOR_NAME_SPEC % (cur_barcode_num,
barcode),
SEQ_FOR_SPEC % barcode)
print '%s\t%s' % (SEQ_REV_NAME_SPEC % (cur_barcode_num,
barcode),
SEQ_REV_SPEC % reverse_complement(barcode))
cur_barcode_num += 1
last_barcodes.extend(barcodes_sample)
def reverse_complement(seq):
''' reverse complement sequence '''
basepairs={'A':'T','T':'A','G':'C','C':'G'}
revcomp = []
for char in seq:
revcomp.append(basepairs[char])
revcomp.reverse()
return ''.join(revcomp)
def remove_substring_barcodes(sub_barcodes, barcodes):
''' identify and remove barcodes that have identical bases at their
beginnings '''
validated = []
for barcode in barcodes:
found_sub = False
for sub_barcode in sub_barcodes:
if barcode.startswith(sub_barcode):
found_sub = True
if not found_sub:
validated.append(barcode)
return validated
def optimal_hamming(barcodes, min_hamming, barcode_len, verbose):
''' determine whether any set of 2 barcodes is less than some minimum
hamming distance '''
# hamming dist is too stringent for barcodes <= 5
if barcode_len <= 5: return True
for this, other in combinations(barcodes, r=2):
dist = hamming_dist(this, other)
if dist < min_hamming:
if verbose:
print >>sys.stderr, ">> hamming not satisfied..."
return False
return True
def hamming_dist(this, other):
''' calc hamming distance (i.e. number of characters different between
two strings)'''
assert len(this) == len(other)
dist = 0
for pos, char in enumerate(this):
if other[pos] != char:
dist += 1
return dist
def optimal_cycle_content(barcodes, num_cycles, min_base_prop, verbose):
''' select set with optimal cycle content i.e. proportional nucleotide
content at each position '''
num_barcodes = float(len(barcodes))
for pos in range(num_cycles):
nuc_comps = Counter()
for barcode in barcodes:
nuc = barcode[pos]
nuc_comps[nuc] += 1
vals = nuc_comps.values()
min_val = min(vals)
base_prop = float(min_val) / float(num_barcodes)
if base_prop < min_base_prop:
if verbose:
print >>sys.stderr, ">> cycle content not satisfied..."
return False
return True
def validate_barcodes(barcodes):
''' master func for all checks '''
validated = []
for barcode in barcodes:
if not check_homopolymer(barcode):
continue
if not check_gc_content(barcode):
continue
if not check_composition(barcode):
continue
validated.append(barcode)
return validated
def check_homopolymer(barcode):
''' checks for homopolymer runs in barcode '''
dinucs = set(['AA','CC','TT','GG'])
for idx, nuc in enumerate(barcode):
dinuc = barcode[idx:idx+2]
if dinuc in dinucs:
return False
return True
def check_gc_content(barcode, min_gc=0.4):
''' determines whether gc_content of barcode is less than some minimum
gc content'''
num_gc = float(len([i for i in barcode if i in 'GC']))
gc_cont = num_gc / float(len(barcode))
if gc_cont < min_gc:
return False
return True
def check_composition(barcode, min_nucs=3):
''' determines whether composition i.e. number of bases represented is
less than some minimum number '''
uniq_nucs = dict.fromkeys([i for i in barcode])
num_nucs = len(uniq_nucs.keys())
if num_nucs < min_nucs:
return False
return True
def generate_barcodes(length):
nucs = 'ACTG'
return [''.join(i) for i in product(nucs,repeat=length)]
def parse_options(args):
from optparse import OptionParser
parser = OptionParser()
parser.add_option('-v','--verbose',action='store_true')
parser.add_option('-l','--barcode-length',action='append',
default=[])
parser.add_option('-n','--number-barcodes',action='store',
type='int',
help='number of bar codes (default: %default)',
default=32)
parser.add_option('-t','--truseq',action='store_true',
help='Design nested Truseq adaptors (default: %default)',
default=False)
parser.add_option('-m','--minimum-hamming',action='store',
type='int', dest='min_hamming',
help='minimum hamming distance (default: %default)',
default=1)
parser.add_option('-c','--minimum-base-proportion',action='store',
type='float', dest='min_base_prop',
help='minimum base proportion (default: %default)',
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment