Last active
August 29, 2015 13:56
-
-
Save jayhesselberth/8830436 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
''' seqtools-design-barcodes: Design barcoded Illumina adaptors. | |
Constraints: | |
1. No homopolyeric runs | |
2. Minimum GC content is 40% | |
3. Hamming dist > 1 (for barcodes > 4 nt) | |
4. At least 3 bases in represented in each bar code | |
5. Relatively equal base representation at each position for bar | |
codes in a set | |
6. No bar codes are substrings of each other at their 5' ends | |
''' | |
import sys | |
import pdb | |
from itertools import product, combinations | |
from collections import Counter | |
from random import sample | |
__version__ = '$Revision: 357 $' | |
__author__ = 'Jay Hesselberth' | |
__contact__ = 'jay.hesselberth@gmail.com' | |
# barcode | |
SEQ_FOR_NAME_SPEC='Illumina-BC%d-%s-FOR' | |
SEQ_FOR_SPEC='ACACTCTTTCCCTACACGACGCTCTTCCGATCT%s*T' | |
# reverse comp | |
SEQ_REV_NAME_SPEC='Illumina-BC%d-%s-REV' | |
SEQ_REV_SPEC='/5phos/%sAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG' | |
# Truseq | |
TRUSEQ_FOR_NAME_SPEC='TRU-IDX%d-%s' | |
TRUSEQ_FOR_SPEC='/5phos/GATCGGAAGAGCACACGTCTGAACTCCAGTCAC%sATCTCGTATGCCGTCTTCTGCTTG' | |
def design_barcodes(num_barcodes, barcode_lens, min_hamming, | |
min_base_prop, truseq, verbose): | |
last_barcodes = [] | |
cur_barcode_num = 1 | |
for barcode_len in sorted(barcode_lens): | |
if verbose: | |
print >>sys.stderr, ">> Designing barcodes of length %d" % \ | |
barcode_len | |
cur_barcodes = generate_barcodes(length=barcode_len) | |
cur_barcodes = validate_barcodes(cur_barcodes) | |
if last_barcodes: | |
cur_barcodes = remove_substring_barcodes(last_barcodes, | |
cur_barcodes) | |
satisfied = False | |
while not satisfied: | |
if verbose: | |
print >>sys.stderr, ">> selecting set..." | |
barcodes_sample = sample(cur_barcodes, num_barcodes) | |
if not optimal_cycle_content(barcodes_sample, barcode_len, | |
min_base_prop, verbose): | |
continue | |
if not optimal_hamming(barcodes_sample, min_hamming, | |
barcode_len, verbose): | |
continue | |
satisfied = True | |
barcodes_sample.sort() | |
for barcode in barcodes_sample: | |
if truseq: | |
print '%s\t%s' % (TRUSEQ_FOR_NAME_SPEC % (cur_barcode_num, | |
barcode), | |
TRUSEQ_FOR_SPEC % barcode) | |
else: | |
print '%s\t%s' % (SEQ_FOR_NAME_SPEC % (cur_barcode_num, | |
barcode), | |
SEQ_FOR_SPEC % barcode) | |
print '%s\t%s' % (SEQ_REV_NAME_SPEC % (cur_barcode_num, | |
barcode), | |
SEQ_REV_SPEC % reverse_complement(barcode)) | |
cur_barcode_num += 1 | |
last_barcodes.extend(barcodes_sample) | |
def reverse_complement(seq): | |
''' reverse complement sequence ''' | |
basepairs={'A':'T','T':'A','G':'C','C':'G'} | |
revcomp = [] | |
for char in seq: | |
revcomp.append(basepairs[char]) | |
revcomp.reverse() | |
return ''.join(revcomp) | |
def remove_substring_barcodes(sub_barcodes, barcodes): | |
''' identify and remove barcodes that have identical bases at their | |
beginnings ''' | |
validated = [] | |
for barcode in barcodes: | |
found_sub = False | |
for sub_barcode in sub_barcodes: | |
if barcode.startswith(sub_barcode): | |
found_sub = True | |
if not found_sub: | |
validated.append(barcode) | |
return validated | |
def optimal_hamming(barcodes, min_hamming, barcode_len, verbose): | |
''' determine whether any set of 2 barcodes is less than some minimum | |
hamming distance ''' | |
# hamming dist is too stringent for barcodes <= 5 | |
if barcode_len <= 5: return True | |
for this, other in combinations(barcodes, r=2): | |
dist = hamming_dist(this, other) | |
if dist < min_hamming: | |
if verbose: | |
print >>sys.stderr, ">> hamming not satisfied..." | |
return False | |
return True | |
def hamming_dist(this, other): | |
''' calc hamming distance (i.e. number of characters different between | |
two strings)''' | |
assert len(this) == len(other) | |
dist = 0 | |
for pos, char in enumerate(this): | |
if other[pos] != char: | |
dist += 1 | |
return dist | |
def optimal_cycle_content(barcodes, num_cycles, min_base_prop, verbose): | |
''' select set with optimal cycle content i.e. proportional nucleotide | |
content at each position ''' | |
num_barcodes = float(len(barcodes)) | |
for pos in range(num_cycles): | |
nuc_comps = Counter() | |
for barcode in barcodes: | |
nuc = barcode[pos] | |
nuc_comps[nuc] += 1 | |
vals = nuc_comps.values() | |
min_val = min(vals) | |
base_prop = float(min_val) / float(num_barcodes) | |
if base_prop < min_base_prop: | |
if verbose: | |
print >>sys.stderr, ">> cycle content not satisfied..." | |
return False | |
return True | |
def validate_barcodes(barcodes): | |
''' master func for all checks ''' | |
validated = [] | |
for barcode in barcodes: | |
if not check_homopolymer(barcode): | |
continue | |
if not check_gc_content(barcode): | |
continue | |
if not check_composition(barcode): | |
continue | |
validated.append(barcode) | |
return validated | |
def check_homopolymer(barcode): | |
''' checks for homopolymer runs in barcode ''' | |
dinucs = set(['AA','CC','TT','GG']) | |
for idx, nuc in enumerate(barcode): | |
dinuc = barcode[idx:idx+2] | |
if dinuc in dinucs: | |
return False | |
return True | |
def check_gc_content(barcode, min_gc=0.4): | |
''' determines whether gc_content of barcode is less than some minimum | |
gc content''' | |
num_gc = float(len([i for i in barcode if i in 'GC'])) | |
gc_cont = num_gc / float(len(barcode)) | |
if gc_cont < min_gc: | |
return False | |
return True | |
def check_composition(barcode, min_nucs=3): | |
''' determines whether composition i.e. number of bases represented is | |
less than some minimum number ''' | |
uniq_nucs = dict.fromkeys([i for i in barcode]) | |
num_nucs = len(uniq_nucs.keys()) | |
if num_nucs < min_nucs: | |
return False | |
return True | |
def generate_barcodes(length): | |
nucs = 'ACTG' | |
return [''.join(i) for i in product(nucs,repeat=length)] | |
def parse_options(args): | |
from optparse import OptionParser | |
parser = OptionParser() | |
parser.add_option('-v','--verbose',action='store_true') | |
parser.add_option('-l','--barcode-length',action='append', | |
default=[]) | |
parser.add_option('-n','--number-barcodes',action='store', | |
type='int', | |
help='number of bar codes (default: %default)', | |
default=32) | |
parser.add_option('-t','--truseq',action='store_true', | |
help='Design nested Truseq adaptors (default: %default)', | |
default=False) | |
parser.add_option('-m','--minimum-hamming',action='store', | |
type='int', dest='min_hamming', | |
help='minimum hamming distance (default: %default)', | |
default=1) | |
parser.add_option('-c','--minimum-base-proportion',action='store', | |
type='float', dest='min_base_prop', | |
help='minimum base proportion (default: %default)', | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment