Last active
January 28, 2017 16:08
-
-
Save elgartmi/58f82656ea503566f12196aef10ec4e0 to your computer and use it in GitHub Desktop.
Identify Nextera barcodes from Illumina Index FASTQ files (without prior knowledge)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function | |
from itertools import islice | |
import sys | |
Nextera_S_barcodes=['CTCTCTAT','GCGTAAGA','TATCCTCT','AGAGTAGA','GTAAGGAG','ACTGCATA','AAGGAGTA','CTAAGCCT'] | |
Nextera_N_barcodes=['TAAGGCGA','CGTACTAG','AGGCAGAA','TCCTGAGC','GGACTCCT','TAGGCATG','CTCTCTAC','CAGAGAGG', | |
'GCTACGCT','CGAGGCTG','AAGAGGCA','GTAGAGGA'] | |
def eprint(*args, **kwargs): | |
print(*args, file=sys.stderr, **kwargs) | |
def levenshtein(a, b): | |
n, m = len(a), len(b) | |
if n > m: | |
# Make sure n <= m, to use O(min(n,m)) space | |
a, b = b, a | |
n, m = m, n | |
current = range(n + 1) | |
for i in range(1, m + 1): | |
previous, current = current, [i] + [0] * n | |
for j in range(1, n + 1): | |
add, delete = previous[j] + 1, current[j - 1] + 1 | |
change = previous[j - 1] | |
if a[j - 1] != b[i - 1]: | |
change = change + 1 | |
current[j] = min(add, delete, change) | |
return current[n] | |
try: | |
from Levenshtein import distance | |
lev_dist=distance | |
eprint("Levenshtein module found, using it for speedup") | |
except: | |
lev_dist=levenshtein | |
eprint("Levenshtein module not found, using slow python alternative") | |
def ScanIndexFiles(I1,I2,barcodelength): | |
data={} | |
tRecords=0 | |
with open(I1) as i1, open(I2) as i2: | |
while True: | |
next_record_1 = list(islice(i1, 4)) | |
next_record_2 = list(islice(i2, 4)) | |
if not next_record_1: | |
break | |
if not next_record_2: | |
print("New record in file 1, but not in file 2. Record %d"%(tRecords)) | |
return | |
if next_record_1[0][0]!='@': | |
print("Supposed to be New record in file 1, but is not. Record %d"%(tRecords)) | |
return | |
if next_record_2[0][0] != '@': | |
print("Supposed to be New record in file 2, but is not. Record %d" % (tRecords)) | |
return | |
tBc="%s-%s"%(next_record_1[1][:barcodelength],next_record_2[1][:barcodelength]) | |
if not data.has_key(tBc): | |
data[tBc]=0 | |
data[tBc] += 1 | |
tRecords+=1 | |
if not (tRecords%10000): | |
eprint("Processed %d sequences"%(tRecords)) | |
return data | |
def CollapseSimilar(data,dist,minoccur): | |
ndata={} | |
ndatal={} | |
ndatar={} | |
cnt=0 | |
eprint("Collapsing %d barcodes into similar by distance %d"%(len(data),dist)) | |
for w in sorted(data, key=data.get, reverse=True): | |
if data[w]>minoccur: | |
tl=w.split('-')[0] | |
tr=w.split('-')[1] | |
il="" | |
for l in ndatal.keys(): | |
if lev_dist(l,tl)<=dist: | |
if il=="": | |
il=l | |
else: #if found more than one similar | |
il="" #its like didn't find anything - can't collapse | |
break | |
ir = "" | |
for r in ndatar.keys(): | |
if lev_dist(r, tr) <= dist: | |
if ir == "": | |
ir = r | |
else: # if found more than one similar | |
ir = "" # its like didn't find anything - can't collapse | |
break | |
if ((ir!="")and(il!="")): | |
bc="%s-%s"%(il,ir) | |
if not ndata.has_key(bc): | |
ndata[bc]={} | |
ndata[bc]["count"]=data[w] | |
ndata[bc]["mismatches"] = {} | |
ndatal[il] = 1 | |
ndatar[ir] = 1 | |
else: | |
ndata[bc]["count"] += data[w] | |
ndata[bc]["mismatches"][w]=data[w] | |
else: | |
bc=w | |
if ndata.has_key(bc): | |
print("Encountered an unexpected key (%s), probably algorithm bug "%(bc)) | |
return | |
ndata[bc]={} | |
ndata[bc]["count"]=data[w] | |
ndata[bc]["mismatches"]={} | |
ndatal[tl] = 1 | |
ndatar[tr] = 1 | |
cnt+=1 | |
if not (cnt % 100): | |
eprint("Processed %f %%" % (100.0*cnt/len(data))) | |
return ndata | |
def read_processedfile(fname): | |
data={} | |
with open(fname) as f: | |
for l in f: | |
if ((l.startswith("Processed"))or(l.startswith("Done."))or(l=="\n\n\nCollapsed\n\n")): | |
continue | |
else: | |
d=l.strip().split(' ') | |
if ((len(d)!=2)or(len(d[0].split('-'))!=2)or(len(d[0].split('-')[0])!=len(d[0].split('-')[1]))): | |
print("Unknown format") | |
return | |
else: | |
data[d[0]]=int(d[1]) | |
return data | |
if __name__ == "__main__": | |
if len(sys.argv)<5: | |
print("usage: Generate_runscripts_v1.py IndexFile1 IndexFile2 barcode_length allowedmutations min_occurrences") | |
exit() | |
I1f = sys.argv[1] | |
I2f = sys.argv[2] | |
NexteraBarcodeL = int(sys.argv[3]) | |
okmut= int(sys.argv[4]) | |
min_occur = int(sys.argv[5]) | |
data=ScanIndexFiles(I1f,I2f,NexteraBarcodeL) | |
for w in sorted(data, key=data.get, reverse=True): | |
if data[w]>min_occur: | |
print(w, data[w]) | |
else: | |
break | |
print("\n\n\nCollapsed\n\n") | |
coldata=CollapseSimilar(data,okmut,min_occur) | |
sd={} | |
for i in coldata.keys(): | |
sd[i]=coldata[i]["count"] | |
for w in sorted(sd, key=sd.get, reverse=True): | |
ts="" | |
for i in sorted(coldata[w]["mismatches"], key=coldata[w]["mismatches"].get, reverse=True): | |
ts="%s\t%s\t%d"%(ts,i,coldata[w]["mismatches"][i]) | |
print("%s\t%d\t%d\t%s"%(w,coldata[w]["count"],len(coldata[w]["mismatches"]),ts)) | |
eprint("Done.") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment