Skip to content

Instantly share code, notes, and snippets.

@elgartmi
Last active January 28, 2017 16:08
Show Gist options
  • Save elgartmi/58f82656ea503566f12196aef10ec4e0 to your computer and use it in GitHub Desktop.
Identify Nextera barcodes from Illumina Index FASTQ files (without prior knowledge)
from __future__ import print_function
from itertools import islice
import sys
# Reference lists of known 8-bp Illumina Nextera index sequences: the S5xx
# ("S") set and the N7xx ("N") set.
# NOTE(review): neither list is referenced anywhere else in this file — they
# appear to be kept only as a visual reference for the expected barcodes.
Nextera_S_barcodes=['CTCTCTAT','GCGTAAGA','TATCCTCT','AGAGTAGA','GTAAGGAG','ACTGCATA','AAGGAGTA','CTAAGCCT']
Nextera_N_barcodes=['TAAGGCGA','CGTACTAG','AGGCAGAA','TCCTGAGC','GGACTCCT','TAGGCATG','CTCTCTAC','CAGAGAGG',
'GCTACGCT','CGAGGCTG','AAGAGGCA','GTAGAGGA']
def eprint(*message_parts, **print_kwargs):
    """Write a diagnostic message to stderr (same call signature as print).

    Keeps progress/status chatter off stdout, which carries the results.
    """
    print(*message_parts, file=sys.stderr, **print_kwargs)
def levenshtein(a, b):
    """Return the Levenshtein (edit) distance between strings a and b.

    Two-row dynamic-programming formulation; the shorter string indexes the
    row, so only O(min(len(a), len(b))) extra space is used.
    """
    shorter, longer = (b, a) if len(a) > len(b) else (a, b)
    width = len(shorter)
    row = list(range(width + 1))
    for i, long_ch in enumerate(longer, start=1):
        prev, row = row, [i] + [0] * width
        for j, short_ch in enumerate(shorter, start=1):
            # Substitution cost is 0 on a character match, 1 otherwise.
            substitute = prev[j - 1] + (short_ch != long_ch)
            insert = prev[j] + 1
            delete = row[j - 1] + 1
            row[j] = min(insert, delete, substitute)
    return row[width]
# Prefer the C-accelerated python-Levenshtein package when installed; fall
# back to the pure-Python implementation above otherwise.
try:
    from Levenshtein import distance
    lev_dist = distance
    eprint("Levenshtein module found, using it for speedup")
except ImportError:
    # Narrowed from a bare `except:`, which would also have swallowed
    # KeyboardInterrupt/SystemExit and any unrelated error in the try body.
    lev_dist = levenshtein
    eprint("Levenshtein module not found, using slow python alternative")
def ScanIndexFiles(I1, I2, barcodelength):
    """Count barcode-pair occurrences across two Illumina index FASTQ files.

    Reads both files in lockstep, four lines per FASTQ record, and builds a
    histogram keyed by "<I1 barcode>-<I2 barcode>", where each barcode is the
    first `barcodelength` characters of the record's sequence line.

    Parameters:
        I1, I2        -- paths to the two index FASTQ files (plain text)
        barcodelength -- number of leading sequence characters to keep

    Returns the {barcode_pair: count} dict, or None when the files fall out
    of sync or a record header does not start with '@' (the error is printed
    to stdout, matching the script's existing reporting style).
    """
    data = {}
    tRecords = 0
    with open(I1) as i1, open(I2) as i2:
        while True:
            next_record_1 = list(islice(i1, 4))
            next_record_2 = list(islice(i2, 4))
            if not next_record_1:
                break  # clean EOF on file 1
            if not next_record_2:
                print("New record in file 1, but not in file 2. Record %d"%(tRecords))
                return
            if next_record_1[0][0]!='@':
                print("Supposed to be New record in file 1, but is not. Record %d"%(tRecords))
                return
            if next_record_2[0][0] != '@':
                print("Supposed to be New record in file 2, but is not. Record %d" % (tRecords))
                return
            tBc = "%s-%s" % (next_record_1[1][:barcodelength], next_record_2[1][:barcodelength])
            # dict.get replaces the Python-2-only dict.has_key, which raises
            # AttributeError on Python 3.
            data[tBc] = data.get(tBc, 0) + 1
            tRecords += 1
            if not (tRecords % 10000):
                eprint("Processed %d sequences" % (tRecords))
    return data
def CollapseSimilar(data, dist, minoccur):
    """Merge barcode pairs whose halves are within `dist` edits of an
    already-accepted pair, accumulating their counts.

    Parameters:
        data     -- {"LEFT-RIGHT": count} histogram from ScanIndexFiles
        dist     -- maximum Levenshtein distance for a half-barcode to count
                    as a sequencing-error variant of an accepted half
        minoccur -- pairs with count <= minoccur are skipped entirely

    Pairs are visited from most to least frequent, so the true barcodes are
    seen first and become collapse targets for their rarer variants.  A half
    that is close to MORE than one accepted half is ambiguous and is kept as
    a brand-new barcode instead of being collapsed.

    Returns {"LEFT-RIGHT": {"count": total, "mismatches": {variant: count}}},
    or None when a supposedly-new key already exists (algorithm-bug guard).
    """
    ndata = {}
    ndatal = {}   # accepted left halves seen so far
    ndatar = {}   # accepted right halves seen so far
    cnt = 0
    eprint("Collapsing %d barcodes into similar by distance %d"%(len(data),dist))
    for w in sorted(data, key=data.get, reverse=True):
        if data[w] > minoccur:
            tl = w.split('-')[0]
            tr = w.split('-')[1]
            il = ""
            for l in ndatal.keys():
                if lev_dist(l, tl) <= dist:
                    if il == "":
                        il = l
                    else:  # found more than one similar half
                        il = ""  # ambiguous: treat as if nothing matched
                        break
            ir = ""
            for r in ndatar.keys():
                if lev_dist(r, tr) <= dist:
                    if ir == "":
                        ir = r
                    else:  # found more than one similar half
                        ir = ""  # ambiguous: treat as if nothing matched
                        break
            if ((ir != "") and (il != "")):
                # Both halves collapse onto accepted ones.
                bc = "%s-%s" % (il, ir)
                # `in` replaces the Python-2-only dict.has_key (crashes on py3).
                if bc not in ndata:
                    ndata[bc] = {}
                    ndata[bc]["count"] = data[w]
                    ndata[bc]["mismatches"] = {}
                    ndatal[il] = 1
                    ndatar[ir] = 1
                else:
                    ndata[bc]["count"] += data[w]
                ndata[bc]["mismatches"][w] = data[w]
            else:
                # No (unambiguous) match: accept w as a new barcode pair.
                bc = w
                if bc in ndata:
                    print("Encountered an unexpected key (%s), probably algorithm bug "%(bc))
                    return
                ndata[bc] = {}
                ndata[bc]["count"] = data[w]
                ndata[bc]["mismatches"] = {}
                ndatal[tl] = 1
                ndatar[tr] = 1
        cnt += 1
        if not (cnt % 100):
            eprint("Processed %f %%" % (100.0*cnt/len(data)))
    return ndata
def read_processedfile(fname):
    """Re-load the "<barcode-pair> <count>" lines emitted by this script.

    Skips progress lines ("Processed ..."), the trailing "Done." line, blank
    lines, and the "Collapsed" section separator.  Every remaining line must
    be "LEFT-RIGHT count" with both halves the same length.

    Returns {barcode_pair: count}, or None on a malformed line (an error is
    printed to stdout, matching the script's existing reporting style).
    """
    data = {}
    with open(fname) as f:
        for l in f:
            # BUG FIX: a line obtained by iterating a file contains at most
            # one '\n', so the original comparison l == "\n\n\nCollapsed\n\n"
            # could never match and the separator/blank lines this script
            # itself prints failed the format check.  Compare the stripped
            # line instead.
            stripped = l.strip()
            if ((not stripped) or l.startswith("Processed")
                    or l.startswith("Done.") or stripped == "Collapsed"):
                continue
            d = stripped.split(' ')
            if ((len(d) != 2) or (len(d[0].split('-')) != 2)
                    or (len(d[0].split('-')[0]) != len(d[0].split('-')[1]))):
                print("Unknown format")
                return
            data[d[0]] = int(d[1])
    return data
if __name__ == "__main__":
    # argv layout: script IndexFile1 IndexFile2 barcode_length
    # allowedmutations min_occurrences -> 6 entries total.
    # BUG FIX: the original guard was `len(sys.argv) < 5`, but sys.argv[5] is
    # read below, so calling with exactly 4 arguments slipped past the usage
    # message and crashed with IndexError.
    if len(sys.argv) < 6:
        print("usage: Generate_runscripts_v1.py IndexFile1 IndexFile2 barcode_length allowedmutations min_occurrences")
        sys.exit(1)
    I1f = sys.argv[1]
    I2f = sys.argv[2]
    NexteraBarcodeL = int(sys.argv[3])
    okmut = int(sys.argv[4])
    min_occur = int(sys.argv[5])
    data = ScanIndexFiles(I1f, I2f, NexteraBarcodeL)
    if data is None:
        # ScanIndexFiles already printed the specific error.
        sys.exit(1)
    # First report: raw barcode pairs above the cutoff, most frequent first.
    for w in sorted(data, key=data.get, reverse=True):
        if data[w] > min_occur:
            print(w, data[w])
        else:
            break  # sorted descending: nothing later can exceed the cutoff
    print("\n\n\nCollapsed\n\n")
    # Second report: pairs merged across likely sequencing errors.
    coldata = CollapseSimilar(data, okmut, min_occur)
    sd = {}
    for i in coldata.keys():
        sd[i] = coldata[i]["count"]
    for w in sorted(sd, key=sd.get, reverse=True):
        ts = ""
        for i in sorted(coldata[w]["mismatches"], key=coldata[w]["mismatches"].get, reverse=True):
            ts = "%s\t%s\t%d" % (ts, i, coldata[w]["mismatches"][i])
        print("%s\t%d\t%d\t%s" % (w, coldata[w]["count"], len(coldata[w]["mismatches"]), ts))
    eprint("Done.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment