Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Script for aligning and reconstructing overlapping subsequences - LabyREnth Random 3
import fileinput
from StringIO import StringIO
import networkx as nx
import subprocess
import glob
import base64 as b64
# http://stackoverflow.com/questions/1285434/efficient-algorithm-for-string-concatenation-with-overlap
def concat(*args):
result = ''
for arg in args:
result = _concat(result, arg)
return result
def _concat(a, b):
la = len(a)
lb = len(b)
for i in range(la):
j = i
k = 0
while j < la and k < lb and a[j] == b[k]:
j += 1
k += 1
if j == la:
n = k
break
else:
n = 0
return a + b[n:]
# https://neil.fraser.name/news/2010/11/04/
def commonOverlapIndexOf(text1, text2):
text1_length = len(text1)
text2_length = len(text2)
if text1_length == 0 or text2_length == 0:
return 0
if text1_length > text2_length:
text1 = text1[-text2_length:]
elif text1_length < text2_length:
text2 = text2[:text1_length]
if text1 == text2:
return min(text1_length, text2_length)
best = 0
length = 1
while True:
pattern = text1[-length:]
found = text2.find(pattern)
if found == -1:
return best
length += found
if text1[-length:] == text2[:length]:
best = length
length += 1
def visualize(g):
dot_file = "./graph.dot"
fname = "./graph.png"
png_file = open(fname, 'w')
nx.write_dot(g, dot_file)
proc = subprocess.Popen(["dot", "-Tpng", dot_file], stdout=png_file)
proc.wait()
kmers = {}
for kmer in glob.glob("kmers/*.bin"):
with open(kmer, 'rb') as f:
kmers[kmer] = f.read()
minimum_overlap_length = 15
g = nx.DiGraph()
for name1,kmer1 in kmers.iteritems():
for name2,kmer2 in kmers.iteritems():
if kmer1 == kmer2 or kmer1 in kmer2 or kmer2 in kmer1:
continue
best = commonOverlapIndexOf(kmer1, kmer2)
if best > minimum_overlap_length:
g.add_edge(name1, name2, overlap_idx=best)
best = commonOverlapIndexOf(kmer2, kmer1)
if best > minimum_overlap_length:
g.add_edge(name2, name1, overlap_idx=best)
l = nx.line_graph(g)
#visualize(l)
# identify the head of the chain
head = False
for ok1, ok2 in l.nodes():
for ik1, ik2 in l.nodes():
if ok1 == ik2:
head = False
break
head = True
if head == True:
head_link = (ok1, ok2)
# identify the tail of the chain
tail = False
for ok1, ok2 in l.nodes():
for ik1, ik2 in l.nodes():
if ok2 == ik1:
tail = False
break
tail = True
if tail == True:
tail_link = (ok1, ok2)
# iterate over the remaining nodes in the least efficient way possible
# to locate the next link in the chain and recreate the entire chain
chain = []
nodes = l.nodes()
chain.append(head_link)
nodes.remove(head_link)
while len(nodes) > 1:
for n1,n2 in nodes:
if n1 == chain[len(chain)-1][1]:
chain.append( (n1,n2) )
# we are mutating the what we iterate over from within our loop
# THIS CAN BE DANGEROUS
nodes.remove( (n1,n2) )
break
# the only difference is our tail_link which we add later
#print set(set(l.nodes())).difference(set(chain))
#print tail_link
# flatten the chain
o = []
for n1,n2 in chain:
o.append(n1)
# append the tail to the chain
o.append(tail_link[0])
o.append(tail_link[1])
# the list 'o' now cotains the order of the *.bin files *BUT*
# each file overlaps with other files, meaning we need to
# identify the overlap and ensure it is only within our final
# data once. Each *.bin takes the form of:
# [10-15 header bytes] [200-300 body bytes] [10-15 trailer bytes]
# Where header and trailer bytes overlap with other *.bin files.
# Concatinating all the *.bin files together means header and trailer
# byte overlaps are included twice.
everything = ''
for each in o:
with open(each, 'rb') as f:
everything = concat(everything, f.read())
with open("solution.zip", 'wb') as f:
f.write(b64.b64decode(everything))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.