Created
September 12, 2016 16:24
-
-
Save anthonykasza/19db4d5269b7427e1eefd7d885325a77 to your computer and use it in GitHub Desktop.
Script for aligning and reconstructing overlapping subsequences - LabyREnth Random 3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import fileinput | |
from StringIO import StringIO | |
import networkx as nx | |
import subprocess | |
import glob | |
import base64 as b64 | |
# http://stackoverflow.com/questions/1285434/efficient-algorithm-for-string-concatenation-with-overlap | |
def concat(*args): | |
result = '' | |
for arg in args: | |
result = _concat(result, arg) | |
return result | |
def _concat(a, b): | |
la = len(a) | |
lb = len(b) | |
for i in range(la): | |
j = i | |
k = 0 | |
while j < la and k < lb and a[j] == b[k]: | |
j += 1 | |
k += 1 | |
if j == la: | |
n = k | |
break | |
else: | |
n = 0 | |
return a + b[n:] | |
# https://neil.fraser.name/news/2010/11/04/ | |
def commonOverlapIndexOf(text1, text2): | |
text1_length = len(text1) | |
text2_length = len(text2) | |
if text1_length == 0 or text2_length == 0: | |
return 0 | |
if text1_length > text2_length: | |
text1 = text1[-text2_length:] | |
elif text1_length < text2_length: | |
text2 = text2[:text1_length] | |
if text1 == text2: | |
return min(text1_length, text2_length) | |
best = 0 | |
length = 1 | |
while True: | |
pattern = text1[-length:] | |
found = text2.find(pattern) | |
if found == -1: | |
return best | |
length += found | |
if text1[-length:] == text2[:length]: | |
best = length | |
length += 1 | |
def visualize(g): | |
dot_file = "./graph.dot" | |
fname = "./graph.png" | |
png_file = open(fname, 'w') | |
nx.write_dot(g, dot_file) | |
proc = subprocess.Popen(["dot", "-Tpng", dot_file], stdout=png_file) | |
proc.wait() | |
kmers = {} | |
for kmer in glob.glob("kmers/*.bin"): | |
with open(kmer, 'rb') as f: | |
kmers[kmer] = f.read() | |
minimum_overlap_length = 15 | |
g = nx.DiGraph() | |
for name1,kmer1 in kmers.iteritems(): | |
for name2,kmer2 in kmers.iteritems(): | |
if kmer1 == kmer2 or kmer1 in kmer2 or kmer2 in kmer1: | |
continue | |
best = commonOverlapIndexOf(kmer1, kmer2) | |
if best > minimum_overlap_length: | |
g.add_edge(name1, name2, overlap_idx=best) | |
best = commonOverlapIndexOf(kmer2, kmer1) | |
if best > minimum_overlap_length: | |
g.add_edge(name2, name1, overlap_idx=best) | |
l = nx.line_graph(g) | |
#visualize(l) | |
# identify the head of the chain | |
head = False | |
for ok1, ok2 in l.nodes(): | |
for ik1, ik2 in l.nodes(): | |
if ok1 == ik2: | |
head = False | |
break | |
head = True | |
if head == True: | |
head_link = (ok1, ok2) | |
# identify the tail of the chain | |
tail = False | |
for ok1, ok2 in l.nodes(): | |
for ik1, ik2 in l.nodes(): | |
if ok2 == ik1: | |
tail = False | |
break | |
tail = True | |
if tail == True: | |
tail_link = (ok1, ok2) | |
# iterate over the remaining nodes in the least efficient way possible | |
# to locate the next link in the chain and recreate the entire chain | |
chain = [] | |
nodes = l.nodes() | |
chain.append(head_link) | |
nodes.remove(head_link) | |
while len(nodes) > 1: | |
for n1,n2 in nodes: | |
if n1 == chain[len(chain)-1][1]: | |
chain.append( (n1,n2) ) | |
# we are mutating the what we iterate over from within our loop | |
# THIS CAN BE DANGEROUS | |
nodes.remove( (n1,n2) ) | |
break | |
# the only difference is our tail_link which we add later | |
#print set(set(l.nodes())).difference(set(chain)) | |
#print tail_link | |
# flatten the chain | |
o = [] | |
for n1,n2 in chain: | |
o.append(n1) | |
# append the tail to the chain | |
o.append(tail_link[0]) | |
o.append(tail_link[1]) | |
# the list 'o' now cotains the order of the *.bin files *BUT* | |
# each file overlaps with other files, meaning we need to | |
# identify the overlap and ensure it is only within our final | |
# data once. Each *.bin takes the form of: | |
# [10-15 header bytes] [200-300 body bytes] [10-15 trailer bytes] | |
# Where header and trailer bytes overlap with other *.bin files. | |
# Concatinating all the *.bin files together means header and trailer | |
# byte overlaps are included twice. | |
everything = '' | |
for each in o: | |
with open(each, 'rb') as f: | |
everything = concat(everything, f.read()) | |
with open("solution.zip", 'wb') as f: | |
f.write(b64.b64decode(everything)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment