anthonykasza/kmer_sequencer.py

## kmer_sequencer.py
import fileinput
from StringIO import StringIO
import networkx as nx
import subprocess
import glob
import base64 as b64


# http://stackoverflow.com/questions/1285434/efficient-algorithm-for-string-concatenation-with-overlap
def concat(*args):
  result = ''
  for arg in args:
    result = _concat(result, arg)
  return result

def _concat(a, b):
  la = len(a)
  lb = len(b)
  for i in range(la):
    j = i
    k = 0
    while j < la and k < lb and a[j] == b[k]:
      j += 1
      k += 1
    if j == la:
      n = k
      break
  else:
    n = 0
  return a + b[n:]


# https://neil.fraser.name/news/2010/11/04/
def commonOverlapIndexOf(text1, text2):
  text1_length = len(text1)
  text2_length = len(text2)
  if text1_length == 0 or text2_length == 0:
    return 0
  if text1_length > text2_length:
    text1 = text1[-text2_length:]
  elif text1_length < text2_length:
    text2 = text2[:text1_length]
  if text1 == text2:
    return min(text1_length, text2_length)
  best = 0
  length = 1
  while True:
    pattern = text1[-length:]
    found = text2.find(pattern)
    if found == -1:
      return best
    length += found
    if text1[-length:] == text2[:length]:
      best = length
      length += 1


def visualize(g):
  dot_file = "./graph.dot"
  fname = "./graph.png"
  png_file = open(fname, 'w')
  nx.write_dot(g, dot_file)
  proc = subprocess.Popen(["dot", "-Tpng", dot_file], stdout=png_file)
  proc.wait()


kmers = {}
for kmer in glob.glob("kmers/*.bin"):
  with open(kmer, 'rb') as f:
    kmers[kmer] = f.read()

minimum_overlap_length = 15
g = nx.DiGraph()
for name1,kmer1 in kmers.iteritems():
  for name2,kmer2 in kmers.iteritems():
    if kmer1 == kmer2 or kmer1 in kmer2 or kmer2 in kmer1:
      continue
    best = commonOverlapIndexOf(kmer1, kmer2)
    if best > minimum_overlap_length:
      g.add_edge(name1, name2, overlap_idx=best)
    best = commonOverlapIndexOf(kmer2, kmer1)
    if best > minimum_overlap_length:
      g.add_edge(name2, name1, overlap_idx=best)

l = nx.line_graph(g)
#visualize(l)

# identify the head of the chain
head = False
for ok1, ok2 in l.nodes():
  for ik1, ik2 in l.nodes():
    if ok1 == ik2:
      head = False
      break
    head = True
  if head == True:
    head_link = (ok1, ok2)

# identify the tail of the chain
tail = False
for ok1, ok2 in l.nodes():
  for ik1, ik2 in l.nodes():
    if ok2 == ik1:
      tail = False
      break
    tail = True
  if tail == True:
    tail_link = (ok1, ok2)

# iterate over the remaining nodes in the least efficient way possible
#  to locate the next link in the chain and recreate the entire chain
chain = []
nodes = l.nodes()
chain.append(head_link)
nodes.remove(head_link)
while len(nodes) > 1:
  for n1,n2 in nodes:
    if n1 == chain[len(chain)-1][1]:
      chain.append( (n1,n2) )
      # we are mutating the what we iterate over from within our loop
      #  THIS CAN BE DANGEROUS
      nodes.remove( (n1,n2) )
      break

# the only difference is our tail_link which we add later
#print set(set(l.nodes())).difference(set(chain))
#print tail_link

# flatten the chain
o = []
for n1,n2 in chain:
  o.append(n1)
# append the tail to the chain
o.append(tail_link[0])
o.append(tail_link[1])

# the list 'o' now cotains the order of the *.bin files *BUT*
#  each file overlaps with other files, meaning we need to
#  identify the overlap and ensure it is only within our final
#  data once. Each *.bin takes the form of:
#    [10-15 header bytes] [200-300 body bytes] [10-15 trailer bytes]
#  Where header and trailer bytes overlap with other *.bin files.
#  Concatinating all the *.bin files together means header and trailer
#  byte overlaps are included twice.

everything = ''
for each in o:
  with open(each, 'rb') as f:
    everything = concat(everything, f.read())

with open("solution.zip", 'wb') as f:
  f.write(b64.b64decode(everything))
	import fileinput
	from StringIO import StringIO
	import networkx as nx
	import subprocess
	import glob
	import base64 as b64


	# http://stackoverflow.com/questions/1285434/efficient-algorithm-for-string-concatenation-with-overlap
	def concat(*args):
	result = ''
	for arg in args:
	result = _concat(result, arg)
	return result

	def _concat(a, b):
	la = len(a)
	lb = len(b)
	for i in range(la):
	j = i
	k = 0
	while j < la and k < lb and a[j] == b[k]:
	j += 1
	k += 1
	if j == la:
	n = k
	break
	else:
	n = 0
	return a + b[n:]



	# https://neil.fraser.name/news/2010/11/04/
	def commonOverlapIndexOf(text1, text2):
	text1_length = len(text1)
	text2_length = len(text2)
	if text1_length == 0 or text2_length == 0:
	return 0
	if text1_length > text2_length:
	text1 = text1[-text2_length:]
	elif text1_length < text2_length:
	text2 = text2[:text1_length]
	if text1 == text2:
	return min(text1_length, text2_length)
	best = 0
	length = 1
	while True:
	pattern = text1[-length:]
	found = text2.find(pattern)
	if found == -1:
	return best
	length += found
	if text1[-length:] == text2[:length]:
	best = length
	length += 1


	def visualize(g):
	dot_file = "./graph.dot"
	fname = "./graph.png"
	png_file = open(fname, 'w')
	nx.write_dot(g, dot_file)
	proc = subprocess.Popen(["dot", "-Tpng", dot_file], stdout=png_file)
	proc.wait()


	kmers = {}
	for kmer in glob.glob("kmers/*.bin"):
	with open(kmer, 'rb') as f:
	kmers[kmer] = f.read()

	minimum_overlap_length = 15
	g = nx.DiGraph()
	for name1,kmer1 in kmers.iteritems():
	for name2,kmer2 in kmers.iteritems():
	if kmer1 == kmer2 or kmer1 in kmer2 or kmer2 in kmer1:
	continue
	best = commonOverlapIndexOf(kmer1, kmer2)
	if best > minimum_overlap_length:
	g.add_edge(name1, name2, overlap_idx=best)
	best = commonOverlapIndexOf(kmer2, kmer1)
	if best > minimum_overlap_length:
	g.add_edge(name2, name1, overlap_idx=best)

	l = nx.line_graph(g)
	#visualize(l)

	# identify the head of the chain
	head = False
	for ok1, ok2 in l.nodes():
	for ik1, ik2 in l.nodes():
	if ok1 == ik2:
	head = False
	break
	head = True
	if head == True:
	head_link = (ok1, ok2)

	# identify the tail of the chain
	tail = False
	for ok1, ok2 in l.nodes():
	for ik1, ik2 in l.nodes():
	if ok2 == ik1:
	tail = False
	break
	tail = True
	if tail == True:
	tail_link = (ok1, ok2)

	# iterate over the remaining nodes in the least efficient way possible
	# to locate the next link in the chain and recreate the entire chain
	chain = []
	nodes = l.nodes()
	chain.append(head_link)
	nodes.remove(head_link)
	while len(nodes) > 1:
	for n1,n2 in nodes:
	if n1 == chain[len(chain)-1][1]:
	chain.append( (n1,n2) )
	# we are mutating the what we iterate over from within our loop
	# THIS CAN BE DANGEROUS
	nodes.remove( (n1,n2) )
	break

	# the only difference is our tail_link which we add later
	#print set(set(l.nodes())).difference(set(chain))
	#print tail_link

	# flatten the chain
	o = []
	for n1,n2 in chain:
	o.append(n1)
	# append the tail to the chain
	o.append(tail_link[0])
	o.append(tail_link[1])

	# the list 'o' now cotains the order of the .bin files BUT*
	# each file overlaps with other files, meaning we need to
	# identify the overlap and ensure it is only within our final
	# data once. Each *.bin takes the form of:
	# [10-15 header bytes] [200-300 body bytes] [10-15 trailer bytes]
	# Where header and trailer bytes overlap with other *.bin files.
	# Concatinating all the *.bin files together means header and trailer
	# byte overlaps are included twice.

	everything = ''
	for each in o:
	with open(each, 'rb') as f:
	everything = concat(everything, f.read())

	with open("solution.zip", 'wb') as f:
	f.write(b64.b64decode(everything))