Freaken/join_hexes.py

## join_hexes.py
#!/usr/bin/python

from binascii import unhexlify
from base64 import b64decode
import urllib
import re
import string
import sys

# Download the hexes from the reddit thread
data = urllib.urlopen("https://raw.github.com/gist/4a65eadfbdc81b5eeb43/3591eb5066e015f6a4dedf7ff3295eab9a3fee09/hexes")
data = data.read()
data = re.findall(r"\b[0-9a-f]{100}\b", data)
data = sorted(list(set(data)))

# Remove some bad hexes (posted by myself)
data.remove("41674c4455674c4463674c444d674c4441674c4441674c4463674c4459674c4449674c4459674c4459674c4445674e545173")
data.remove("41314e797773494341794e537773494341304e697773494341794a79786d494363774c437767494463324b53773749434178")

# Finds the indexes of the first and last hex (which is known to be in the thread)
try:
    first = data.index("497a78705032357759326873634855675a43526c595341675044317a494852685a484a70636d39684c6e6c6f4b4434354369")
    last  = data.index("4270494630674f3349675a5831304948566c636d4e75614342764d4341374a79426c494363674f7941674944393950676f3d")
except:
    print "Something went wrong with downloading the hexes from the reddit thread!"
    sys.exit(-1)

# For comparing how much of an overlap there is between two strings
def like(a, b):
    if a == b: return -1

    for n in range(min(len(a), len(b)), 0, -2):
        if a[-n:] == b[:n]:
            return n
    return 0

# These are used a lot, defined for convenience (and speed)
indexes  = range(len(data))
likeness = [[like(data[a],data[b]) for b in indexes] for a in indexes]

# Create a tables of (index, likeness) for the best possible next and previous indexes
next = [max(indexes, key=lambda b: likeness[a][b]) for a in indexes]
prev = [max(indexes, key=lambda a: likeness[a][b]) for b in indexes]

# Ignore overlaps in with than 50 hex chars
for a in indexes:
    if likeness[a][next[a]] < 50:
        next[a] = None
    if likeness[prev[a]][a] < 50:
        prev[a] = None

# We know a bit more about last and first
next[last] = None
prev[first] = None

# Try to figure out if there is a "perfect" best match, that does not conflict with other matches
perfect_prev = [True for _ in indexes]
perfect_next = [True for _ in indexes]

for a in indexes:
    if next[a] == None:
        perfect_next[a] = False
    elif prev[next[a]] != a:
        perfect_next[a] = False
        perfect_prev[next[a]] = False

    if prev[a] == None:
        perfect_prev[a] = False
    elif next[prev[a]] != a:
        perfect_prev[a] = False
        perfect_next[prev[a]] = False

# If it was not a perfect match, remove it
for a in indexes:
    if not (perfect_next[a] and perfect_prev[next[a]]):
        next[a] = None
    if not (perfect_prev[a] and perfect_next[prev[a]]):
        prev[a] = None

# Now put all the indexes into chains
chained = [False for _ in indexes]

FORWARD = 1
BACKWARDS = 2
def get_chain(i, direction):
    if i == None:
        return []

    chained[i] = True
    if direction == FORWARD:
        return [i] + get_chain(next[i], direction)
    else:
        return get_chain(prev[i], direction) + [i]

def get_full_chain(i):
    return get_chain(i, BACKWARDS) + get_chain(next[i], FORWARD)

# Make sure that the known first chain comes first
chains = [get_full_chain(first)]

# And the the known last chain comes last
if not chained[last]:
    chains.append(get_full_chain(last))

for i in indexes:
    if not chained[i]:
        chains.insert(1, get_full_chain(i))

combined_chains = []

# Loop over the chains and print the result
for chain in chains:

    # Create the full hex value of the chain
    current = data[chain[0]]
    for (a, b) in zip(chain, chain[1:]):
        shared = likeness[a][b]
        current += data[b][shared:]

    combined_chains.append(current)

# Combine the chains using a greedy algorithm
while len(combined_chains) > 1:
    a,b = max([(a,b) for a in range(len(combined_chains)) for b in range(len(combined_chains))], key=lambda (a,b): like(combined_chains[a],combined_chains[b]))
    if like(combined_chains[a], combined_chains[b]) < 8:
        break

    minval = min(a,b)
    maxval = max(a,b)

    adata = combined_chains[a]
    bdata = combined_chains[b]

    combined_chains.pop(maxval)

    combined_chains[minval] = adata + bdata[like(adata, bdata):]

for chain in combined_chains:
    # Unhex it
    unhexed = unhexlify(chain)

    # base64 only works, when the data is aligned correctly
    # to a 4 byte boundary. It also needs the entire string
    # to be of this length, though this can be fixed by
    # appending "="

    def fix(n):
        try:
            padded1 = "A" * n + unhexed
            padded2 = padded1 + "=" * ((4 - len(padded1) % 4) % 4)
            return b64decode(padded2)[n:]
        except:
            return "\xff" * len(unhexed)

    # Try all possible alignments
    possible = [fix(n) for n in range(4)]

    # And find the one with the least number of printable characters
    possible.sort(key = lambda s: sum(c not in string.printable for c in s))

    print ""
    print chain
    print "Stream 1: " + possible[0][::2]
    print "Stream 2: " + possible[0][1::2]
	#!/usr/bin/python

	from binascii import unhexlify
	from base64 import b64decode
	import urllib
	import re
	import string
	import sys

	# Download the hexes from the reddit thread
	data = urllib.urlopen("https://raw.github.com/gist/4a65eadfbdc81b5eeb43/3591eb5066e015f6a4dedf7ff3295eab9a3fee09/hexes")
	data = data.read()
	data = re.findall(r"\b[0-9a-f]{100}\b", data)
	data = sorted(list(set(data)))

	# Remove some bad hexes (posted by myself)
	data.remove("41674c4455674c4463674c444d674c4441674c4441674c4463674c4459674c4449674c4459674c4459674c4445674e545173")
	data.remove("41314e797773494341794e537773494341304e697773494341794a79786d494363774c437767494463324b53773749434178")

	# Finds the indexes of the first and last hex (which is known to be in the thread)
	try:
	first = data.index("497a78705032357759326873634855675a43526c595341675044317a494852685a484a70636d39684c6e6c6f4b4434354369")
	last = data.index("4270494630674f3349675a5831304948566c636d4e75614342764d4341374a79426c494363674f7941674944393950676f3d")
	except:
	print "Something went wrong with downloading the hexes from the reddit thread!"
	sys.exit(-1)

	# For comparing how much of an overlap there is between two strings
	def like(a, b):
	if a == b: return -1

	for n in range(min(len(a), len(b)), 0, -2):
	if a[-n:] == b[:n]:
	return n
	return 0

	# These are used a lot, defined for convenience (and speed)
	indexes = range(len(data))
	likeness = [[like(data[a],data[b]) for b in indexes] for a in indexes]

	# Create a tables of (index, likeness) for the best possible next and previous indexes
	next = [max(indexes, key=lambda b: likeness[a][b]) for a in indexes]
	prev = [max(indexes, key=lambda a: likeness[a][b]) for b in indexes]

	# Ignore overlaps in with than 50 hex chars
	for a in indexes:
	if likeness[a][next[a]] < 50:
	next[a] = None
	if likeness[prev[a]][a] < 50:
	prev[a] = None

	# We know a bit more about last and first
	next[last] = None
	prev[first] = None

	# Try to figure out if there is a "perfect" best match, that does not conflict with other matches
	perfect_prev = [True for _ in indexes]
	perfect_next = [True for _ in indexes]

	for a in indexes:
	if next[a] == None:
	perfect_next[a] = False
	elif prev[next[a]] != a:
	perfect_next[a] = False
	perfect_prev[next[a]] = False

	if prev[a] == None:
	perfect_prev[a] = False
	elif next[prev[a]] != a:
	perfect_prev[a] = False
	perfect_next[prev[a]] = False

	# If it was not a perfect match, remove it
	for a in indexes:
	if not (perfect_next[a] and perfect_prev[next[a]]):
	next[a] = None
	if not (perfect_prev[a] and perfect_next[prev[a]]):
	prev[a] = None

	# Now put all the indexes into chains
	chained = [False for _ in indexes]

	FORWARD = 1
	BACKWARDS = 2
	def get_chain(i, direction):
	if i == None:
	return []

	chained[i] = True
	if direction == FORWARD:
	return [i] + get_chain(next[i], direction)
	else:
	return get_chain(prev[i], direction) + [i]

	def get_full_chain(i):
	return get_chain(i, BACKWARDS) + get_chain(next[i], FORWARD)

	# Make sure that the known first chain comes first
	chains = [get_full_chain(first)]

	# And the the known last chain comes last
	if not chained[last]:
	chains.append(get_full_chain(last))

	for i in indexes:
	if not chained[i]:
	chains.insert(1, get_full_chain(i))

	combined_chains = []

	# Loop over the chains and print the result
	for chain in chains:

	# Create the full hex value of the chain
	current = data[chain[0]]
	for (a, b) in zip(chain, chain[1:]):
	shared = likeness[a][b]
	current += data[b][shared:]

	combined_chains.append(current)

	# Combine the chains using a greedy algorithm
	while len(combined_chains) > 1:
	a,b = max([(a,b) for a in range(len(combined_chains)) for b in range(len(combined_chains))], key=lambda (a,b): like(combined_chains[a],combined_chains[b]))
	if like(combined_chains[a], combined_chains[b]) < 8:
	break

	minval = min(a,b)
	maxval = max(a,b)

	adata = combined_chains[a]
	bdata = combined_chains[b]

	combined_chains.pop(maxval)

	combined_chains[minval] = adata + bdata[like(adata, bdata):]

	for chain in combined_chains:
	# Unhex it
	unhexed = unhexlify(chain)

	# base64 only works, when the data is aligned correctly
	# to a 4 byte boundary. It also needs the entire string
	# to be of this length, though this can be fixed by
	# appending "="

	def fix(n):
	try:
	padded1 = "A" * n + unhexed
	padded2 = padded1 + "=" * ((4 - len(padded1) % 4) % 4)
	return b64decode(padded2)[n:]
	except:
	return "\xff" * len(unhexed)

	# Try all possible alignments
	possible = [fix(n) for n in range(4)]

	# And find the one with the least number of printable characters
	possible.sort(key = lambda s: sum(c not in string.printable for c in s))

	print ""
	print chain
	print "Stream 1: " + possible[0][::2]
	print "Stream 2: " + possible[0][1::2]