Created
April 7, 2012 19:46
-
-
Save Freaken/2331628 to your computer and use it in GitHub Desktop.
Tool to automatically download and combine all the hexes from http://www.reddit.com/r/programming/comments/rxve9/let_us_have_hex/?limit=1000
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
from binascii import unhexlify | |
from base64 import b64decode | |
import urllib | |
import re | |
import string | |
import sys | |
# Fetch the gist mirror of the reddit thread and extract every distinct
# 100-character lowercase hex string, in sorted order.
page = urllib.urlopen("https://raw.github.com/gist/4a65eadfbdc81b5eeb43/3591eb5066e015f6a4dedf7ff3295eab9a3fee09/hexes").read()
data = sorted(set(re.findall(r"\b[0-9a-f]{100}\b", page)))
# Two hexes are known to be bad (posted by the script's author); drop them.
data.remove("41674c4455674c4463674c444d674c4441674c4441674c4463674c4459674c4449674c4459674c4459674c4445674e545173")
data.remove("41314e797773494341794e537773494341304e697773494341794a79786d494363774c437767494463324b53773749434178")
# Finds the indexes of the first and last hex (which is known to be in the thread) | |
try: | |
first = data.index("497a78705032357759326873634855675a43526c595341675044317a494852685a484a70636d39684c6e6c6f4b4434354369") | |
last = data.index("4270494630674f3349675a5831304948566c636d4e75614342764d4341374a79426c494363674f7941674944393950676f3d") | |
except: | |
print "Something went wrong with downloading the hexes from the reddit thread!" | |
sys.exit(-1) | |
# Measure how strongly the tail of one hex overlaps the head of another.
def like(a, b):
    """Return the largest n (stepping down by 2 to keep byte alignment)
    such that the last n chars of ``a`` equal the first n chars of ``b``.
    Returns -1 when the strings are identical and 0 when nothing overlaps."""
    if a == b:
        return -1
    limit = min(len(a), len(b))
    for size in range(limit, 0, -2):
        if b.startswith(a[-size:]):
            return size
    return 0
# Index range and pairwise overlap table, computed once up front because
# they are consulted constantly below.
indexes = range(len(data))
likeness = [[like(x, y) for y in data] for x in data]
# For each hex: the index of the best candidate to follow it (next) and
# to precede it (prev), judged purely by overlap size.
next = [max(indexes, key=lambda j: likeness[i][j]) for i in indexes]
prev = [max(indexes, key=lambda i: likeness[i][j]) for j in indexes]
# An overlap below 50 hex characters is treated as noise, not a real
# continuation, so those links are severed.
for i in indexes:
    if likeness[i][next[i]] < 50:
        next[i] = None
    if likeness[prev[i]][i] < 50:
        prev[i] = None
# The known endpoints terminate their chains by definition.
next[last] = None
prev[first] = None
# A link is "perfect" when both sides agree: i's best successor also
# considers i its best predecessor, and vice versa.
perfect_prev = [True] * len(data)
perfect_next = [True] * len(data)
for i in indexes:
    succ = next[i]
    if succ is None:
        perfect_next[i] = False
    elif prev[succ] != i:
        # A disagreement taints both ends of the link.
        perfect_next[i] = False
        perfect_prev[succ] = False
    pred = prev[i]
    if pred is None:
        perfect_prev[i] = False
    elif next[pred] != i:
        perfect_prev[i] = False
        perfect_next[pred] = False
# Sever every link that is not perfect on both ends.  The short-circuit
# matters: whenever next[i] / prev[i] is None, the matching perfect_*
# flag is already False, so the None is never used as an index.
for i in indexes:
    if not perfect_next[i] or not perfect_prev[next[i]]:
        next[i] = None
    if not perfect_prev[i] or not perfect_next[prev[i]]:
        prev[i] = None
# Bookkeeping for assembling chains: which indexes have been placed into
# a chain already, plus the two directions a chain can be walked in.
chained = [False] * len(data)
FORWARD = 1
BACKWARDS = 2
def get_chain(i, direction):
    # Walk the linked hexes starting at i (inclusive), marking each one
    # as chained.  FORWARD follows next[] and returns indexes in walk
    # order; any other direction follows prev[] and returns the chain
    # ending at i.  A None start yields an empty chain.
    links = next if direction == FORWARD else prev
    visited = []
    while i is not None:
        chained[i] = True
        visited.append(i)
        i = links[i]
    if direction == FORWARD:
        return visited
    return visited[::-1]
def get_full_chain(i):
    # The maximal chain through i: everything behind it (the BACKWARDS
    # walk already includes i itself), then everything from its successor
    # onwards.
    behind = get_chain(i, BACKWARDS)
    ahead = get_chain(next[i], FORWARD)
    return behind + ahead
# Collect every chain, pinning the chain that holds the known first hex
# to the front and the one holding the known last hex to the back;
# all remaining chains are inserted in between.
chains = [get_full_chain(first)]
if not chained[last]:
    chains.append(get_full_chain(last))
for i in indexes:
    if not chained[i]:
        chains.insert(1, get_full_chain(i))
combined_chains = []
# Turn each chain of indexes into one long hex string, trimming the
# already-shared prefix off every piece after the first.
for chain in chains:
    assembled = data[chain[0]]
    for left, right in zip(chain, chain[1:]):
        overlap = likeness[left][right]
        assembled += data[right][overlap:]
    combined_chains.append(assembled)
# Greedily merge the assembled chains: repeatedly pick the pair with the
# largest tail/head overlap and splice them, until no remaining pair
# overlaps by at least 8 hex characters.
while len(combined_chains) > 1:
    idxs = range(len(combined_chains))
    # The lambda takes the pair as a single argument: the old
    # "lambda (a, b):" tuple-parameter form is Python-2-only syntax.
    a, b = max([(x, y) for x in idxs for y in idxs],
               key=lambda pair: like(combined_chains[pair[0]], combined_chains[pair[1]]))
    # Hoisted: the overlap of the winning pair was previously recomputed
    # twice more after the max() scan.
    overlap = like(combined_chains[a], combined_chains[b])
    if overlap < 8:
        break
    adata = combined_chains[a]
    bdata = combined_chains[b]
    # Pop the higher index first so the lower one stays valid.
    combined_chains.pop(max(a, b))
    combined_chains[min(a, b)] = adata + bdata[overlap:]
for chain in combined_chains: | |
# Unhex it | |
unhexed = unhexlify(chain) | |
# base64 only works, when the data is aligned correctly | |
# to a 4 byte boundary. It also needs the entire string | |
# to be of this length, though this can be fixed by | |
# appending "=" | |
def fix(n): | |
try: | |
padded1 = "A" * n + unhexed | |
padded2 = padded1 + "=" * ((4 - len(padded1) % 4) % 4) | |
return b64decode(padded2)[n:] | |
except: | |
return "\xff" * len(unhexed) | |
# Try all possible alignments | |
possible = [fix(n) for n in range(4)] | |
# And find the one with the least number of printable characters | |
possible.sort(key = lambda s: sum(c not in string.printable for c in s)) | |
print "" | |
print chain | |
print "Stream 1: " + possible[0][::2] | |
print "Stream 2: " + possible[0][1::2] |
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.