Skip to content

Instantly share code, notes, and snippets.

@Freaken
Created April 7, 2012 19:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Freaken/2331628 to your computer and use it in GitHub Desktop.
Save Freaken/2331628 to your computer and use it in GitHub Desktop.
Tool to automatically download and combine all the hexes from http://www.reddit.com/r/programming/comments/rxve9/let_us_have_hex/?limit=1000
#!/usr/bin/python
from binascii import unhexlify
from base64 import b64decode
import urllib
import re
import string
import sys
# Download the hexes from the reddit thread
data = urllib.urlopen("https://raw.github.com/gist/4a65eadfbdc81b5eeb43/3591eb5066e015f6a4dedf7ff3295eab9a3fee09/hexes")
data = data.read()
data = re.findall(r"\b[0-9a-f]{100}\b", data)
data = sorted(list(set(data)))
# Remove some bad hexes (posted by myself)
data.remove("41674c4455674c4463674c444d674c4441674c4441674c4463674c4459674c4449674c4459674c4459674c4445674e545173")
data.remove("41314e797773494341794e537773494341304e697773494341794a79786d494363774c437767494463324b53773749434178")
# Finds the indexes of the first and last hex (which is known to be in the thread)
try:
first = data.index("497a78705032357759326873634855675a43526c595341675044317a494852685a484a70636d39684c6e6c6f4b4434354369")
last = data.index("4270494630674f3349675a5831304948566c636d4e75614342764d4341374a79426c494363674f7941674944393950676f3d")
except:
print "Something went wrong with downloading the hexes from the reddit thread!"
sys.exit(-1)
# For comparing how much of an overlap there is between two strings
def like(a, b):
if a == b: return -1
for n in range(min(len(a), len(b)), 0, -2):
if a[-n:] == b[:n]:
return n
return 0
# These are used a lot, defined for convenience (and speed)
indexes = range(len(data))
likeness = [[like(data[a],data[b]) for b in indexes] for a in indexes]
# Create a tables of (index, likeness) for the best possible next and previous indexes
next = [max(indexes, key=lambda b: likeness[a][b]) for a in indexes]
prev = [max(indexes, key=lambda a: likeness[a][b]) for b in indexes]
# Ignore overlaps in with than 50 hex chars
for a in indexes:
if likeness[a][next[a]] < 50:
next[a] = None
if likeness[prev[a]][a] < 50:
prev[a] = None
# We know a bit more about last and first
next[last] = None
prev[first] = None
# Try to figure out if there is a "perfect" best match, that does not conflict with other matches
perfect_prev = [True for _ in indexes]
perfect_next = [True for _ in indexes]
for a in indexes:
if next[a] == None:
perfect_next[a] = False
elif prev[next[a]] != a:
perfect_next[a] = False
perfect_prev[next[a]] = False
if prev[a] == None:
perfect_prev[a] = False
elif next[prev[a]] != a:
perfect_prev[a] = False
perfect_next[prev[a]] = False
# If it was not a perfect match, remove it
for a in indexes:
if not (perfect_next[a] and perfect_prev[next[a]]):
next[a] = None
if not (perfect_prev[a] and perfect_next[prev[a]]):
prev[a] = None
# Now put all the indexes into chains
chained = [False for _ in indexes]
FORWARD = 1
BACKWARDS = 2
def get_chain(i, direction):
if i == None:
return []
chained[i] = True
if direction == FORWARD:
return [i] + get_chain(next[i], direction)
else:
return get_chain(prev[i], direction) + [i]
def get_full_chain(i):
return get_chain(i, BACKWARDS) + get_chain(next[i], FORWARD)
# Make sure that the known first chain comes first
chains = [get_full_chain(first)]
# And the the known last chain comes last
if not chained[last]:
chains.append(get_full_chain(last))
for i in indexes:
if not chained[i]:
chains.insert(1, get_full_chain(i))
combined_chains = []
# Loop over the chains and print the result
for chain in chains:
# Create the full hex value of the chain
current = data[chain[0]]
for (a, b) in zip(chain, chain[1:]):
shared = likeness[a][b]
current += data[b][shared:]
combined_chains.append(current)
# Combine the chains using a greedy algorithm
while len(combined_chains) > 1:
a,b = max([(a,b) for a in range(len(combined_chains)) for b in range(len(combined_chains))], key=lambda (a,b): like(combined_chains[a],combined_chains[b]))
if like(combined_chains[a], combined_chains[b]) < 8:
break
minval = min(a,b)
maxval = max(a,b)
adata = combined_chains[a]
bdata = combined_chains[b]
combined_chains.pop(maxval)
combined_chains[minval] = adata + bdata[like(adata, bdata):]
for chain in combined_chains:
# Unhex it
unhexed = unhexlify(chain)
# base64 only works, when the data is aligned correctly
# to a 4 byte boundary. It also needs the entire string
# to be of this length, though this can be fixed by
# appending "="
def fix(n):
try:
padded1 = "A" * n + unhexed
padded2 = padded1 + "=" * ((4 - len(padded1) % 4) % 4)
return b64decode(padded2)[n:]
except:
return "\xff" * len(unhexed)
# Try all possible alignments
possible = [fix(n) for n in range(4)]
# And find the one with the least number of printable characters
possible.sort(key = lambda s: sum(c not in string.printable for c in s))
print ""
print chain
print "Stream 1: " + possible[0][::2]
print "Stream 2: " + possible[0][1::2]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment