public
Created

Tool to automatically download and combine all the hexes from http://www.reddit.com/r/programming/comments/rxve9/let_us_have_hex/?limit=1000

  • Download Gist
join_hexes.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
#!/usr/bin/python
 
from binascii import unhexlify
from base64 import b64decode
import urllib
import re
import string
import sys
 
# Download the hexes from the reddit thread
data = urllib.urlopen("https://raw.github.com/gist/4a65eadfbdc81b5eeb43/3591eb5066e015f6a4dedf7ff3295eab9a3fee09/hexes")
data = data.read()
data = re.findall(r"\b[0-9a-f]{100}\b", data)
data = sorted(list(set(data)))
 
# Remove some bad hexes (posted by myself)
data.remove("41674c4455674c4463674c444d674c4441674c4441674c4463674c4459674c4449674c4459674c4459674c4445674e545173")
data.remove("41314e797773494341794e537773494341304e697773494341794a79786d494363774c437767494463324b53773749434178")
 
# Finds the indexes of the first and last hex (which is known to be in the thread)
try:
first = data.index("497a78705032357759326873634855675a43526c595341675044317a494852685a484a70636d39684c6e6c6f4b4434354369")
last = data.index("4270494630674f3349675a5831304948566c636d4e75614342764d4341374a79426c494363674f7941674944393950676f3d")
except:
print "Something went wrong with downloading the hexes from the reddit thread!"
sys.exit(-1)
 
# For comparing how much of an overlap there is between two strings
def like(a, b):
if a == b: return -1
 
for n in range(min(len(a), len(b)), 0, -2):
if a[-n:] == b[:n]:
return n
return 0
 
# These are used a lot, defined for convenience (and speed)
indexes = range(len(data))
likeness = [[like(data[a],data[b]) for b in indexes] for a in indexes]
 
# Create a tables of (index, likeness) for the best possible next and previous indexes
next = [max(indexes, key=lambda b: likeness[a][b]) for a in indexes]
prev = [max(indexes, key=lambda a: likeness[a][b]) for b in indexes]
 
# Ignore overlaps in with than 50 hex chars
for a in indexes:
if likeness[a][next[a]] < 50:
next[a] = None
if likeness[prev[a]][a] < 50:
prev[a] = None
 
# We know a bit more about last and first
next[last] = None
prev[first] = None
 
# Try to figure out if there is a "perfect" best match, that does not conflict with other matches
perfect_prev = [True for _ in indexes]
perfect_next = [True for _ in indexes]
 
for a in indexes:
if next[a] == None:
perfect_next[a] = False
elif prev[next[a]] != a:
perfect_next[a] = False
perfect_prev[next[a]] = False
 
if prev[a] == None:
perfect_prev[a] = False
elif next[prev[a]] != a:
perfect_prev[a] = False
perfect_next[prev[a]] = False
 
# If it was not a perfect match, remove it
for a in indexes:
if not (perfect_next[a] and perfect_prev[next[a]]):
next[a] = None
if not (perfect_prev[a] and perfect_next[prev[a]]):
prev[a] = None
 
# Now put all the indexes into chains
chained = [False for _ in indexes]
 
FORWARD = 1
BACKWARDS = 2
def get_chain(i, direction):
if i == None:
return []
chained[i] = True
if direction == FORWARD:
return [i] + get_chain(next[i], direction)
else:
return get_chain(prev[i], direction) + [i]
 
def get_full_chain(i):
return get_chain(i, BACKWARDS) + get_chain(next[i], FORWARD)
 
# Make sure that the known first chain comes first
chains = [get_full_chain(first)]
 
# And the the known last chain comes last
if not chained[last]:
chains.append(get_full_chain(last))
 
for i in indexes:
if not chained[i]:
chains.insert(1, get_full_chain(i))
 
combined_chains = []
 
# Loop over the chains and print the result
for chain in chains:
 
# Create the full hex value of the chain
current = data[chain[0]]
for (a, b) in zip(chain, chain[1:]):
shared = likeness[a][b]
current += data[b][shared:]
 
combined_chains.append(current)
 
# Combine the chains using a greedy algorithm
while len(combined_chains) > 1:
a,b = max([(a,b) for a in range(len(combined_chains)) for b in range(len(combined_chains))], key=lambda (a,b): like(combined_chains[a],combined_chains[b]))
if like(combined_chains[a], combined_chains[b]) < 8:
break
 
minval = min(a,b)
maxval = max(a,b)
 
adata = combined_chains[a]
bdata = combined_chains[b]
 
combined_chains.pop(maxval)
 
combined_chains[minval] = adata + bdata[like(adata, bdata):]
 
for chain in combined_chains:
# Unhex it
unhexed = unhexlify(chain)
 
# base64 only works, when the data is aligned correctly
# to a 4 byte boundary. It also needs the entire string
# to be of this length, though this can be fixed by
# appending "="
 
def fix(n):
try:
padded1 = "A" * n + unhexed
padded2 = padded1 + "=" * ((4 - len(padded1) % 4) % 4)
return b64decode(padded2)[n:]
except:
return "\xff" * len(unhexed)
 
# Try all possible alignments
possible = [fix(n) for n in range(4)]
 
# And find the one with the least number of printable characters
possible.sort(key = lambda s: sum(c not in string.printable for c in s))
 
print ""
print chain
print "Stream 1: " + possible[0][::2]
print "Stream 2: " + possible[0][1::2]

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.