Skip to content

Instantly share code, notes, and snippets.

@seece
Created March 9, 2015 20:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save seece/fbeb46b1d068e5dbd48d to your computer and use it in GitHub Desktop.
Save seece/fbeb46b1d068e5dbd48d to your computer and use it in GitHub Desktop.
File byte distribution comparison
"""
Compare the byte-value distribution of the file given as the first argument with that of every file in victims/.
"""
from distr import calc_distribution
import os
import pickle
import sys
def dot(d1, d2):
    """Return the dot product of the sequences d1 and d2.

    Elements are paired positionally with zip, so any surplus elements of
    the longer sequence are ignored.
    """
    products = (left * right for left, right in zip(d1, d2))
    return sum(products)
# Cache of byte distributions, keyed by file name inside the victims directory.
distr = {}
datafile = "data.filee"

if os.path.isfile(datafile):
    # NOTE: unpickling can execute arbitrary code; only load a cache file that
    # this script wrote itself, never one obtained from an untrusted source.
    with open(datafile, "rb") as f:
        distr = pickle.load(f)
    print("loaded distr from " + datafile)

victimpath = "victims"
for path in os.listdir(victimpath):
    if path in distr:
        # Already present in the cache; do not rescan the file.
        continue
    fullpath = os.path.join(victimpath, path)
    if not os.path.isfile(fullpath):
        # Sub-directories and other non-regular entries cannot be read as a file.
        continue
    print(path)
    distr[path] = calc_distribution(fullpath)

# Persist the (possibly extended) cache before doing the comparison.
with open(datafile, "wb") as f:
    pickle.dump(distr, f)

if len(sys.argv) < 2:
    # Exit with a readable message instead of an IndexError traceback.
    sys.exit("usage: " + sys.argv[0] + " FILE")

mystery = calc_distribution(sys.argv[1])

# Score every known file by the dot product of its distribution with the
# mystery file's distribution, then list the 15 highest scores.
scores = [(name, dot(d, mystery)) for name, d in distr.items()]
scores.sort(key=lambda entry: entry[1], reverse=True)
for name, score in scores[:15]:
    print("{:.<30}{:0.2f}".format(name, score))
import sys
def calc_distribution(path, chunk_size=65536):
    """Return the relative frequency of each byte value in the file at path.

    The result is a list of 256 floats; element i is the fraction of the
    file's bytes whose value is i. Every byte of the file is counted,
    including the first one. For an empty file all 256 entries are 0.0.

    path: name of the file to read; it is opened in binary mode.
    chunk_size: number of bytes requested from the file per read call.
    """
    hits = [0] * 256
    with open(path, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                # An empty read means end of file.
                break
            # Iterating a bytearray yields integer byte values on both
            # Python 2 and Python 3, so no ord() call is needed.
            for value in bytearray(chunk):
                hits[value] += 1
    total = sum(hits)
    if total == 0:
        # Nothing was read; avoid dividing by zero.
        return [0.0] * 256
    return [x / float(total) for x in hits]
if __name__ == "__main__":
    # Run as a script: print the byte distribution of the file named by the
    # first command-line argument.
    print(str(calc_distribution(sys.argv[1])))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment