Skip to content

Instantly share code, notes, and snippets.

@mcowger
Created April 17, 2014 20:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mcowger/11008369 to your computer and use it in GitHub Desktop.
Save mcowger/11008369 to your computer and use it in GitHub Desktop.
__author__ = 'mcowger'
import hashlib
import sys
from progress.bar import Bar
import os
import timeit
import random
import logging
# Size in bytes of one block; file sizes are divided by this to get block
# counts, and it is also the seek stride and read length when hashing.
BLOCK_SIZE = 4096
# Emit everything from DEBUG level upwards.
logging.basicConfig(level=logging.DEBUG)
# getLogger() with no name returns the root logger.
logger = logging.getLogger()
# Hard-coded path of the file that shamethod() opens and samples.
filename = "/Volumes/DataDisk/mcowger/Downloads/webexmc_intel.dmg"
def create_block_eval_map(filename, sample_size=100):
    """Pick a random, sorted sample of block indices from a file.

    The file is treated as a sequence of whole ``BLOCK_SIZE``-byte blocks
    (a trailing partial block is ignored) and ``sample_size`` percent of
    those block indices are chosen without replacement.

    Args:
        filename: Path of the file whose size determines the block count.
        sample_size: Percentage of the blocks to select. Defaults to 100,
            which selects every block.

    Returns:
        An ascending list of the selected block indices.

    Raises:
        OSError: If the size of ``filename`` cannot be read.
        ValueError: If the requested sample is negative or larger than the
            number of blocks (raised by ``random.sample``).
    """
    filesize_bytes = os.path.getsize(filename)
    # Floor division keeps the count an int on Python 3, where "/" would
    # produce a float that range() and random.sample() both reject.
    block_count = filesize_bytes // BLOCK_SIZE
    # int() additionally tolerates a fractional sample_size such as 12.5.
    blocks_wanted = int(block_count * sample_size // 100)
    blocks_to_analyze = sorted(random.sample(range(block_count), blocks_wanted))
    logger.info(
        "Produced block map of size %s from possible size of %s",
        len(blocks_to_analyze),
        block_count,
    )
    return blocks_to_analyze
def shamethod(percent_of_file):
    """SHA-256 hash a random sample of blocks from the module-level ``filename``.

    The file is treated as a sequence of whole ``BLOCK_SIZE``-byte blocks.
    ``percent_of_file`` percent of them are chosen at random without
    replacement, read in ascending offset order and hashed one by one while
    a progress bar is advanced.

    Args:
        percent_of_file: Percentage of the file's blocks to hash.

    Returns:
        A dict mapping each SHA-256 hex digest to the number of sampled
        blocks that produced it.

    Raises:
        OSError: If the file cannot be sized or opened.
        ValueError: If the requested sample is negative or larger than the
            number of blocks (raised by ``random.sample``).
    """
    # Measure the file here: no ``filesize_bytes`` exists in this scope.
    # Floor division keeps the block count an int so range() accepts it.
    blocks_in_file = os.path.getsize(filename) // BLOCK_SIZE
    blockcount_to_process = int(blocks_in_file * (percent_of_file / 100.0))
    # Sorting the sampled indices keeps the seeks moving forward in the file.
    blockmap = sorted(random.sample(range(blocks_in_file), blockcount_to_process))
    sha256_hashes = {}
    bar = Bar('Processing File (SHA 256)', suffix='ETA: %(eta_td)s', max=len(blockmap))
    try:
        # The context manager closes the handle even if a read fails.
        with open(filename, 'rb') as fh:
            for blocklocation in blockmap:
                fh.seek(blocklocation * BLOCK_SIZE)
                chunk = fh.read(BLOCK_SIZE)
                if chunk:
                    digest = hashlib.sha256(chunk).hexdigest()
                    sha256_hashes[digest] = sha256_hashes.get(digest, 0) + 1
                bar.next()
    finally:
        bar.finish()
    return sha256_hashes
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment