#
# Copyright (c) 2017 Gilles Chehade <gilles@poolp.org>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
"""
this module provides a method to split a file into variable-size blocks.
"""
import mmap
import math


def endswith_n0bits(value, nbits):
    # True when the lowest `nbits` bits of `value` are all zero.
    mask = (1 << nbits) - 1
    return (value & mask) == 0
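
# For instance (illustrative values):
#
#   endswith_n0bits(0b101000, 3)   # True: the low three bits are 000
#   endswith_n0bits(0b101100, 3)   # False: bit 2 is set
#
# A random digest satisfies the predicate with probability 1/2**nbits,
# which is what makes it usable as a probabilistic cut-point test below.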


class RollingSum(object):
    """Adler-32-style rolling checksum over a sliding window of bytes."""

    def __init__(self, data):
        # The rolling update in push() must rescale by the window length,
        # so remember how many bytes the window holds.
        self.window_size = len(data)
        self.l_word = 0
        self.h_word = 0
        for i in range(len(data)):
            self.l_word += data[i]
            self.h_word += (len(data) - i) * data[i]
        self.buffer = data

    def push(self, byte):
        # Slide the window one byte: drop the oldest byte on the left,
        # append the incoming byte on the right.
        p_byte, n_byte = self.buffer[0], byte[-1]
        self.l_word -= p_byte - n_byte
        self.h_word -= p_byte * self.window_size - self.l_word
        self.buffer = self.buffer[1:] + byte

    def digest(self):
        # Only the low bits matter for cut-point detection, so the two
        # words are packed without masking.
        return (self.h_word << 16) | self.l_word
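
# A minimal self-check sketch (hypothetical data, not executed on import):
# sliding the window byte by byte must agree with recomputing from scratch.
#
#   rs = RollingSum(b"abcd")
#   rs.push(b"e")
#   assert rs.digest() == RollingSum(b"bcde").digest()
#
# This is the property _dynamic_chunks relies on to test a cut point at
# every offset without rehashing the whole window each time.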


class Chunkifier(object):
    @staticmethod
    def _dynamic_chunks(mmf, block_size):
        # Number of trailing zero bits required at a cut point.  Note that
        # math.log is the natural logarithm: a 64KB block size yields 11
        # bits, i.e. a cut roughly every 2048 bytes past the fixed half-block.
        nzeros = int(math.log(block_size))
        half = block_size // 2
        rehash = True
        offset = pos = 0
        while True:
            if rehash and pos != len(mmf):
                # A chunk always runs half a block past the detected cut
                # point; the rolling window is primed over that trailing
                # half-block before byte-by-byte extension resumes.
                chksum = RollingSum(mmf[pos:pos + half])
                yield mmf[offset:min(pos + half, len(mmf))]
                offset = pos = min(pos + half, len(mmf))
                rehash = False
            if pos == len(mmf):
                break
            if endswith_n0bits(chksum.digest(), nzeros):
                rehash = True
                continue
            chksum.push(mmf[pos:pos + 1])
            pos += 1
        if offset != len(mmf):
            # Whatever is left past the last cut point becomes the final chunk.
            yield mmf[offset:len(mmf)]

    @staticmethod
    def dynamic(filename, block_size=65536):
        with open(filename, mode='rb') as fpc:
            # MAP_PRIVATE/PROT_READ are POSIX-only; on Windows the
            # access=mmap.ACCESS_READ form would be needed instead.
            mmf = mmap.mmap(fpc.fileno(), 0, mmap.MAP_PRIVATE, mmap.PROT_READ)
            try:
                for chunk in Chunkifier._dynamic_chunks(mmf, block_size):
                    yield chunk
            finally:
                # Close the mapping even if the consumer abandons the generator.
                mmf.close()
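
# Usage sketch (the path and the `process` callback are placeholders):
#
#   for chunk in Chunkifier.dynamic("/path/to/file", block_size=2**16):
#       process(chunk)
#
# Each chunk is an independent bytes object at least block_size/2 long
# (except possibly the last), and concatenating the chunks in order
# reproduces the file byte for byte.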


def pretty_size(nbytes):
    for unit in ['B', 'KB', 'MB', 'GB']:
        if nbytes < 1024.0:
            return "%3.1f%s" % (nbytes, unit)
        nbytes /= 1024.0
    return "%3.1f%s" % (nbytes, 'TB')


if __name__ == '__main__':
    import sys

    def benchmark(filename, block_size):
        import time
        print("= benchmark(filename='%s', block_size=%d):" % (filename, block_size))
        timer0 = time.time()
        nbytes = nchunks = 0
        minsz = maxsz = 0
        for _chunk in Chunkifier.dynamic(filename=filename, block_size=block_size):
            if not minsz or len(_chunk) < minsz:
                minsz = len(_chunk)
            if not maxsz or len(_chunk) > maxsz:
                maxsz = len(_chunk)
            nchunks += 1
            nbytes += len(_chunk)
        timing = time.time() - timer0
        print("\tchunkified %s into %d chunks (min=%s/avg=%s/max=%s) in %.04f (avg: %s/s)"
              % (pretty_size(nbytes), nchunks, pretty_size(minsz),
                 pretty_size(nbytes / nchunks), pretty_size(maxsz),
                 timing, pretty_size(nbytes / timing)))
        print()

    def integrity(filename, block_size):
        import hashlib
        print("= integrity(filename='%s', block_size=%d):" % (filename, block_size))
        # Hash the whole file in one pass, then hash the chunk stream: the
        # digests match only if chunking neither loses nor reorders bytes.
        hasher = hashlib.sha256()
        with open(filename, 'rb') as fp:
            hasher.update(fp.read())
        x = hasher.hexdigest()
        hasher = hashlib.sha256()
        for _chunk in Chunkifier.dynamic(filename=filename, block_size=block_size):
            hasher.update(_chunk)
        y = hasher.hexdigest()
        if x != y:
            print("\tchecksum mismatch:", x, "<>", y)
        else:
            print("\tchecksum ok:", x)

    filename = sys.argv[1]
    for pwr in range(12, 17):
        benchmark(filename, 2**pwr)
    for pwr in range(12, 17):
        integrity(filename, 2**pwr)