Created
August 31, 2017 06:54
-
-
Save poolpOrg/d1c239ec8b543215b82c9e7b18b09ae1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# Copyright (c) 2017 Gilles Chehade <gilles@poolp.org>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
""" | |
this module provides a method to split a file into variable-size blocks. | |
""" | |
import mmap | |
import math | |
def endswith_n0bits(value, nbits):
    """Return True if the *nbits* least-significant bits of *value* are all zero.

    With nbits == 0 the mask is empty, so the result is always True.
    """
    # Build the low-bit mask in one step instead of shifting bit by bit.
    mask = (1 << nbits) - 1
    return (value & mask) == 0
class RollingSum(object):
    """rsync-style weak rolling checksum over a sliding byte window.

    l_word is the plain sum of the window's bytes; h_word weights each
    byte by its distance from the window's end, so both halves can be
    updated in O(1) as the window slides one byte to the right.
    """

    def __init__(self, data, block_size=65535):
        # Seed both halves of the checksum from the initial window.
        self.l_word = sum(data)
        self.h_word = sum((len(data) - idx) * byte
                          for idx, byte in enumerate(data))
        self.block_size = block_size
        self.buffer = data

    def push(self, byte):
        """Slide the window one byte right: drop the oldest, take *byte*."""
        outgoing = self.buffer[0]
        incoming = byte[-1]
        self.l_word += incoming - outgoing
        # NOTE(review): the outgoing byte is weighted by block_size, not by
        # len(self.buffer) -- exactly as in the original; the two only agree
        # when the window is block_size bytes long.
        self.h_word += self.l_word - outgoing * self.block_size
        self.buffer = self.buffer[1:] + byte

    def digest(self):
        """Combine the two 16-bit-ish halves into a single integer digest."""
        return self.l_word | (self.h_word << 16)
class Chunkifier(object):
    """Split a file into variable-size, content-defined chunks.

    A weak rolling checksum is computed over a sliding window; whenever
    the digest ends in enough zero bits, a chunk boundary is emitted.
    Because boundaries depend on content rather than fixed offsets, an
    insertion in the middle of a file only disturbs nearby chunks.
    """

    @staticmethod
    def _dynamic_chunks(mmf, block_size):
        """Yield content-defined chunks of *mmf* (an mmap or bytes-like buffer).

        The rolling window is block_size/2 bytes; a boundary is declared
        when the digest ends in int(math.log(block_size)) zero bits.
        Concatenating all yielded chunks reproduces *mmf* exactly.
        """
        # NOTE(review): math.log is the natural log; log2 may have been
        # intended -- changing it would move every chunk boundary, so it
        # is kept as-is.
        nzeros = int(math.log(block_size))
        rehash = True
        offset = pos = 0
        while True:
            if rehash and pos != len(mmf):
                # Start a fresh window of block_size/2 bytes and emit the
                # pending chunk up to (and including) that window.
                # NOTE(review): RollingSum keeps its default block_size
                # (65535) while the window here is block_size/2 bytes, so
                # push() drifts from a true rolling sum; the digest is
                # still deterministic, so chunking remains stable.
                chksum = RollingSum(mmf[pos:pos+int(block_size/2)])
                yield mmf[offset:min(pos+int(block_size/2), len(mmf))]
                offset = pos = min(pos+int(block_size/2), len(mmf))
                rehash = False
            if pos == len(mmf):
                break
            if endswith_n0bits(chksum.digest(), nzeros):
                # Boundary found: restart the window on the next pass.
                rehash = True
                continue
            chksum.push(mmf[pos:pos+1])
            pos += 1
        if offset != len(mmf):
            # Emit whatever is left past the last boundary.
            yield mmf[offset:len(mmf)]

    @staticmethod
    def dynamic(filename, block_size=65536):
        """Generate the content-defined chunks of *filename*.

        The file is mapped read-only and copy-on-write (MAP_PRIVATE).
        The mapping is now closed even when the consumer abandons the
        generator early or the chunker raises -- the original only
        closed it after a full, clean iteration.
        """
        with open(filename, mode='rb') as fpc:
            mmf = mmap.mmap(fpc.fileno(), 0, mmap.MAP_PRIVATE, mmap.PROT_READ)
            try:
                # Slices of an mmap are independent bytes objects, so the
                # yielded chunks stay valid after the mapping is closed.
                yield from Chunkifier._dynamic_chunks(mmf, block_size)
            finally:
                mmf.close()
def pretty_size(nbytes):
    """Format a byte count as a short human-readable string (B up to TB)."""
    units = ['B', 'KB', 'MB', 'GB']
    size = float(nbytes)
    idx = 0
    # Scale down by 1024 while a smaller unit still fits; anything past
    # GB falls through to TB without further scaling.
    while idx < len(units) and size >= 1024.0:
        size /= 1024.0
        idx += 1
    unit = units[idx] if idx < len(units) else 'TB'
    return "%3.1f%s" % (size, unit)
if __name__ == '__main__':
    import sys

    def benchmark(filename, block_size):
        """Chunkify *filename* once and report throughput and size stats."""
        import time
        print("= benchmark(filename='%s', block_size=%d):" % (filename, block_size))
        timer0 = time.time()
        nbytes = nchunks = 0
        minsz = maxsz = 0
        for _chunk in Chunkifier.dynamic(filename=filename, block_size=block_size):
            if not minsz or len(_chunk) < minsz:
                minsz = len(_chunk)
            if not maxsz or len(_chunk) > maxsz:
                maxsz = len(_chunk)
            nchunks += 1
            nbytes += len(_chunk)
        timing = time.time() - timer0
        # BUG FIX: the original reported (minsz + maxsz) / 2 as "avg";
        # report the true mean, guarding against an empty file.
        avgsz = nbytes / nchunks if nchunks else 0
        # Guard the rate against a sub-resolution elapsed time of 0.
        rate = nbytes / timing if timing else float(nbytes)
        print("\tchunkified %s into %d chunks (min=%s/avg=%s/max=%s) in %.04f (avg: %s/s)"
              % (pretty_size(nbytes), nchunks, pretty_size(minsz),
                 pretty_size(avgsz), pretty_size(maxsz), timing,
                 pretty_size(rate)))
        print()

    def integrity(filename, block_size):
        """Check the chunks reassemble to the file via SHA-256 comparison."""
        import hashlib
        print("= integrity(filename='%s', block_size=%d):" % (filename, block_size))
        hasher = hashlib.sha256()
        # BUG FIX: the original hashed sys.argv[1] here, ignoring the
        # *filename* parameter, and never closed the file handle.
        with open(filename, 'rb') as fp:
            hasher.update(fp.read())
        x = hasher.hexdigest()
        hasher = hashlib.sha256()
        for _chunk in Chunkifier.dynamic(filename=filename, block_size=block_size):
            hasher.update(_chunk)
        y = hasher.hexdigest()
        if x != y:
            print("\tchecksum mismatch:", x, "<>", y)
        else:
            print("\tchecksum ok:", x)

    filename = sys.argv[1]
    for pwr in range(12, 17):
        benchmark(filename, 2 ** pwr)
    for pwr in range(12, 17):
        integrity(filename, 2 ** pwr)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment