#
# Copyright (c) 2017 Gilles Chehade <gilles@poolp.org>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
"""
this module provides a method to split a file into variable-size blocks.
"""
import mmap
import math


def endswith_n0bits(value, nbits):
    # True when the lowest `nbits` bits of `value` are all zero.
    mask = (1 << nbits) - 1
    return (value & mask) == 0
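
# For instance (illustrative values):
#
#   endswith_n0bits(0b101000, 3)   # True: the low three bits are 000
#   endswith_n0bits(0b101100, 3)   # False: bit 2 is set
#
# A random digest satisfies the predicate with probability 1/2**nbits,
# which is what makes it usable as a probabilistic cut-point test below.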


class RollingSum(object):
    """Adler-32-style rolling checksum over a sliding window of bytes."""

    def __init__(self, data):
        # The rolling update in push() must rescale by the window length,
        # so remember how many bytes the window holds.
        self.window_size = len(data)
        self.l_word = 0
        self.h_word = 0
        for i in range(len(data)):
            self.l_word += data[i]
            self.h_word += (len(data) - i) * data[i]
        self.buffer = data

    def push(self, byte):
        # Slide the window one byte: drop the oldest byte on the left,
        # append the incoming byte on the right.
        p_byte, n_byte = self.buffer[0], byte[-1]
        self.l_word -= p_byte - n_byte
        self.h_word -= p_byte * self.window_size - self.l_word
        self.buffer = self.buffer[1:] + byte

    def digest(self):
        # Only the low bits matter for cut-point detection, so the two
        # words are packed without masking.
        return (self.h_word << 16) | self.l_word
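
# A minimal self-check sketch (hypothetical data, not executed on import):
# sliding the window byte by byte must agree with recomputing from scratch.
#
#   rs = RollingSum(b"abcd")
#   rs.push(b"e")
#   assert rs.digest() == RollingSum(b"bcde").digest()
#
# This is the property _dynamic_chunks relies on to test a cut point at
# every offset without rehashing the whole window each time.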


class Chunkifier(object):
    @staticmethod
    def _dynamic_chunks(mmf, block_size):
        # Number of trailing zero bits required at a cut point.  Note that
        # math.log is the natural logarithm: a 64KB block size yields 11
        # bits, i.e. a cut roughly every 2048 bytes past the fixed half-block.
        nzeros = int(math.log(block_size))
        half = block_size // 2
        rehash = True
        offset = pos = 0
        while True:
            if rehash and pos != len(mmf):
                # A chunk always runs half a block past the detected cut
                # point; the rolling window is primed over that trailing
                # half-block before byte-by-byte extension resumes.
                chksum = RollingSum(mmf[pos:pos + half])
                yield mmf[offset:min(pos + half, len(mmf))]
                offset = pos = min(pos + half, len(mmf))
                rehash = False
            if pos == len(mmf):
                break
            if endswith_n0bits(chksum.digest(), nzeros):
                rehash = True
                continue
            chksum.push(mmf[pos:pos + 1])
            pos += 1
        if offset != len(mmf):
            # Whatever is left past the last cut point becomes the final chunk.
            yield mmf[offset:len(mmf)]

    @staticmethod
    def dynamic(filename, block_size=65536):
        with open(filename, mode='rb') as fpc:
            # MAP_PRIVATE/PROT_READ are POSIX-only; on Windows the
            # access=mmap.ACCESS_READ form would be needed instead.
            mmf = mmap.mmap(fpc.fileno(), 0, mmap.MAP_PRIVATE, mmap.PROT_READ)
            try:
                for chunk in Chunkifier._dynamic_chunks(mmf, block_size):
                    yield chunk
            finally:
                # Close the mapping even if the consumer abandons the generator.
                mmf.close()
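
# Usage sketch (the path and the `process` callback are placeholders):
#
#   for chunk in Chunkifier.dynamic("/path/to/file", block_size=2**16):
#       process(chunk)
#
# Each chunk is an independent bytes object at least block_size/2 long
# (except possibly the last), and concatenating the chunks in order
# reproduces the file byte for byte.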


def pretty_size(nbytes):
    for unit in ['B', 'KB', 'MB', 'GB']:
        if nbytes < 1024.0:
            return "%3.1f%s" % (nbytes, unit)
        nbytes /= 1024.0
    return "%3.1f%s" % (nbytes, 'TB')


if __name__ == '__main__':
    import sys

    def benchmark(filename, block_size):
        import time
        print("= benchmark(filename='%s', block_size=%d):" % (filename, block_size))
        timer0 = time.time()
        nbytes = nchunks = 0
        minsz = maxsz = 0
        for _chunk in Chunkifier.dynamic(filename=filename, block_size=block_size):
            if not minsz or len(_chunk) < minsz:
                minsz = len(_chunk)
            if not maxsz or len(_chunk) > maxsz:
                maxsz = len(_chunk)
            nchunks += 1
            nbytes += len(_chunk)
        timing = time.time() - timer0
        print("\tchunkified %s into %d chunks (min=%s/avg=%s/max=%s) in %.04f (avg: %s/s)"
              % (pretty_size(nbytes), nchunks, pretty_size(minsz),
                 pretty_size(nbytes / nchunks), pretty_size(maxsz),
                 timing, pretty_size(nbytes / timing)))
        print()

    def integrity(filename, block_size):
        import hashlib
        print("= integrity(filename='%s', block_size=%d):" % (filename, block_size))
        # Hash the whole file in one pass, then hash the chunk stream: the
        # digests match only if chunking neither loses nor reorders bytes.
        hasher = hashlib.sha256()
        with open(filename, 'rb') as fp:
            hasher.update(fp.read())
        x = hasher.hexdigest()
        hasher = hashlib.sha256()
        for _chunk in Chunkifier.dynamic(filename=filename, block_size=block_size):
            hasher.update(_chunk)
        y = hasher.hexdigest()
        if x != y:
            print("\tchecksum mismatch:", x, "<>", y)
        else:
            print("\tchecksum ok:", x)

    filename = sys.argv[1]
    for pwr in range(12, 17):
        benchmark(filename, 2**pwr)
    for pwr in range(12, 17):
        integrity(filename, 2**pwr)