
@datagrok /dedupe.py
Last active Apr 16, 2016

Experiment to see how much a blockwise deduplicating filesystem would save.
#!/usr/bin/python3
"""
"If I had a filesystem that did blockwise deduplication with variable
block sizes (using Rabin-Karp rolling hash), how much space would I
save?"
Recursively reads all files under the current directory, printing
filenames as it goes, along with occasional cumulative statistics.
Currently very naive; the database of chunk hashes is held in memory as
a Python set(), seeded from and written back to a hard-coded sqlite file.
To do:
- explore tunable parameters. try to find rationale behind values,
eliminate if possible.
- does any variety of data cause breakdown in our rolling hash
algorithm? all-ascii bytes? all-zero bytes?
- chunks smaller than the hash digest will certainly waste space.
enforce a minimum chunk size? define a "literal data" flag?
- export hash database to file
- report space/bandwidth saved if run against a server using
provided hash database
- cronjob to apply above report to my nightly backup script
- add content-aware features to block splitter: identify where files
start/end in tarballs and other archives. uncompress data before
analysis. Use binwalk!
- collection of examples for testing: images of varying formats with
various operations/edits applied.
- parallelize processing across all cores
- decouple the ui and status reporting from the main algorithm
- don't keep the full list of sizes around, just the sum, min, max,
etc.
"""
import sys
import os
import sqlite3
from collections import deque
# Tuneable parameters
from hashlib import sha1 as hashfunc
#from hashlib import sha256 as hashfunc
#from hashlib import md5 as hashfunc
hashfuncdigestbytes = hashfunc().digest_size
#exponent = 69069
#rhashwindowlen = 2048
exponent = 7
# small = better against near-duplicate files with random insertion changes?
rhashwindowlen = 8
blocksizetargetmask = 2**12-1 # 2**k-1

class RollingHash(object):
    """Polynomial (Rabin-Karp style) rolling hash over a sliding window of bytes."""
    def __init__(self):
        self.buf = deque([], maxlen=rhashwindowlen)
        self.hashdigest = 0

    def update(self, bs):
        for b in bs:
            if len(self.buf) == self.buf.maxlen:
                # Remove the contribution of the byte leaving the window;
                # it carries a coefficient of exponent**(window length - 1).
                self.hashdigest -= self.buf.popleft() * pow(exponent, self.buf.maxlen - 1, sys.maxsize)
            # Shift the window's polynomial and mix in the incoming byte.
            self.hashdigest = (self.hashdigest * exponent + b) % sys.maxsize
            self.buf.append(b)

    def digest(self):
        return self.hashdigest

def hashify(fh):
    """Generate (hash, length) tuples by splitting the bytes read from
    file object fh into variable-size chunks. A chunk boundary is declared
    wherever the rolling hash of the trailing window has its low bits zero.
    """
    counter = 0
    h = hashfunc()
    rh = RollingHash()
    for byte in (b for c in iter((lambda: fh.read(4096)), b'') for b in c):
        counter += 1
        rh.update(bytes([byte]))
        h.update(bytes([byte]))
        # don't make a chunk smaller than 8x the hash digest size
        if (counter > hashfuncdigestbytes * 8
                and rh.digest() & blocksizetargetmask == 0):
            yield (h.digest(), counter)
            counter = 0
            h = hashfunc()
    # emit whatever remains after the last boundary
    yield (h.digest(), counter)

class Stats(object):
    """Tracks chunk-size statistics and estimated deduplication savings."""
    def __init__(self, hashes):
        self.minsize = sys.maxsize
        self.maxsize = 0
        self.totsize = 0
        self.dedupesize = 0
        self.hashes = hashes
        self.counter = 0

    def monitor(self, iterable):
        """Pass (hash, length) tuples through unchanged, updating statistics."""
        for h, s in iterable:
            self.counter += 1
            if s > self.maxsize:
                self.maxsize = s
            if s < self.minsize:
                self.minsize = s
            self.totsize += s
            if (self.counter & 1023) == 0:
                self.printstats()
            yield h, s
        self.printstats()

    def printstats(self):
        if not self.counter or not self.totsize:
            # nothing processed yet; avoid dividing by zero
            return
        dedupesize = sum(s for h, s in self.hashes)
        overhead = len(self.hashes) * hashfuncdigestbytes
        # chunk size columns: min (bytes), mean (KiB), max (KiB)
        print("%7.2fM savings, %3d%% of original size. chunk sizes: %3db %3.1fk %6.1fk" % (
            (self.totsize - dedupesize - overhead)/1024/1024,
            (dedupesize + overhead) * 100 / self.totsize,
            self.minsize,
            self.totsize / self.counter / 1024,
            self.maxsize / 1024,
        ))

def main():
    hashes = set()
    # Seed the in-memory hash set from any previous run.
    with sqlite3.connect('/home/mike/tmp/test.sqlite3') as con:
        con.execute('''create table if not exists hashes (hash blob primary key, len integer) without rowid;''')
        hashes |= set(con.execute('select * from hashes;'))
    stats = Stats(hashes)
    for root, dirs, files in os.walk('.'):
        for f in files:
            fn = os.path.join(root, f)
            if not os.path.isfile(fn):
                continue
            with open(fn, 'rb') as fh:
                print(fn)
                for h, s in stats.monitor(hashify(fh)):
                    hashes.add((h, s))
    # Persist the accumulated hash database for the next run.
    with sqlite3.connect('/home/mike/tmp/test.sqlite3') as con:
        con.executemany('insert or ignore into hashes values (?,?);', hashes)

if __name__ == '__main__':
    main()
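
A quick sanity check, not part of the gist itself: assuming the file is saved as dedupe.py somewhere on the import path, the chunker should be deterministic, i.e. the same bytes always split into the same (hash, length) stream, which is the property the savings estimate relies on.

# Minimal sketch: verify hashify() is deterministic and accounts for every byte.
# Assumes this gist is importable as `dedupe`; the data below is synthetic.
import io
import dedupe

data = bytes(range(256)) * 64                    # 16 KiB of repeating test input
chunks_a = list(dedupe.hashify(io.BytesIO(data)))
chunks_b = list(dedupe.hashify(io.BytesIO(data)))

assert chunks_a == chunks_b                      # same bytes -> same chunk stream
assert sum(s for _, s in chunks_a) == len(data)  # chunk lengths cover all input
print(len(chunks_a), "chunks")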