Skip to content

Instantly share code, notes, and snippets.

@xyb
Created February 28, 2020 06:03
Show Gist options
  • Save xyb/6da3fff9b8f129fe4d9d15d1921a195f to your computer and use it in GitHub Desktop.
Save xyb/6da3fff9b8f129fe4d9d15d1921a195f to your computer and use it in GitHub Desktop.
A tool that helps compare huge files by splitting them into small content-based chunks.
#!/usr/bin/env python3
# Usage: python3 cdctools.py <data_file> [<chunk_size>]
# Usage: MORE_DETAILS=1 python3 cdctools.py <data_file> [<chunk_size>]
import os
import sys
import fastchunking
READ_BUFFER_SIZE = 1024 * 4
WINDOW_SIZE = 48
SEED = 0
# Default chunk size handed to the chunker; choose it based on your file size
# and how many pieces you want.
DEFAULT_CHUNK_SIZE = 1024 * 2
USAGE = 'Usage: [MORE_DETAILS=1] python3 cdctools.py <data_file> [<chunk_size>]'


def iter_chunks(stream, chunker, read_size=READ_BUFFER_SIZE):
    """Yield ``(start, end)`` byte offsets for every chunk found in *stream*.

    Args:
        stream: Binary file-like object; it is consumed through ``read()``
            calls of at most *read_size* bytes until ``read()`` returns an
            empty value.
        chunker: Object exposing ``next_chunk_boundaries(content)``, which
            returns boundary positions relative to the start of the buffer
            it was just given.
        read_size: Number of bytes requested per ``read()`` call.

    Yields:
        One ``(start, end)`` tuple per boundary reported by the chunker,
        followed by a final tuple running from the last boundary to the
        total number of bytes read. The final tuple is always produced,
        so it may describe an empty span.
    """
    consumed = 0
    left = 0
    while True:
        content = stream.read(read_size)
        if not content:
            break
        for offset in chunker.next_chunk_boundaries(content):
            # Boundaries are relative to this buffer; shift them by the
            # number of bytes read before it to get absolute offsets.
            right = consumed + offset
            yield left, right
            left = right
        consumed += len(content)
    yield left, consumed


def format_row(start, end, detailed):
    """Return the output line for the chunk spanning ``[start, end)``.

    Args:
        start: Absolute offset where the chunk begins.
        end: Absolute offset where the chunk ends.
        detailed: When true, return the size, start and end as three
            8-character columns; otherwise return only the size.
    """
    size = end - start
    if detailed:
        return '{:8} {:8} {:8}'.format(size, start, end)
    return str(size)


def main(argv=None):
    """Print the size of each chunk of the file named on the command line.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``. The first item is the data file, the optional
            second item is the chunk size passed to the chunker. Any
            further items are ignored.

    Returns:
        0 on success, 2 when no data file was given.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        print(USAGE, file=sys.stderr)
        return 2
    filename = args[0]
    chunk_size = int(args[1]) if len(args) > 1 else DEFAULT_CHUNK_SIZE

    # The usage text advertises MORE_DETAILS; DETAILS is still honoured
    # because it is the variable earlier versions of this script read.
    more_detail = os.getenv('MORE_DETAILS') or os.getenv('DETAILS', '')

    with open(filename, 'rb') as stream:
        cdc = fastchunking.RabinKarpCDC(window_size=WINDOW_SIZE, seed=SEED)
        chunker = cdc.create_chunker(chunk_size=chunk_size)
        if more_detail:
            print('{:^8} {:^8} {:^8}'.format('chunk', 'start', 'end'))
            print('{:^8} {:^8} {:^8}'.format('=' * 8, '=' * 8, '=' * 8))
        filesize = 0
        for start, end in iter_chunks(stream, chunker):
            print(format_row(start, end, more_detail))
            # The last span always ends at the total number of bytes read.
            filesize = end
    if more_detail:
        print('total size:', filesize)
    return 0


if __name__ == '__main__':
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment