Skip to content

Instantly share code, notes, and snippets.

@williame
Created November 18, 2010 09:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save williame/704809 to your computer and use it in GitHub Desktop.
Save williame/704809 to your computer and use it in GitHub Desktop.
zip to folder delta calc
# When True the delta zip is built in an in-memory buffer only; set to False
# if you want an actual zip file of the delta written to disk (e.g. you want
# to recompress it with 7z).
to_mem = True
import zipfile
import sys, os, cStringIO
if len(sys.argv) != 3:
print "Diffs a zip file against a folder, and computes a file-granularity delta"
print "usage: python diff1.py prev.zip newfolder"
print "e.g. is MRise_1.0.zip is in your current directory, and you have the 1.6 directory tree in a folder called MRise_1.6, then:"
print " python diff1.py MRise_1.0.zip MRise_1.6"
sys.exit(1)
prev_filename = sys.argv[1]
src_folder = sys.argv[2]
# the zip file used to calculate the compressed size of the new content
nbuf = cStringIO.StringIO()
delta_filename = "diff1.tmp.zip"
new = zipfile.ZipFile(nbuf if to_mem else delta_filename,"w",zipfile.ZIP_DEFLATED)
class FInfo:
    """a file on the filesystem

    filename is the path relative to src_folder and file_size is its size in
    bytes; the remaining attributes are bookkeeping filled in by the diff.
    """
    def __init__(self,filename,file_size):
        self.filename = filename
        self.file_size = file_size
        self.new = True # cleared once an entry in the previous archive is matched to this file
        self._body = None # cache for body()
        self.renamed = [] # previous-archive entries that Rename mapped onto this file
        self.prev = None # previous-archive entry whose compressed bytes can be reused
        self.dup = None # an identical FInfo found earlier by the dup scan, if any
    def body(self):
        """lazy loading of the bytes in the file from disk

        The content is read once and cached on the instance.
        """
        if self._body is None:
            # binary mode: the bytes are compared against zip members, so no
            # newline translation may happen; 'with' closes the handle promptly
            with open(os.path.join(src_folder,self.filename),"rb") as fh:
                self._body = fh.read()
        return self._body
    def __repr__(self):
        return "%s %d"%(self.filename,self.file_size)
class Rename:
    """pairs an entry of the previous archive (p) with the file it became (f)"""
    def __init__(self,f,p):
        # book-keeping on the file side: it is no longer new, and its
        # content can be taken from the previous entry
        f.renamed.append(p)
        f.new, f.prev = False, p
        # keep both ends of the mapping
        self.f, self.p = f, p
    def __repr__(self):
        old_name = self.p.filename
        new_name = self.f.filename
        return "(%s -> %s)"%(old_name,new_name)
prev = zipfile.ZipFile(prev_filename,"r")
# a list of all the files in the previous archive, with directory entries
# (names ending in a slash) filtered out
pinfo = [p for p in prev.infolist() if not p.filename.endswith(("/","\\"))]
finfo = {} # all the files in the new folder, indexed by file-name
sinfo = {} # all the files in the new folder, indexed by file size (useful to find renames)
# walk the file system and fill up finfo and sinfo
for (path, dirs, files) in os.walk(src_folder):
    for name in files:
        fullpath = os.path.join(path,name)
        # relpath copes with a trailing separator on src_folder (slicing by
        # len(src_folder)+1 would eat the first character of the name), and
        # zip member names always use "/", so normalise the OS separator
        relpath = os.path.relpath(fullpath,src_folder).replace(os.sep,"/")
        size = os.stat(fullpath).st_size
        f = finfo[relpath] = FInfo(relpath,size)
        sinfo.setdefault(size,[]).append(f)
# calc the diff of pinfo vs finfo
deleted = []
renamed = []
changed = []
changed_same_size = []
unchanged = []
for p in pinfo:
if p.filename not in finfo:
if p.file_size in sinfo:
# renamed?
found = False
for s in sinfo[p.file_size]:
pbody = prev.read(p.filename)
sbody = s.body()
if pbody == sbody:
renamed.append(Rename(s,p))
found = True
break
if found:
continue
deleted.append(p)
else:
f = finfo[p.filename]
f.new = False
if p.file_size != f.file_size:
changed.append(p)
else:
pbody = prev.read(p.filename)
sbody = f.body()
if pbody != sbody:
changed.append(f)
changed_same_size.append(p)
print "changed same size:",p
else:
unchanged.append(p)
f.prev = p
# scan for dups in new file system
dups = []
for s in sinfo.values():
# not so many so don't try and be too clever
for i in xrange(len(s)):
if s[i].dup is not None:
continue
for j in xrange(i+1,len(s)):
if s[i].body() == s[j].body():
print "new",s[i].filename,"dups",s[j]
s[j].dup = s[i]
dups.append(s[j])
# for trivia, scan for dups in previous zip
psinfo = {}
for p in pinfo:
if p.file_size not in psinfo:
psinfo[p.file_size] = []
psinfo[p.file_size].append(p)
pdups = []
for s in psinfo.values():
# not so many so don't try and be too clever
if len(s) < 2:
continue
bodies = [prev.read(p.filename) for p in s]
for i in xrange(len(s)):
if bodies[i] is None:
continue
for j in xrange(i+1,len(s)):
if bodies[j] is None:
continue
if bodies[i] == bodies[j]:
print "previous",s[i].filename,"dups",s[j].filename,s[j].compress_size,s[j].file_size
bodies[j] = None
pdups.append(s[j])
# for those that are new or changed, work out the new compressed size
copied = 0
added = []
for f in finfo.values():
if f.new:
added.append(f)
if f.dup is not None:
print "skipping dup",f
elif f.prev is None:
print "compressing",f
new.write(os.path.join(src_folder,f.filename))
else:
copied += f.prev.compress_size
new.close()
delta = len(nbuf.getvalue()) if to_mem else os.stat(delta_filename).st_size
def sz(num):
    """pretty print a file size

    num is a size in bytes; the result is a string with one decimal place
    and the largest unit (bytes, KB, MB, GB, TB) that keeps the value below
    1024. Sizes of 1024 TB and above are still reported in TB rather than
    falling off the end of the loop and returning None.
    """
    units = ['bytes','KB','MB','GB','TB']
    for x in units[:-1]:
        if num < 1024.0:
            return "%3.1f %s" % (num, x)
        num /= 1024.0
    # whatever is left is expressed in the largest unit we know
    return "%3.1f %s" % (num, units[-1])
# dump the stats
print "==="
print len(added),"added:",sz(sum([p.file_size for p in added]))
print len(renamed),"renamed:",sz(sum([p.f.file_size for p in renamed]))
print len(unchanged),"unchanged:",sz(sum([p.file_size for p in unchanged])),"->",sz(sum([p.compress_size for p in unchanged]))
print len(changed),"changed:",sz(sum([p.file_size for p in changed])),"(%d,%s)"%(len(changed_same_size),sz(sum([p.file_size for p in changed_same_size])))
print len(deleted),"deleted:",sz(sum([p.file_size for p in deleted])),"->",sz(sum([p.compress_size for p in deleted]))
print len(pinfo),"previously:",sz(sum([p.file_size for p in pinfo])),"->",sz(os.stat(prev_filename).st_size)
print len(finfo),"now:",sz(sum([p.file_size for p in finfo.values()])),"->",sz(copied+delta)
print len(pdups),"dups in previous archive:",sz(sum([p.file_size for p in pdups])),"->",sz(sum([p.compress_size for p in pdups]))
print len(dups),"dups avoided in new archive:",sz(sum([p.file_size for p in dups]))
print "delta:",sz(delta)
print "(all zipping done to memory)" if to_mem else "(delta in %s)"%delta_filename
@williame
Copy link
Author

added a dup scanner

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment