Skip to content

Instantly share code, notes, and snippets.

@lahwran
Last active August 29, 2015 14:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lahwran/fb0ac88bba626792c666 to your computer and use it in GitHub Desktop.
Save lahwran/fb0ac88bba626792c666 to your computer and use it in GitHub Desktop.
# these two functions from github.com/lahwran/dotfiles:bin/summarize
import binascii
import hashlib
import os
import sys
import time
def iterfile(f, bufsize=None):
    """Yield successive chunks read from the file-like object *f*.

    Each chunk is the result of one ``f.read(size)`` call, where ``size`` is
    *bufsize*, or 8192 when *bufsize* is falsy (``None`` or ``0``).
    Iteration ends as soon as a read returns an empty (falsy) result.
    """
    chunk_size = bufsize or 8192
    chunk = f.read(chunk_size)
    while chunk:
        yield chunk
        chunk = f.read(chunk_size)
def hashfile(filename, bufsize=None):
    """Return the raw SHA-256 digest of the file at *filename*.

    The file is opened in binary mode and fed to the hash chunk by chunk via
    ``iterfile``. While hashing, a percentage progress line is printed to
    stdout whenever more than one second has passed since the previous
    report; because the timer starts at 0, the first chunk always reports.

    Args:
        filename: path of the file to hash.
        bufsize: chunk size handed through to ``iterfile`` unchanged.

    Returns:
        The result of ``hashlib.sha256().digest()`` (raw bytes, not hex).
    """
    with open(filename, "rb") as f:
        # Total size is the denominator of the progress percentage. The
        # original code read ``statsize`` without ever assigning it, which
        # raised NameError on the first progress report.
        statsize = os.fstat(f.fileno()).st_size
        sha = hashlib.sha256()
        progress = 0
        lastprogress = 0
        for chunk in iterfile(f, bufsize):
            progress += len(chunk)
            now = time.time()
            if now - lastprogress > 1:
                # A reported size of 0 with readable data would divide by
                # zero; skip the percentage rather than crash the hash.
                if statsize:
                    print("%0.2f%%" % ((float(progress) / float(statsize)) * 100))
                lastprogress = now
            sha.update(chunk)
    return sha.digest()
# Find duplicate files among the paths given on the command line.
#
# The original snippet had an unfilled "<filenames here>" placeholder (not
# valid Python); the candidate paths are now taken from sys.argv.
candidate_filenames = sys.argv[1:]

# Stage 1: bucket files by size. Files of different sizes cannot be
# identical, so only buckets holding more than one file need hashing.
sizes = ((filename, os.stat(filename).st_size)
         for filename in candidate_filenames)
size_lists = {}
for filename, size in sizes:
    size_lists.setdefault(size, []).append(filename)
possible_dupes = ((size, filenames) for size, filenames in size_lists.items()
                  if len(filenames) > 1)

# Stage 2: hash every file in a shared-size bucket and group by digest.
dupes = {}
for size, filenames in possible_dupes:
    for filename in filenames:
        digest = hashfile(filename)
        dupes.setdefault(digest, []).append(filename)

# Report. Same-size files with different content end up alone under their
# digest; those are not duplicates, so single-member groups are skipped.
for digest, filenames in dupes.items():
    if len(filenames) < 2:
        continue
    # hashfile returns raw bytes; hex-encode so the output is printable.
    print("have hash %s" % binascii.hexlify(digest).decode("ascii"))
    for filename in filenames:
        print("\t %s" % filename)
    print("")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment