Skip to content

Instantly share code, notes, and snippets.

@jinie
Last active May 12, 2019 07:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save jinie/b51f75fa1ece7c02ca3f to your computer and use it in GitHub Desktop.
Save jinie/b51f75fa1ece7c02ca3f to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import os
import logging
import hashlib
def prune(dic):
    """Return a new dict holding only the entries of ``dic`` with 2+ members.

    Parameters:
        dic: mapping whose values are sized collections (in this script,
            lists of file names grouped under a common key).

    Returns:
        A new dict containing only the groups whose value has more than one
        element. The input mapping is left untouched.
    """
    # ``items()`` exists on both Python 2 and 3, whereas ``iteritems()`` is
    # Python 2-only and raises AttributeError on Python 3.
    return {key: value for key, value in dic.items() if len(value) > 1}
def scan(path):
    """Walk ``path`` recursively and group the files found by size.

    Parameters:
        path: root directory handed to ``os.walk``.

    Returns:
        The result of ``prune`` applied to a dict mapping file size in bytes
        to the list of full paths having that size, i.e. only sizes shared
        by at least two files.
    """
    by_size = {}
    for root, _dirs, files in os.walk(path):
        for name in files:
            fname = os.path.join(root, name)
            try:
                size = os.stat(fname).st_size
            except OSError as err:
                # A file that cannot be stat'ed (e.g. a dangling symlink or a
                # file removed while walking) must not abort the whole scan.
                logging.warning("Skipping {0}: {1}".format(fname, err))
                continue
            # setdefault avoids the separate membership test on ``keys()``,
            # which builds a full list on every lookup under Python 2.
            by_size.setdefault(size, []).append(fname)
    return prune(by_size)
def checksum(fileDict, read_full=False, blocksize=1024):
    """Group files by the SHA-1 digest of their content.

    Parameters:
        fileDict: mapping of arbitrary keys to lists of file paths. The keys
            are ignored; every path in every list is hashed, so files coming
            from different input groups land in the same output group when
            their digests match.
        read_full: when false, only the first ``blocksize`` bytes of each
            file are hashed (quick pass). When true, the whole file is
            hashed, read ``blocksize`` bytes at a time.
        blocksize: number of bytes requested per ``read`` call.

    Returns:
        The result of ``prune`` applied to a dict mapping hex digest to the
        list of paths producing it, i.e. only digests shared by 2+ files.
    """
    ret = {}
    for paths in fileDict.values():
        for fname in paths:
            m = hashlib.sha1()
            try:
                with open(fname, "rb") as f:
                    block = f.read(blocksize)
                    m.update(block)
                    # The quick pass stops after the first block. The full
                    # pass keeps reading until ``read`` returns an empty
                    # bytes object (EOF), so the digest covers the whole
                    # file instead of just its first block.
                    while read_full and block:
                        block = f.read(blocksize)
                        m.update(block)
            except (IOError, OSError) as err:
                # An unreadable file must not abort the whole run.
                logging.warning("Skipping {0}: {1}".format(fname, err))
                continue
            digest = m.hexdigest()
            logging.debug("{0} => {1}".format(fname, digest))
            ret.setdefault(digest, []).append(fname)
    return prune(ret)
def tally_files(fileDict):
    """Return the total number of entries across all groups in ``fileDict``.

    Parameters:
        fileDict: mapping whose values are sized collections (lists of file
            paths in this script). Keys are not used.

    Returns:
        The sum of ``len(value)`` over every value; 0 for an empty mapping.
    """
    # ``values()`` works on Python 2 and 3; ``iteritems()`` is Python 2-only.
    return sum(len(names) for names in fileDict.values())
def tally_wasted_space(fileDict):
    """Return the number of bytes occupied by redundant copies.

    For every group, the size of its first file (via ``os.stat``) is
    multiplied by the number of extra copies, i.e. ``len(group) - 1``; one
    copy per group is not counted as waste.

    Parameters:
        fileDict: mapping whose values are lists of paths to files assumed
            to be identical in size (the first path's size is used for the
            whole group). Keys are not used.

    Returns:
        Total wasted bytes as an integer; 0 for an empty mapping.

    Raises:
        OSError: if the first file of a group cannot be stat'ed.
    """
    wasted = 0
    for names in fileDict.values():
        if len(names) < 2:
            # Nothing is wasted without at least one extra copy; this also
            # avoids indexing into an empty group.
            continue
        # Parenthesised on purpose: ``size * len(names) - 1`` would subtract
        # a single byte instead of one whole copy.
        wasted += os.stat(names[0]).st_size * (len(names) - 1)
    return wasted
if __name__ == '__main__':
    # Script entry point: find duplicate files in three passes (size, first
    # block digest, full digest) and print each group of duplicates as one
    # comma-separated line.
    import sys

    logging.basicConfig(level=logging.INFO)

    # The directory to scan may be given as the first command-line argument;
    # without one the original hard-coded root is used.
    root_dir = sys.argv[1] if len(sys.argv) > 1 else "/volume1"

    logging.info("Scanning")
    groups = scan(root_dir)
    logging.info("Quick scanning {0} files".format(tally_files(groups)))
    groups = checksum(groups)
    logging.info("Slow scanning {0} files".format(tally_files(groups)))
    groups = checksum(groups, True)
    logging.info("Found {0} duplicate files, total wasted space {1}".format(tally_files(groups), tally_wasted_space(groups)))
    # A distinct loop variable is used so the dict being iterated is never
    # rebound, and ``join`` replaces the manual trailing-comma trimming.
    for names in groups.values():
        print(",".join(names))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment