Skip to content

Instantly share code, notes, and snippets.

@jnpn
Created July 1, 2019 19:00
Show Gist options
  • Save jnpn/3974dc9df0dea7a27a98cb9b44375083 to your computer and use it in GitHub Desktop.
Save jnpn/3974dc9df0dea7a27a98cb9b44375083 to your computer and use it in GitHub Desktop.
os.walk . md5 . groupby
import os
import sys
import hashlib
# Default directory to scan: the current user's ~/Downloads folder, with "~"
# expanded to the home directory. Used by test().
DIR=os.path.expanduser('~/Downloads')
def md5(f, c=4096):
    """Return the hexadecimal MD5 digest of the entire contents of file ``f``.

    The file is opened in binary mode and consumed in chunks of ``c`` bytes,
    so large files are hashed without loading them fully into memory. The
    whole file contributes to the digest (not just its first chunk), so two
    files that merely share a common prefix get different digests.

    Args:
        f: Path of the file to hash.
        c: Number of bytes to read per chunk (default 4096).

    Returns:
        The MD5 digest of the file contents as a hex string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    m = hashlib.md5()
    with open(f, 'rb') as b:
        # Two-argument iter() keeps calling read() until it returns the
        # sentinel b'', which is what read() yields at end of file.
        for chunk in iter(lambda: b.read(c), b''):
            m.update(chunk)
    return m.hexdigest()
def hashes(d):
    """Walk directory ``d`` recursively and group file paths by MD5 digest.

    Every file entry reported by ``os.walk`` is counted. Each one is hashed
    with ``md5`` and its path is appended to the list stored under that
    digest. Entries that cannot be opened or read (``OSError``, e.g. a broken
    symlink or a permission problem) are reported on stderr and left out of
    the table instead of aborting the whole scan; they are still counted.

    Args:
        d: Root directory to scan.

    Returns:
        A tuple ``(t, i)`` where ``t`` is a dict mapping each digest to the
        list of paths that produced it, and ``i`` is the number of file
        entries encountered.
    """
    t = {}
    i = 0
    # Distinct loop names so the parameter ``d`` is not shadowed by the
    # per-directory list of sub-directories.
    for root, _subdirs, files in os.walk(d):
        for name in files:
            i += 1
            # os.path.join avoids the doubled separator that a plain
            # sep.join produces when ``root`` already ends with one.
            path = os.path.join(root, name)
            try:
                h = md5(path)
            except OSError as err:
                print('skipping unreadable file: ', path, err, file=sys.stderr)
                continue
            t.setdefault(h, []).append(path)
    return t, i
# maybe use itertools.groupby (needs input sorted by key) ?
def dups(t):
    """Return the entries of ``t`` whose value contains more than one item.

    Args:
        t: Mapping from a key to a sized collection (here: digest -> paths).

    Returns:
        A new dict holding only the pairs whose value has length above one,
        in the same order as they appear in ``t``.
    """
    repeated = {}
    for digest, paths in t.items():
        if len(paths) > 1:
            repeated[digest] = paths
    return repeated
def main(d):
    """Scan directory ``d``, print a short summary, and return the results.

    Prints the number of files counted by ``hashes`` and the number of
    digests that ``dups`` reports as shared by more than one path.

    Args:
        d: Directory to scan.

    Returns:
        A tuple ``(table, total, duplicates)``: the digest -> paths table,
        the file count, and the subset of the table with repeated digests.
    """
    table, total = hashes(d)
    print('total file(s): ', total)
    duplicates = dups(table)
    print('duplicate(s) found: ', len(duplicates))
    return table, total, duplicates
def test():
    """Run ``main`` on the module-level default directory ``DIR``.

    Returns:
        Whatever ``main(DIR)`` returns.
    """
    outcome = main(DIR)
    return outcome
if __name__ == '__main__':
    # Treat every command-line argument as a directory and scan each in turn.
    targets = sys.argv[1:]
    for directory in targets:
        main(directory)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment