Group duplicate files recursively inside a directory: files are first grouped by size, and only same-size candidates are then confirmed as duplicates by comparing SHA-1 hashes, so files that cannot have a duplicate are never hashed.
import hashlib
import os
import sys
from os.path import join, getsize


def sha1(path):
    """Return the SHA-1 hex digest of a file, read in 1 KiB chunks."""
    m = hashlib.sha1()
    with open(path, "rb") as f:
        while True:
            data = f.read(1024)
            if len(data) == 0:
                break
            m.update(data)
    return m.hexdigest()
def find_duplicates(path):
    # 1) Group all files with the same size
    fsizes = {}
    for root, _, files in os.walk(path):
        for name in files:
            filename = join(root, name)
            fsize = getsize(filename)
            if fsize not in fsizes:
                fsizes[fsize] = []
            fsizes[fsize].append(filename)

    # 2) Among same-size files, group those with the same hash
    duplicates = {}
    for key in fsizes:
        if len(fsizes[key]) < 2:
            continue
        for name in fsizes[key]:
            hashid = sha1(name)
            if hashid not in duplicates:
                duplicates[hashid] = []
            duplicates[hashid].append(name)

    # 3) Return only the groups that contain duplicates
    return [matches for matches in duplicates.values() if len(matches) > 1]
if __name__ == '__main__':
    print(find_duplicates(sys.argv[1]))
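A minimal way to exercise the function, assuming it is run in the same module as the code above (the temporary directory and file names below are hypothetical):

# Hypothetical self-check: create two identical files and one distinct file
# in a temporary directory, then confirm only the identical pair is grouped.
import tempfile

tmp = tempfile.mkdtemp()
with open(join(tmp, "a.txt"), "w") as f:
    f.write("same contents")
with open(join(tmp, "b.txt"), "w") as f:
    f.write("same contents")
with open(join(tmp, "c.txt"), "w") as f:
    f.write("different contents")

# Expected: a single group containing a.txt and b.txt
print(find_duplicates(tmp))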