Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
find duplicate files in a git repo
# Print filenames that are duplicated at least once (by content hash) in the
# repo.
import fileinput
from collections import defaultdict
import subprocess
def find_duplicates(ls_tree_lines):
    """Group paths from ``git ls-tree -r`` output by content digest.

    Each input line is expected to look like
    ``<mode> <type> <digest><TAB><path>``. Lines that do not have that
    shape (for example blank lines) are skipped, and so are non-blob
    entries such as submodule links, which are not files.

    Args:
        ls_tree_lines: Iterable of text lines, with or without a trailing
            newline.

    Returns:
        A list of path lists, one per digest shared by two or more paths,
        ordered from the most-duplicated digest to the least.
    """
    digest_to_paths = defaultdict(list)
    for line in ls_tree_lines:
        # Only strip the line terminator: trailing spaces can be part of a
        # path. Partition on the first tab; everything after it is the path.
        attrs, sep, path = line.rstrip("\r\n").partition("\t")
        fields = attrs.split()
        if not sep or len(fields) != 3:
            continue
        _, kind, digest = fields
        if kind != "blob":
            continue
        digest_to_paths[digest].append(path)

    # Build a new list instead of deleting from the dict while iterating it
    # (which raises RuntimeError on Python 3). A digest with two paths is
    # already a duplicate, so the threshold is "more than one".
    groups = [paths for paths in digest_to_paths.values() if len(paths) > 1]
    groups.sort(key=len, reverse=True)
    return groups


def main():
    """Print every set of identical files tracked at HEAD.

    Runs ``git ls-tree --full-tree -r HEAD`` in the current directory and
    prints one section per group of files that share a content digest.

    Raises:
        SystemExit: With git's exit status if ``git ls-tree`` fails (git
            itself reports the reason on stderr).
    """
    proc = subprocess.Popen(
        ["git", "ls-tree", "--full-tree", "-r", "HEAD"],
        stdout=subprocess.PIPE,
        universal_newlines=True,  # text output on both Python 2 and 3
    )
    # communicate() drains the pipe and waits for the child to exit.
    output, _ = proc.communicate()
    if proc.returncode != 0:
        raise SystemExit(proc.returncode)

    for paths in find_duplicates(output.splitlines()):
        print("{} duplicates:".format(len(paths)))
        for path in paths:
            print(" - " + path)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment