Skip to content

Instantly share code, notes, and snippets.

@proxypoke
Created August 14, 2012 19:54
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save proxypoke/3352255 to your computer and use it in GitHub Desktop.
Save proxypoke/3352255 to your computer and use it in GitHub Desktop.
dupfile.py -- find duplicate files in a folder hierarchy
#!/usr/bin/env python
# Recursively find duplicate files.
#
# Author: slowpoke (Proxy) < mail at slowpoke dot io >
#
# This program is free software under the non-terms
# of the Anti-License. Do whatever the fuck you want.
import argparse
import hashlib
import os
import re
from collections import defaultdict
def hash(path):
    '''Hash a file's content.

    The file is opened in binary mode and fed to SHA-1 in fixed-size
    chunks, so memory use stays bounded no matter how large the file is.

    path -- path of the file to hash.

    Returns the SHA-1 digest of the content as an integer.
    Errors raised by open() or read() are propagated to the caller.
    '''
    chunk_size = 1 << 20  # 1 MiB per read
    digest = hashlib.sha1()
    with open(path, "rb") as file_:
        # iter() with a sentinel stops at the first empty read (end of file)
        for chunk in iter(lambda: file_.read(chunk_size), b""):
            digest.update(chunk)
    return int(digest.hexdigest(), 16)
def crawl(root, *exts):
    '''Recursively search a given path for files matching a list of extensions.

    root -- directory at which the walk starts.
    exts -- extensions without the leading dot (eg, "py"). Each one is
            taken literally and compared case-sensitively against the end
            of the file name. If no extension is given, every file matches.

    Returns a defaultdict(list) mapping the value hash() returns for a file
    to the list of paths that produced that value.
    '''
    # Compare suffixes literally: splicing the raw extensions into a regexp
    # misreads metacharacters (eg, "c++" or "tar.gz") as pattern syntax.
    suffixes = tuple("." + ext for ext in exts)
    # matches are a dict of sha1sum : [filenames]
    matches = defaultdict(list)
    for dirpath, _dirnames, filenames in os.walk(root):
        for name in filenames:
            # an empty suffix tuple means "no filter", not "match nothing"
            if suffixes and not name.endswith(suffixes):
                continue
            # os.path.join avoids a doubled separator when dirpath already
            # ends with one (eg, when root is given as "dir/")
            file_ = os.path.join(dirpath, name)
            matches[hash(file_)].append(file_)
    return matches
def format(files):
    '''Print every group of duplicates, one path per line.

    files -- mapping whose values are lists of paths. Groups holding a
             single path are not duplicates and are skipped. Each printed
             group is followed by a "---" separator line.
    '''
    for group in files.values():
        # a lone path has no duplicate, so nothing to report
        if len(group) <= 1:
            continue
        print(*group, sep="\n")
        print("---")
def main():
    '''Parse the command line, crawl the given path and print duplicates.

    Expects a root path followed by one or more extensions; the files found
    by crawl() are handed to format() for printing.
    '''
    cli = argparse.ArgumentParser(
        description="Find and print files with the same content.",
    )
    cli.add_argument(
        "path",
        help="root path from which to recurse",
    )
    cli.add_argument(
        "extensions",
        nargs="+",
        help="filetype extensions, without the leading dot "
             "(eg, 'py', not '.py')",
    )
    options = cli.parse_args()
    format(crawl(options.path, *options.extensions))
# Run the command-line interface only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment