Created
August 14, 2012 19:54
-
-
Save proxypoke/3352255 to your computer and use it in GitHub Desktop.
dupfile.py -- find duplicate files in a folder hierarchy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Recursively find duplicate files.
#
# Author: slowpoke (Proxy) < mail at slowpoke dot io >
#
# This program is free software under the non-terms
# of the Anti-License. Do whatever the fuck you want.
import argparse | |
import hashlib | |
import os | |
import re | |
from collections import defaultdict | |
def hash(path):
    '''Hash a file's content.

    The file is opened in binary mode and fed to SHA-1 in fixed-size
    chunks, so memory use stays bounded no matter how large the file is.

    path -- path of the file to read.

    Returns the SHA-1 digest of the content, converted to an integer.
    Errors raised by open() or read() (eg, OSError) are not caught here.
    '''
    digest = hashlib.sha1()
    with open(path, "rb") as file_:
        # iter() with a sentinel stops at the first empty read, ie at EOF
        for chunk in iter(lambda: file_.read(1 << 20), b""):
            digest.update(chunk)
    return int(digest.hexdigest(), 16)
def crawl(root, *exts):
    '''Recursively search a given path for files matching a list of extensions.

    root -- directory at which the walk starts.
    exts -- extensions to look for, without the leading dot (eg, 'py').
            They are compared literally against the end of each file name.
            If no extension is given, every file is included.

    Returns a defaultdict(list) mapping the value returned by hash() for a
    file to the list of paths of all files that produced that value.
    '''
    # str.endswith accepts a tuple of candidates and compares them literally,
    # so characters such as '.' or '+' in an extension carry no special meaning
    suffixes = tuple("." + ext for ext in exts)
    # matches are a dict of sha1sum : [filenames]
    matches = defaultdict(list)
    for dirpath, _dirnames, filenames in os.walk(root):
        for name in filenames:
            if suffixes and not name.endswith(suffixes):
                continue
            file_ = os.path.join(dirpath, name)
            matches[hash(file_)].append(file_)
    return matches
def format(files):
    '''Print every group of duplicate paths, each followed by a "---" line.

    files -- mapping whose values are lists of paths; lists holding a
             single path are not duplicates and are skipped.
    '''
    for group in files.values():
        # a lone path has no duplicate, so there is nothing to report
        if len(group) <= 1:
            continue
        print(*group, sep="\n")
        print("---")
def main():
    '''Parse the command line, then find and print files with equal content.

    Expects a root path followed by one or more extensions; the result of
    crawl() for those arguments is handed to format() for printing.
    '''
    cli = argparse.ArgumentParser(
        description="Find and print files with the same content.")
    cli.add_argument("path", help="root path from which to recurse")
    extensions_help = ("filetype extensions, without the leading dot "
                       "(eg, 'py', not '.py')")
    cli.add_argument("extensions", nargs="+", help=extensions_help)
    options = cli.parse_args()
    format(crawl(options.path, *options.extensions))
# Run the command-line interface only when executed as a script, so the
# module can be imported without triggering argument parsing.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment