Created
June 5, 2021 22:58
-
-
Save gekitsuu/98158c1f16df27e46b47c45d1f43561a to your computer and use it in GitHub Desktop.
Python script to find duplicate files in a directory and its child directories
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from rich import print | |
import pathlib | |
import hashlib | |
import pprint | |
# Simple script to hash a directory and its subdirectories to see if there are any dupes
# Here's an example awk to take the output report and get the first match in each of the duplicate files found | |
# | |
# awk '/hash matched/ { getline ; print $0 }' | |
def get_files(path):
    """Collect every entry found beneath ``path``, recursively.

    The result holds whatever the ``**/*`` glob matches, so directories are
    included alongside files.

    Args:
        path: Directory to scan, as a string or path-like object.

    Returns:
        A list of ``pathlib.Path`` objects.
    """
    root = pathlib.Path(path)
    matches = root.glob("**/*")
    return list(matches)
def determine_hash(filename):
    """Return the MD5 hex digest of a file's contents, or '' if unreadable.

    The file is read in fixed-size chunks so that a large file is never
    loaded into memory in one piece.

    Args:
        filename: Path of the file to hash (string or path-like object).

    Returns:
        The hexadecimal MD5 digest, or the empty string when the path cannot
        be opened or read (for example it is a directory, does not exist, or
        is not accessible).
    """
    chunk_size = 1024 * 1024
    digest = hashlib.md5()
    try:
        with open(filename, 'rb') as fh:
            # iter() with a sentinel keeps calling read() until it returns
            # b'' at end of file.
            for chunk in iter(lambda: fh.read(chunk_size), b''):
                digest.update(chunk)
    except OSError:
        # IsADirectoryError, FileNotFoundError and PermissionError are all
        # OSError subclasses; catching the base class also skips other
        # unreadable paths instead of aborting the whole scan.
        return ''
    return digest.hexdigest()
def print_report(seen_files):
    """Print each hash shared by more than one file, followed by its files.

    Args:
        seen_files: Mapping of file hash to the list of paths that produced
            that hash.
    """
    for file_hash, paths in seen_files.items():
        if len(paths) <= 1:
            # A hash seen only once has no duplicates to report.
            continue
        print("{} hash matched:".format(file_hash))
        for path in paths:
            print(" {}".format(path))
def main(path="."):
    """Scan ``path`` recursively and report files whose hashes match.

    Each entry returned by ``get_files`` is hashed with ``determine_hash``;
    entries for which it returns an empty string are skipped. Progress is
    printed per file, and ``print_report`` prints the final summary.

    Args:
        path: Root directory to scan. Defaults to the current working
            directory, which was the previously hard-coded value.
    """
    # NOTE(review): `print` is rich's print (see the file's imports), which
    # parses [tag] console markup, so bracketed text inside a file name may
    # be interpreted as markup - consider escaping the paths.
    seen_files = {}
    for filename in get_files(path):
        file_hash = determine_hash(filename)
        if not file_hash:
            continue
        if file_hash in seen_files:
            print("[bold red]X[/bold red] Duplicate found!: {}".format(filename))
        else:
            print("[bold green]O[/bold green] Adding: {}".format(filename))
        # setdefault creates the list the first time a hash is seen.
        seen_files.setdefault(file_hash, []).append(filename)
    print_report(seen_files)
# Run the duplicate scan only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment