Skip to content

Instantly share code, notes, and snippets.

@gekitsuu
Created June 5, 2021 22:58
Show Gist options
  • Save gekitsuu/98158c1f16df27e46b47c45d1f43561a to your computer and use it in GitHub Desktop.
Save gekitsuu/98158c1f16df27e46b47c45d1f43561a to your computer and use it in GitHub Desktop.
Python script to find duplicate files in a directory and its child directories
from rich import print
import pathlib
import hashlib
import pprint
# Simple script to hash a directory and its subdirectories to see if there are any dupes
# Here's an example awk to take the output report and get the first match in each of the duplicate files found
#
# awk '/hash matched/ { getline ; print $0 }'
def get_files(path):
    """Return a list of every path beneath *path*, recursing into all
    subdirectories.

    Note: the result includes directories as well as regular files;
    callers are expected to filter or tolerate non-file entries.
    """
    root = pathlib.Path(path)
    # rglob("*") is equivalent to glob("**/*"): every entry at any depth.
    return [entry for entry in root.rglob("*")]
def determine_hash(filename):
    """Return the hex MD5 digest of the contents of *filename*.

    Reads the file in fixed-size chunks so arbitrarily large files do not
    have to fit in memory (the original implementation slurped the whole
    file with a single read).

    Returns '' for paths that cannot be hashed — directories, files that
    vanished between listing and hashing, or permission errors — so the
    caller can simply skip them.

    NOTE: MD5 is fine for duplicate detection, but it is not
    collision-resistant; do not rely on it for security purposes.
    """
    digest = hashlib.md5()
    try:
        with open(filename, 'rb') as fh:
            # 64 KiB chunks balance syscall overhead against memory use.
            for chunk in iter(lambda: fh.read(65536), b''):
                digest.update(chunk)
    except (IsADirectoryError, FileNotFoundError, PermissionError):
        return ''
    return digest.hexdigest()
def print_report(seen_files):
    """Print every hash that maps to more than one file, followed by the
    matching file paths (one per line, indented)."""
    for file_hash, matches in seen_files.items():
        # Only hashes seen on two or more files are duplicates worth showing.
        if len(matches) < 2:
            continue
        print("{} hash matched:".format(file_hash))
        for matched_file in matches:
            print(" {}".format(matched_file))
def main():
    """Scan the current directory tree, announce each file as it is
    hashed (flagging duplicates as they appear), then print a summary
    report of every group of files that share a hash."""
    seen_files = {}
    for filename in get_files("."):
        file_hash = determine_hash(filename)
        # Unhashable entries (directories, unreadable files) come back as ''.
        if not file_hash:
            continue
        if file_hash not in seen_files:
            print("[bold green]O[/bold green] Adding: {}".format(filename))
            seen_files[file_hash] = [filename]
        else:
            print("[bold red]X[/bold red] Duplicate found!: {}".format(filename))
            seen_files[file_hash].append(filename)
    print_report(seen_files)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment