Skip to content

Instantly share code, notes, and snippets.

@yourcelf
Created February 17, 2013 16:05
Show Gist options
  • Save yourcelf/4972006 to your computer and use it in GitHub Desktop.
Save yourcelf/4972006 to your computer and use it in GitHub Desktop.
Python script to check whether every file found anywhere in the source directory is also found somewhere in the dest directory. I use it for, e.g., cleaning up after importing MP3s or photos with a program that changes directory layouts. File identity is checked using either the md5sum or the filename. Run with "--help" for usage.
#!/usr/bin/env python3.2
import os
import sys
import hashlib
import argparse
from functools import partial
def md5sum(filename, chunk_size=65536):
    """Return the hexadecimal MD5 digest of the file at ``filename``.

    The file is opened in binary mode and hashed ``chunk_size`` bytes at a
    time, so it is never held in memory in one piece.

    Args:
        filename: Path of the file to hash.
        chunk_size: Number of bytes requested per read. The digest does not
            depend on this value; it only affects how many reads are made.

    Returns:
        The MD5 digest as a string of hexadecimal digits.
    """
    # Adapted from http://stackoverflow.com/a/7829658
    digest = hashlib.md5()
    with open(filename, mode='rb') as fh:
        while True:
            buf = fh.read(chunk_size)
            if not buf:
                # An empty read means end of file.
                break
            digest.update(buf)
    return digest.hexdigest()
def build_md5_tree(directory):
    """Map the MD5 checksum of each file beneath ``directory`` to its path.

    The tree is traversed with ``os.walk``. When several files share a
    checksum, the path visited last replaces the earlier ones.
    """
    by_checksum = {}
    for parent, _subdirs, basenames in os.walk(directory):
        full_paths = (os.path.join(parent, name) for name in basenames)
        # update() consumes the pairs in order, so later paths win on a clash.
        by_checksum.update((md5sum(full), full) for full in full_paths)
    return by_checksum
def build_name_tree(directory):
    """Map the bare filename of each file beneath ``directory`` to its path.

    The tree is traversed with ``os.walk``. When the same filename occurs in
    more than one folder, the path visited last replaces the earlier ones.
    """
    return {
        basename: os.path.join(parent, basename)
        for parent, _subdirs, basenames in os.walk(directory)
        for basename in basenames
    }
def find_missing_files(srces, dests, strategy):
    """Find files under the source directories that no destination contains.

    Args:
        srces: Iterable of source directory paths.
        dests: Iterable of destination directory paths.
        strategy: ``"names"`` to identify files by bare filename, or
            ``"md5"`` to identify them by MD5 checksum of their contents.

    Returns:
        A ``(error, missing)`` tuple. On failure ``error`` is a message
        string and ``missing`` is an empty list. On success ``error`` is
        ``None`` and ``missing`` is a list of ``(path, key)`` tuples, one per
        source key absent from the destinations, where ``key`` is the
        filename or checksum depending on ``strategy``.
    """
    # Reject anything that is not an existing directory up front; os.walk
    # would otherwise silently treat a bad path as an empty tree.
    for directories in (srces, dests):
        for directory in directories:
            if not os.path.isdir(directory):
                return "{0} is not a directory.".format(directory), []

    if strategy == "names":
        method = build_name_tree
    elif strategy == "md5":
        method = build_md5_tree
    else:
        # Report an unsupported strategy through the same error channel
        # rather than failing later on an unbound local.
        return "Unknown strategy: {0!r}.".format(strategy), []

    src_tree = {}
    for src in srces:
        src_tree.update(method(src))

    dest_tree = {}
    for dest in dests:
        dest_tree.update(method(dest))

    missing = [
        (path, key)
        for key, path in src_tree.items()
        if key not in dest_tree
    ]
    return (None, missing)
def run():
    """Command-line entry point.

    Parses the ``src`` and ``dest`` directories and the optional ``--names``
    flag, calls ``find_missing_files`` and reports the outcome. Status
    messages go to stderr; each missing file is printed to stdout as its key
    followed by its path. Exits with status 1 when ``find_missing_files``
    reports an error.
    """
    cli = argparse.ArgumentParser(description="Checks to see whether every file descendant of the `src` directory is found somewhere in the `dest` directory or its descendants.")
    cli.add_argument("src", nargs=1, help="Source directory, the contents of which will be sought in the dest directory.")
    cli.add_argument("dest", nargs=1, help="Destination directory, the contents of which will be checked for the files from source directory.")
    cli.add_argument("--names", help="Use names instead of md5 sums.", action='store_const', const=True)
    options = cli.parse_args()

    strategy = "names" if options.names else "md5"
    # nargs=1 makes src and dest one-element lists, which is the shape
    # find_missing_files iterates over.
    error, missing = find_missing_files(options.src, options.dest, strategy)

    if error:
        sys.stderr.write(error + "\n")
        sys.exit(1)

    if not missing:
        sys.stderr.write("No missing files found.\n")
        return

    sys.stderr.write("Missing files found.\n")
    for path, checksum in sorted(missing):
        print(checksum, path)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment