Created
April 1, 2019 12:57
-
-
Save rikwatson/b9d598678ae520c5147e58f8160d7183 to your computer and use it in GitHub Desktop.
This could be good for detecting bit rot in file systems.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# 2012-01-15: Modified by Adam Porter <adam@alphapapa.net> | |
# Based on md5verify.py by Wil Clouser | |
# <http://micropipes.com/blog/2011/01/30/md5verify-a-script-to-automatically-verify-file-integrity/> | |
# <https://github.com/clouserw/scripts/blob/master/md5verify.py> | |
# Redistribution and use in source and binary forms, with or without modification, | |
# are permitted provided that the following conditions are met: | |
# | |
# Redistributions of source code must retain the above copyright notice, this | |
# list of conditions and the following disclaimer. | |
# | |
# Redistributions in binary form must reproduce the above copyright | |
# notice, this list of conditions and the following disclaimer in the | |
# documentation and/or other materials provided with the distribution. | |
# | |
# The name of the author may not be used to endorse or promote | |
# products derived from this software without specific prior written | |
# permission. | |
# | |
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR | |
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY | |
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |
# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER | |
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR | |
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN | |
# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
import hashlib | |
import logging as log | |
import argparse | |
import os | |
import re | |
import sys | |
# Matches one line of a stored-hashes file: an optional leading "\"
# (md5sum's escape marker), a 32-hex-digit md5, a separator, a space-or-star
# text/binary flag, the filename, and the file's mtime appended at the end.
md5line = re.compile(r"^(\\?)([0-9a-f]{32}) [\ \*](.*)\ ([\d]+)$") # store the timestamp on the end (maybe incompatible with md5sum utility, but oh well)
# Counters accumulated across every directory processed; reported by the
# -s summary and used to pick the exit status.
errors = 0          # I/O failures and hash-file parse errors
changed_files = 0   # files whose current hash differs from the stored one
missing_files = 0   # files listed in a hashes file but no longer on disk
new_files = 0       # files seen that have no stored hash yet
skipped_files = 0   # already-hashed files skipped because of -n
empty_files = 0     # zero-byte files encountered
updated_files = 0   # files whose mtime changed (treated as updates with -t)
verified_files = 0  # files whose hash matched the stored one
dirs_skipped = 0    # directories skipped (no stored hashes and no -w)
total_hashfiles_size = 0  # total bytes consumed by .hashes files (-s only)
total_files = 0           # total number of files actually hashed (-s only)
total_filesize = 0        # total bytes hashed (-s only)
hashes_written = False    # set once any .hashes file has been (re)written
something_was_done = False # use this to catch accidental scan of previously-unscanned dirs without -w flag
# TODO: Use a database instead of .hashes files. Dumping those all over the place isn't so great.
# TODO: Store and check filesize?
# TODO: support gzipping hash files?
# FIXME: some filenames with weird characters (all I see is "?" in $(ls)) may cause hash files to have newlines and report bad syntax
def process_directory(dir):
    """Recursively walk `dir`, comparing each file's md5 against the hashes
    stored in a per-directory hashes file, and (when -w is set) writing
    updated hashes back out.

    Returns True when everything was consistent, False when any
    inconsistency (altered file, missing file, bad hashes-file syntax,
    or failed write) was found.  Updates many module-level counters as a
    side effect; reads `options` and `hashfile_name` set in __main__.
    """
    global errors, changed_files, missing_files, new_files, verified_files, dirs_skipped, skipped_files, empty_files, updated_files, hashes_written, something_was_done, total_files, total_hashfiles_size, total_filesize
    consistent = True # Nothing bad found yet
    oldcwd = os.getcwd()
    def get_mtime(file):
        # Return the file's modification time as a non-negative int.
        mtime = int(os.stat(file).st_mtime) # .st_mtime adds a .0 but accessing it with [8] doesn't. weird.
        # stupidly there was once a file with an mtime of around the year 1901
        # this works around it, since my regex skills are lacking at the moment
        if mtime < 0: mtime = 0
        return mtime
    def scan_file(file):
        # Hash a single file; with -s, fold its size into the summary totals.
        global empty_files, total_filesize, total_files
        size = os.stat(file).st_size
        hash = calculate_hash(file)
        log.debug("Actual hash:%s" % hash)
        if options.summary:
            if not size:
                empty_files += 1
            else:
                total_filesize += size
            total_files += 1
        return hash
    for root, dirs, files in os.walk(dir, onerror=log.error):
        log.info("Processing directory %s" % root)
        if not files:
            log.info("No files in directory; skipping")
            continue
        # chdir so stored filenames can be used as-is (they are relative)
        os.chdir(root)
        updated = False # We haven't detected any changes
        new_hashfile = False
        root_consistent = True # this is per dir walked, not per argument
        hashes_file = os.path.join(root, hashfile_name)
        new = {}
        hashed_files = {}
        # Find a stored hashes file
        if os.path.isfile(hashes_file):
            log.info("Found existing hashes")
            if options.summary:
                total_hashfiles_size += os.stat(hashes_file).st_size
            # Iterate over stored hashes
            for hash, file, mtime, error in read_stored_hashes(hashes_file):
                # If there was an error reading the hash from the hashes file
                if error:
                    consistent = False
                    root_consistent = False
                    log.warning("Error in hashes file (%s)" % hashes_file)
                # Otherwise, the hash was read from the hashes file
                else:
                    # Add existing file to list
                    if os.path.isfile(file):
                        hashed_files[file] = {'hash': hash, 'mtime': mtime}
                    # The file is missing
                    else:
                        updated = True
                        missing_files += 1
                        if not options.ignore_missing:
                            root_consistent = False
                            consistent = False
                            log.warning("FILE MISSING: %s" % (os.path.join(root, file)))
        # If there's no hash file and we're not storing hashes, there's no need to hash all the files
        elif not options.write_hashes:
            # make sure the dir actually has files in it, not just symlinks, before throwing a warning
            has_files = False
            for file in files:
                if os.path.isfile(file):
                    has_files = True
                    break
            if has_files:
                log.warning("No hashes found and not storing hashes; skipping directory (%s)" % root)
                dirs_skipped += 1
                continue
        # Therefore, there is no hash file, and we ARE storing hashes
        else:
            new_hashfile = True
            log.debug("Hash file not found")
        # Reaching this point means the directory was actually processed
        # (not skipped), so the "nothing was done" check won't fire.
        something_was_done = True
        # Iterate over existing files
        for file in sorted(files):
            # Skip non-files
            if not os.path.isfile(file):
                log.debug("Skipping non-file: %s" % file)
                continue
            # skip stored hashes file TODO: checksum the stored hash file too?
            if file == hashfile_name:
                log.debug("Skipping hashes file")
                continue
            file_consistent = True
            log.info("CHECKING FILE\t%s" % file)
            # If we have a stored hash for the file
            if file in hashed_files:
                log.debug("Stored hash:%s\tstored mtime:%s" % (hashed_files[file]['hash'], hashed_files[file]['mtime']))
                if options.new_only:
                    log.debug("Skipping existing file")
                    skipped_files += 1
                    continue
                mtime = str(get_mtime(file))
                log.debug("mtime:%s stored mtime:%s" % (mtime, hashed_files[file]['mtime']))
                # If we trust updated files, there's no need to hash one whose mtime has changed, unless we're writing changes
                if options.trust_updated and (mtime != hashed_files[file]['mtime']) and not options.write_hashes:
                    log.debug("Not hashing updated file")
                    updated_files += 1
                    continue
                # actually scan the file
                hash = scan_file(file)
                # If the stored hash differs from the current one
                if hashed_files[file]['hash'] != hash:
                    updated = True
                    changed_files += 1
                    # If we consider updated mtimes to be innocuous
                    if options.trust_updated:
                        # If the mtime is different, it just means the file was updated
                        if hashed_files[file]['mtime'] != mtime:
                            updated_files += 1
                            log.info("Hash differs but mtime updated")
                        # Otherwise, if the mtime is the same, the file was altered while keeping the mtime the same
                        else:
                            log.warning("FILE ALTERED (%s): mtime is unchanged (%s)" % (os.path.join(root, file), mtime))
                            consistent = False
                            root_consistent = False
                            file_consistent = False
                    # Otherwise, we only care that the hash is different
                    else:
                        log.debug("Hash differs")
                        if hashed_files[file]['mtime'] != mtime:
                            updated_files += 1
                        consistent = False
                        root_consistent = False
                        file_consistent = False
                    # Warn if inconsistent
                    if file_consistent == False:
                        if os.stat(file).st_size == 0:
                            log.warning("File is now empty (%s)" % os.path.join(root, file))
                        else:
                            log.warning('Hash changed for file (%s)\tOld: %s\tNew: %s' % (os.path.join(root, file), hashed_files[file]['hash'], hash))
                    # Update existing files if we are doing so
                    if options.write_hashes:
                        # If things are consistent or we are forcing an update, do so
                        if file_consistent or options.force_write:
                            if options.force_write:
                                log.info('Forcing hash update of altered file (%s)' % hashed_files[file])
                            else:
                                log.debug("Adding new hash to list")
                            hashed_files[file] = {'hash': hash, 'mtime': mtime}
                else:
                    log.info("Hash matches")
                    verified_files += 1
            # Otherwise, it's a new file
            else:
                new_files += 1
                log.info("File is new")
                # this could go below the next check for speed, but this gives more accurate stats
                if os.stat(file).st_size == 0:
                    empty_files += 1
                if not options.write_hashes:
                    log.debug("Not writing hash")
                    continue
                updated = True
                mtime = get_mtime(file)
                # actually scan the file
                hash = scan_file(file)
                new[file] = {'hash': hash, 'mtime': mtime}
        log.debug("Finished processing directory %s" % root)
        log.debug("updated:%s\tdirectory consistent:%s\tforce-write:%s" % (updated, root_consistent, options.force_write))
        # Finished iterating over existing files; now do we need to write hashes?
        if updated or new_hashfile or options.force_write:
            log.debug("Need to write hashes")
            # Update existing files if we are doing so
            if options.write_hashes:
                if new_hashfile:
                    log.debug("Writing new hashfile:%s" % hashes_file)
                else:
                    log.debug("Updating stored hashes:%s" % hashes_file)
                # If things are consistent or we are forcing an update, do so
                if root_consistent or new_hashfile or options.force_write:
                    # merge surviving stored hashes into the new-file dict
                    new.update(hashed_files)
                    if not write_hashes(hashes_file, new):
                        log.warning("Failed to write hashes (%s)" % hashes_file)
                        consistent = False
                    else:
                        log.debug("Updating stored hashes:%s" % hashes_file)
                        hashes_written = True
                else:
                    log.warning("Files have been altered in directory (%s) but --force-update is not enabled. Not updating stored hashes." % root)
            else:
                log.debug("Not writing hashes because -w is not set")
        else:
            log.debug("No need to write hashes")
    os.chdir(oldcwd)
    return consistent
def calculate_hash(file): | |
global errors, empty_files | |
md5 = hashlib.md5() | |
try: | |
with open(file, 'rb') as f: | |
while True: | |
chunk = f.read(32768) # picked a number # good choice: <http://stackoverflow.com/questions/1131220/get-md5-hash-of-a-files-without-open-it-in-python> | |
md5.update(chunk) | |
if not chunk: | |
return md5.hexdigest() | |
except IOError, e: | |
log.error("Error opening %s: %s" % (file, e)) | |
errors += 1 | |
return None | |
def read_stored_hashes(file):
    """Generate (hash, filename, mtime, error) tuples from a hashes file.

    Each well-formed line yields (md5_hex, filename, mtime_str, False).
    A malformed line is logged, counted in the global `errors`, and
    yielded as (None, None, None, True) so the caller can flag the
    directory as inconsistent.  An unreadable file logs, counts an
    error, and yields nothing.
    """
    global errors
    try:
        with open(file, 'r') as f:
            for line in f:
                match = md5line.match(line)
                if match:
                    # If a line starts with \, the filename has escaped
                    # characters. Python won't expect that so we strip them.
                    if match.group(1):
                        name = (match.group(3).replace("\\\\", "\\")
                                .replace("\\\n", "\n"))
                    else:
                        name = match.group(3)
                    yield match.group(2), name, match.group(4), False # 2=md5sum, 4=mtime, False=no error
                # If the match fails, return an error
                else:
                    log.error("Invalid syntax in hashes file %s:%s" % (file, line))
                    errors += 1
                    yield None, None, None, True
    # "except IOError, e" is Python-2-only; "as" works on 2.6+ and 3.x
    except IOError as e:
        log.error("Error reading hashes file %s: %s" % (file, e))
        errors += 1
def write_hashes(hashfile, results):
    """Write `results` ({filename: {'hash': ..., 'mtime': ...}}) to `hashfile`.

    Lines are written in md5sum-compatible form (plus the mtime appended)
    so `read_stored_hashes` can parse them back.  Returns True on success;
    on any failure logs the error, bumps the global `errors` counter, and
    returns False.  With -s, keeps `total_hashfiles_size` accurate by
    subtracting the file's previous size before adding the new one.
    """
    global errors, total_hashfiles_size
    try:
        # Record existing hashfile size to avoid double-counting it
        if options.summary:
            oldsize = os.stat(hashfile).st_size if os.path.isfile(hashfile) else 0
        with open(hashfile, 'w') as f:
            # sorted() gives deterministic output; .iteritems() was Python-2-only
            for name, file in sorted(results.items()):
                line = ""
                # The md5sum utility will prefix a line with \ if it contains
                # certain characters. We'll do the same here for compatibilty's
                # sake. Read `info md5sum` for more info.
                if "\\" in name or "\n" in name:
                    name = (name.replace("\\", "\\\\")
                            .replace("\n", "\\\n"))
                    line = "\\"
                # Two chars follow the hash: a space separator plus the
                # text/binary flag (md5sum uses '*' for binary; Linux doesn't
                # care, so we always write the text-mode space).  The reader
                # regex `md5line` requires both characters.
                line = "%s%s  %s %s\n" % (line, file['hash'], name, file['mtime'])
                f.write(line)
        if options.summary:
            if oldsize:
                total_hashfiles_size -= oldsize
            total_hashfiles_size += os.stat(hashfile).st_size
        log.debug("Hashfile written:%s" % hashfile)
    # "except Exception, e" is Python-2-only; also report the concrete
    # exception type rather than the bare Exception class.
    except Exception as e:
        log.error("Couldn't write hashes file (%s): %s: %s" % (hashfile, type(e).__name__, e))
        errors += 1
        return False
    return True
def sizeof_fmt(num):
    """Format a byte count as a human-readable string, e.g. "3.5 MB".

    Scales by powers of 1024; anything at or above 1024 GB is reported
    in terabytes.
    """
    value = float(num)
    for suffix in ('bytes', 'KB', 'MB', 'GB'):
        if value < 1024.0:
            return "%3.1f %s" % (value, suffix)
        value /= 1024.0
    return "%3.1f %s" % (value, 'TB')
if __name__ == "__main__":
    # ---- Command-line interface ----
    parser = argparse.ArgumentParser(description="CHAngedFIleFInder: Recursively scan directories and compare stored hashes to see if files have changed")
    parser.add_argument('directory', nargs='+')
    parser.add_argument("-f", "--force-write", action='store_true', dest="force_write", help="Write updated hashes even if inconsistencies are detected")
    parser.add_argument("-i", "--ignore-missing", action='store_true', help="Ignore missing files")
    parser.add_argument("-n", "--new-only", action='store_true', dest="new_only", help="Only hash new files")
    #parser.add_argument("-o", "--hash-file", metavar="HASHFILE_NAME", dest='hashfile_name', default=".hashes", help=("Name of stored-hashes file (default: %(default)s)")) # it's safer to not have this, because if it was set to the wrong name it could clobber a real file
    hashfile_name = '.hashes'
    parser.add_argument("-s", "--summary", action="store_true", dest="summary", help=("Display summary when finished, including file sizes and hash file disk usage"))
    parser.add_argument("-t", "--trust-updated", action='store_true', dest="trust_updated", help="Trust files whose mtimes have changed (i.e. only detect corrupted files, not updated ones)")
    parser.add_argument("-v", "--verbose", action="count", dest="verbose", help="Print more output (up to -vv)")
    parser.add_argument("-w", "--write-hashes", action="store_true", dest="write_hashes", help="Write hashes to .hashes file in each directory (default false)")
    options = parser.parse_args()
    # Map -v / -vv onto logging levels; default is warnings only.
    # NOTE(review): with no -v, options.verbose is None; "None >= 2" is
    # False on Python 2 but raises TypeError on Python 3 — confirm before
    # running under Python 3.
    if options.verbose == 1:
        LOG_LEVEL = log.INFO
    elif options.verbose >=2:
        LOG_LEVEL = log.DEBUG
    else:
        LOG_LEVEL = log.WARNING
    log.basicConfig(level=LOG_LEVEL, format="%(levelname)s: %(message)s")
    log.debug("Options: %s" % options)
    if not options.directory:
        log.critical("You need to specify the directories to process, silly. :)")
        parser.print_usage()
        sys.exit(2)
    # Verify arguments are directories
    quit = False
    for dir in options.directory:
        if not os.path.isdir(dir):
            log.critical("%s is not a directory" % dir)
            quit = True
    if quit:
        sys.exit(2)
    # If -f is specified, -w should probably be implied
    if options.force_write and not options.write_hashes:
        log.warning("-f was specified without -w. Forcing hash update anyway.")
        options.write_hashes = True
    # Iterate over directories
    consistent = True
    for dir in options.directory:
        if not process_directory(dir): # Returns false when inconsistencies are found
            consistent = False
    # Catch accidental misuse
    if not something_was_done:
        log.error("Nothing was done. You probably scanned new directories without using -w.")
        sys.exit(1)
    # ---- Summary and exit status ----
    # With -s but not -v, raise the log level so the INFO summary shows.
    if options.summary and not options.verbose:
        log.getLogger().setLevel(log.INFO)
    log.debug("Options were: %s" % options)
    log.info("-----SUMMARY-----")
    log.info("Total files scanned: %s" % total_files)
    log.info("Total size of files scanned: %s" % sizeof_fmt(total_filesize))
    log.info("Total size of hash files: %s" % sizeof_fmt(total_hashfiles_size))
    log.info("Changed files:%s Empty files:%s Missing files:%s New files:%s Skipped files:%s Updated files:%s Verified files:%s" % (changed_files, empty_files, missing_files, new_files, skipped_files, updated_files, verified_files))
    if consistent:
        if not options.new_only: log.info("Consistency verified.")
        if updated_files:
            log.info("Files were updated.")
        if hashes_written:
            log.info("Hashes were stored.")
        elif changed_files or new_files:
            log.info("Changes not stored.")
        if dirs_skipped:
            log.warning("%s directories were skipped because they had no stored hashes and -w was not used." % dirs_skipped)
            sys.exit(2) # TODO: Should this really exit nonzero? I think it's important to catch mistakes that people wouldn't notice or expect
    else:
        log.warning("INCONSISTENCIES DETECTED.")
        if dirs_skipped:
            log.warning("%s directories were skipped because they had no stored hashes and -w was not used." % dirs_skipped)
        if errors:
            log.warning("%s ERRORS OCCURRED." % errors)
        if hashes_written:
            log.warning("NEW HASHES WERE WRITTEN TO DISK. THESE INCONSISTENCIES WILL NOT BE NOTICED AGAIN.")
        sys.exit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment