@rikwatson
Created April 1, 2019 12:57
This could be good for detecting bit rot in file systems.
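A rough usage sketch (the file name chafifi.py is hypothetical; the flags are the ones defined by the script's own argparse setup below):

    ./chafifi.py -w -s ~/photos    # first run: hash every file and write a .hashes file in each directory
    ./chafifi.py -s ~/photos       # later runs: re-hash everything and compare against the stored hashes
    ./chafifi.py -t -s ~/photos    # only flag files whose content changed while the mtime stayed the same (likely corruption)

The script exits non-zero when it detects inconsistencies, when directories are skipped for lack of stored hashes, or when nothing was done at all, so the exit status can be checked by a caller such as a cron job.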
#! /usr/bin/env python
# 2012-01-15: Modified by Adam Porter <adam@alphapapa.net>
# Based on md5verify.py by Wil Clouser
# <http://micropipes.com/blog/2011/01/30/md5verify-a-script-to-automatically-verify-file-integrity/>
# <https://github.com/clouserw/scripts/blob/master/md5verify.py>
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# The name of the author may not be used to endorse or promote
# products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import hashlib
import logging as log
import argparse
import os
import re
import sys
md5line = re.compile(r"^(\\?)([0-9a-f]{32}) [\ \*](.*)\ ([\d]+)$") # store the timestamp on the end (maybe incompatible with md5sum utility, but oh well)
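# Illustrative example of a stored line (the digest shown is the well-known MD5 of the empty string):
#   d41d8cd98f00b204e9800998ecf8427e  example.txt 1325376000
# i.e. hex digest, two spaces (or a space and an asterisk), the file name, a space, then the integer mtime.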
errors = 0
changed_files = 0
missing_files = 0
new_files = 0
skipped_files = 0
empty_files = 0
updated_files = 0
verified_files = 0
dirs_skipped = 0
total_hashfiles_size = 0
total_files = 0
total_filesize = 0
hashes_written = False
something_was_done = False # use this to catch accidental scan of previously-unscanned dirs without -w flag
# TODO: Use a database instead of .hashes files. Dumping those all over the place isn't so great.
# TODO: Store and check filesize?
# TODO: support gzipping hash files?
# FIXME: some filenames with weird characters (all I see is "?" in $(ls)) may cause hash files to have newlines and report bad syntax
def process_directory(dir):
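    # Walk `dir` recursively.  For each directory, compare the current MD5 of every regular
    # file against the hashes stored in its .hashes file and, with -w, create or update that
    # file.  Returns False if any inconsistency was found, True otherwise.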
    global errors, changed_files, missing_files, new_files, verified_files, dirs_skipped, skipped_files, empty_files, updated_files, hashes_written, something_was_done, total_files, total_hashfiles_size, total_filesize
    consistent = True  # Nothing bad found yet
    oldcwd = os.getcwd()

    def get_mtime(file):
        mtime = int(os.stat(file).st_mtime)  # .st_mtime adds a .0 but accessing it with [8] doesn't. weird.
        # stupidly there was once a file with an mtime of around the year 1901
        # this works around it, since my regex skills are lacking at the moment
        if mtime < 0: mtime = 0
        return mtime

    def scan_file(file):
        global empty_files, total_filesize, total_files
        size = os.stat(file).st_size
        hash = calculate_hash(file)
        log.debug("Actual hash:%s" % hash)
        if options.summary:
            if not size:
                empty_files += 1
            else:
                total_filesize += size
            total_files += 1
        return hash

    for root, dirs, files in os.walk(dir, onerror=log.error):
        log.info("Processing directory %s" % root)
        if not files:
            log.info("No files in directory; skipping")
            continue
        os.chdir(root)
        updated = False  # We haven't detected any changes
        new_hashfile = False
        root_consistent = True  # this is per dir walked, not per argument
        hashes_file = os.path.join(root, hashfile_name)
        new = {}
        hashed_files = {}

        # Find a stored hashes file
        if os.path.isfile(hashes_file):
            log.info("Found existing hashes")
            if options.summary:
                total_hashfiles_size += os.stat(hashes_file).st_size
            # Iterate over stored hashes
            for hash, file, mtime, error in read_stored_hashes(hashes_file):
                # If there was an error reading the hash from the hashes file
                if error:
                    consistent = False
                    root_consistent = False
                    log.warning("Error in hashes file (%s)" % hashes_file)
                # Otherwise, the hash was read from the hashes file
                else:
                    # Add existing file to list
                    if os.path.isfile(file):
                        hashed_files[file] = {'hash': hash, 'mtime': mtime}
                    # The file is missing
                    else:
                        updated = True
                        missing_files += 1
                        if not options.ignore_missing:
                            root_consistent = False
                            consistent = False
                            log.warning("FILE MISSING: %s" % (os.path.join(root, file)))
        # If there's no hash file and we're not storing hashes, there's no need to hash all the files
        elif not options.write_hashes:
            # make sure the dir actually has files in it, not just symlinks, before throwing a warning
            has_files = False
            for file in files:
                if os.path.isfile(file):
                    has_files = True
                    break
            if has_files:
                log.warning("No hashes found and not storing hashes; skipping directory (%s)" % root)
                dirs_skipped += 1
            continue
        # Therefore, there is no hash file, and we ARE storing hashes
        else:
            new_hashfile = True
            log.debug("Hash file not found")

        something_was_done = True

        # Iterate over existing files
        for file in sorted(files):
            # Skip non-files
            if not os.path.isfile(file):
                log.debug("Skipping non-file: %s" % file)
                continue
            # skip stored hashes file  TODO: checksum the stored hash file too?
            if file == hashfile_name:
                log.debug("Skipping hashes file")
                continue
            file_consistent = True
            log.info("CHECKING FILE\t%s" % file)

            # If we have a stored hash for the file
            if file in hashed_files:
                log.debug("Stored hash:%s\tstored mtime:%s" % (hashed_files[file]['hash'], hashed_files[file]['mtime']))
                if options.new_only:
                    log.debug("Skipping existing file")
                    skipped_files += 1
                    continue
                mtime = str(get_mtime(file))
                log.debug("mtime:%s stored mtime:%s" % (mtime, hashed_files[file]['mtime']))
                # If we trust updated files, there's no need to hash one whose mtime has changed, unless we're writing changes
                if options.trust_updated and (mtime != hashed_files[file]['mtime']) and not options.write_hashes:
                    log.debug("Not hashing updated file")
                    updated_files += 1
                    continue
                # actually scan the file
                hash = scan_file(file)

                # If the stored hash differs from the current one
                if hashed_files[file]['hash'] != hash:
                    updated = True
                    changed_files += 1
                    # If we consider updated mtimes to be innocuous
                    if options.trust_updated:
                        # If the mtime is different, it just means the file was updated
                        if hashed_files[file]['mtime'] != mtime:
                            updated_files += 1
                            log.info("Hash differs but mtime updated")
                        # Otherwise, if the mtime is the same, the file was altered while keeping the mtime the same
                        else:
                            log.warning("FILE ALTERED (%s): mtime is unchanged (%s)" % (os.path.join(root, file), mtime))
                            consistent = False
                            root_consistent = False
                            file_consistent = False
                    # Otherwise, we only care that the hash is different
                    else:
                        log.debug("Hash differs")
                        if hashed_files[file]['mtime'] != mtime:
                            updated_files += 1
                        consistent = False
                        root_consistent = False
                        file_consistent = False
                    # Warn if inconsistent
                    if file_consistent == False:
                        if os.stat(file).st_size == 0:
                            log.warning("File is now empty (%s)" % os.path.join(root, file))
                        else:
                            log.warning('Hash changed for file (%s)\tOld: %s\tNew: %s' % (os.path.join(root, file), hashed_files[file]['hash'], hash))
                    # Update existing files if we are doing so
                    if options.write_hashes:
                        # If things are consistent or we are forcing an update, do so
                        if file_consistent or options.force_write:
                            if options.force_write:
                                log.info('Forcing hash update of altered file (%s)' % hashed_files[file])
                            else:
                                log.debug("Adding new hash to list")
                            hashed_files[file] = {'hash': hash, 'mtime': mtime}
                else:
                    log.info("Hash matches")
                    verified_files += 1

            # Otherwise, it's a new file
            else:
                new_files += 1
                log.info("File is new")
                # this could go below the next check for speed, but this gives more accurate stats
                if os.stat(file).st_size == 0:
                    empty_files += 1
                if not options.write_hashes:
                    log.debug("Not writing hash")
                    continue
                updated = True
                mtime = get_mtime(file)
                # actually scan the file
                hash = scan_file(file)
                new[file] = {'hash': hash, 'mtime': mtime}

        log.debug("Finished processing directory %s" % root)
        log.debug("updated:%s\tdirectory consistent:%s\tforce-write:%s" % (updated, root_consistent, options.force_write))

        # Finished iterating over existing files; now do we need to write hashes?
        if updated or new_hashfile or options.force_write:
            log.debug("Need to write hashes")
            # Update existing files if we are doing so
            if options.write_hashes:
                if new_hashfile:
                    log.debug("Writing new hashfile:%s" % hashes_file)
                else:
                    log.debug("Updating stored hashes:%s" % hashes_file)
                # If things are consistent or we are forcing an update, do so
                if root_consistent or new_hashfile or options.force_write:
                    new.update(hashed_files)
                    if not write_hashes(hashes_file, new):
                        log.warning("Failed to write hashes (%s)" % hashes_file)
                        consistent = False
                    else:
                        log.debug("Updating stored hashes:%s" % hashes_file)
                        hashes_written = True
                else:
                    log.warning("Files have been altered in directory (%s) but --force-update is not enabled. Not updating stored hashes." % root)
            else:
                log.debug("Not writing hashes because -w is not set")
        else:
            log.debug("No need to write hashes")

    os.chdir(oldcwd)
    return consistent
def calculate_hash(file):
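    # Return the MD5 hex digest of `file`, read in 32 KiB chunks; on an IOError, log it,
    # count it, and return None.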
    global errors, empty_files
    md5 = hashlib.md5()
    try:
        with open(file, 'rb') as f:
            while True:
                chunk = f.read(32768)  # picked a number  # good choice: <http://stackoverflow.com/questions/1131220/get-md5-hash-of-a-files-without-open-it-in-python>
                md5.update(chunk)
                if not chunk:
                    return md5.hexdigest()
    except IOError, e:
        log.error("Error opening %s: %s" % (file, e))
        errors += 1
        return None
def read_stored_hashes(file):
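    # Generator: yield one (md5, filename, mtime, error) tuple per line of the stored hashes
    # file, with error=True for lines that don't match the expected format.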
    global errors
    try:
        with open(file, 'r') as f:
            for line in f:
                match = md5line.match(line)
                if match:
                    # If a line starts with \, the filename has escaped
                    # characters.  Python won't expect that so we strip them.
                    if match.group(1):
                        name = (match.group(3).replace("\\\\", "\\")
                                              .replace("\\\n", "\n"))
                    else:
                        name = match.group(3)
                    yield match.group(2), name, match.group(4), False  # 2=md5sum, 4=mtime, False=no error
                # If the match fails, return an error
                else:
                    msg = "Invalid syntax in hashes file %s:%s" % (file, line)
                    log.error(msg)
                    errors += 1
                    yield None, None, None, True
    except IOError, e:
        log.error("Error reading hashes file %s: %s" % (file, e))
        errors += 1
def write_hashes(hashfile, results):
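    # Write `results` ({filename: {'hash': ..., 'mtime': ...}}) to `hashfile` in an
    # md5sum-compatible format with the mtime appended; return True on success, False on error.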
    global errors, total_hashfiles_size
    try:
        # Record existing hashfile size to avoid double-counting it
        if options.summary:
            if os.path.isfile(hashfile):
                oldsize = os.stat(hashfile).st_size
            else:
                oldsize = 0
        with open(hashfile, 'w') as f:
            for name, file in results.iteritems():
                line = ""
                # The md5sum utility will prefix a line with \ if it contains
                # certain characters.  We'll do the same here for compatibility's
                # sake.  Read `info md5sum` for more info.
                if "\\" in name or "\n" in name:
                    name = (name.replace("\\", "\\\\")
                                .replace("\n", "\\\n"))
                    line = "\\"
                # Linux (and its md5sum) doesn't care if a file is binary or not,
                # so I'm not going to care either.  If you care you'll need to:
                #   1) Determine if the file is binary (most tools scan the file
                #      for a null char)
                #   2) If the file is binary, change the second space in this
                #      string to an `*'
                line = "%s%s  %s %s\n" % (line, file['hash'], name, file['mtime'])
                f.write(line)
        if options.summary:
            if oldsize:
                total_hashfiles_size -= oldsize
            total_hashfiles_size += os.stat(hashfile).st_size
        log.debug("Hashfile written:%s" % hashfile)
    except Exception, e:
        log.error("Couldn't write hashes file (%s): %s: %s" % (hashfile, Exception, e))
        errors += 1
        return False
    return True
def sizeof_fmt(num):
    for x in ['bytes','KB','MB','GB']:
        if num < 1024.0:
            return "%3.1f %s" % (num, x)
        num /= 1024.0
    return "%3.1f %s" % (num, 'TB')
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="CHAngedFIleFInder: Recursively scan directories and compare stored hashes to see if files have changed")
    parser.add_argument('directory', nargs='+')
    parser.add_argument("-f", "--force-write", action='store_true', dest="force_write", help="Write updated hashes even if inconsistencies are detected")
    parser.add_argument("-i", "--ignore-missing", action='store_true', help="Ignore missing files")
    parser.add_argument("-n", "--new-only", action='store_true', dest="new_only", help="Only hash new files")
    #parser.add_argument("-o", "--hash-file", metavar="HASHFILE_NAME", dest='hashfile_name', default=".hashes", help=("Name of stored-hashes file (default: %(default)s)"))  # it's safer to not have this, because if it was set to the wrong name it could clobber a real file
    hashfile_name = '.hashes'
    parser.add_argument("-s", "--summary", action="store_true", dest="summary", help=("Display summary when finished, including file sizes and hash file disk usage"))
    parser.add_argument("-t", "--trust-updated", action='store_true', dest="trust_updated", help="Trust files whose mtimes have changed (i.e. only detect corrupted files, not updated ones)")
    parser.add_argument("-v", "--verbose", action="count", dest="verbose", help="Print more output (up to -vv)")
    parser.add_argument("-w", "--write-hashes", action="store_true", dest="write_hashes", help="Write hashes to .hashes file in each directory (default false)")
    options = parser.parse_args()

    if options.verbose == 1:
        LOG_LEVEL = log.INFO
    elif options.verbose >= 2:
        LOG_LEVEL = log.DEBUG
    else:
        LOG_LEVEL = log.WARNING
    log.basicConfig(level=LOG_LEVEL, format="%(levelname)s: %(message)s")
    log.debug("Options: %s" % options)

    if not options.directory:
        log.critical("You need to specify the directories to process, silly. :)")
        parser.print_usage()
        sys.exit(2)

    # Verify arguments are directories
    quit = False
    for dir in options.directory:
        if not os.path.isdir(dir):
            log.critical("%s is not a directory" % dir)
            quit = True
    if quit:
        sys.exit(2)

    # If -f is specified, -w should probably be implied
    if options.force_write and not options.write_hashes:
        log.warning("-f was specified without -w. Forcing hash update anyway.")
        options.write_hashes = True

    # Iterate over directories
    consistent = True
    for dir in options.directory:
        if not process_directory(dir):  # Returns false when inconsistencies are found
            consistent = False

    # Catch accidental misuse
    if not something_was_done:
        log.error("Nothing was done. You probably scanned new directories without using -w.")
        sys.exit(1)

    if options.summary and not options.verbose:
        log.getLogger().setLevel(log.INFO)
    log.debug("Options were: %s" % options)
    log.info("-----SUMMARY-----")
    log.info("Total files scanned: %s" % total_files)
    log.info("Total size of files scanned: %s" % sizeof_fmt(total_filesize))
    log.info("Total size of hash files: %s" % sizeof_fmt(total_hashfiles_size))
    log.info("Changed files:%s Empty files:%s Missing files:%s New files:%s Skipped files:%s Updated files:%s Verified files:%s" % (changed_files, empty_files, missing_files, new_files, skipped_files, updated_files, verified_files))

    if consistent:
        if not options.new_only: log.info("Consistency verified.")
        if updated_files:
            log.info("Files were updated.")
        if hashes_written:
            log.info("Hashes were stored.")
        elif changed_files or new_files:
            log.info("Changes not stored.")
        if dirs_skipped:
            log.warning("%s directories were skipped because they had no stored hashes and -w was not used." % dirs_skipped)
            sys.exit(2)  # TODO: Should this really exit nonzero?  I think it's important to catch mistakes that people wouldn't notice or expect
    else:
        log.warning("INCONSISTENCIES DETECTED.")
        if dirs_skipped:
            log.warning("%s directories were skipped because they had no stored hashes and -w was not used." % dirs_skipped)
        if errors:
            log.warning("%s ERRORS OCCURRED." % errors)
        if hashes_written:
            log.warning("NEW HASHES WERE WRITTEN TO DISK. THESE INCONSISTENCIES WILL NOT BE NOTICED AGAIN.")
        sys.exit(1)