Created
April 1, 2019 12:57
-
-
Save rikwatson/b9d598678ae520c5147e58f8160d7183 to your computer and use it in GitHub Desktop.
This could be good for detecting bit rot in file systems.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# 2012-01-15: Modified by Adam Porter <adam@alphapapa.net> | |
# Based on md5verify.py by Wil Clouser | |
# <http://micropipes.com/blog/2011/01/30/md5verify-a-script-to-automatically-verify-file-integrity/> | |
# <https://github.com/clouserw/scripts/blob/master/md5verify.py> | |
# Redistribution and use in source and binary forms, with or without modification, | |
# are permitted provided that the following conditions are met: | |
# | |
# Redistributions of source code must retain the above copyright notice, this | |
# list of conditions and the following disclaimer. | |
# | |
# Redistributions in binary form must reproduce the above copyright | |
# notice, this list of conditions and the following disclaimer in the | |
# documentation and/or other materials provided with the distribution. | |
# | |
# The name of the author may not be used to endorse or promote | |
# products derived from this software without specific prior written | |
# permission. | |
# | |
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR | |
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY | |
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |
# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER | |
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR | |
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN | |
# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
import hashlib | |
import logging as log | |
import argparse | |
import os | |
import re | |
import sys | |
# Matches one line of a stored-hashes file: an optional leading "\"
# (md5sum's escape marker), a 32-hex-digit md5, a separator, a space-or-star
# text/binary flag, the filename, and the file's mtime appended at the end.
md5line = re.compile(r"^(\\?)([0-9a-f]{32}) [\ \*](.*)\ ([\d]+)$") # store the timestamp on the end (maybe incompatible with md5sum utility, but oh well)
# Counters accumulated across every directory processed; reported by the
# -s summary and used to pick the exit status.
errors = 0          # I/O failures and hash-file parse errors
changed_files = 0   # files whose current hash differs from the stored one
missing_files = 0   # files listed in a hashes file but no longer on disk
new_files = 0       # files seen that have no stored hash yet
skipped_files = 0   # already-hashed files skipped because of -n
empty_files = 0     # zero-byte files encountered
updated_files = 0   # files whose mtime changed (treated as updates with -t)
verified_files = 0  # files whose hash matched the stored one
dirs_skipped = 0    # directories skipped (no stored hashes and no -w)
total_hashfiles_size = 0  # total bytes consumed by .hashes files (-s only)
total_files = 0           # total number of files actually hashed (-s only)
total_filesize = 0        # total bytes hashed (-s only)
hashes_written = False    # set once any .hashes file has been (re)written
something_was_done = False # use this to catch accidental scan of previously-unscanned dirs without -w flag
# TODO: Use a database instead of .hashes files. Dumping those all over the place isn't so great.
# TODO: Store and check filesize?
# TODO: support gzipping hash files?
# FIXME: some filenames with weird characters (all I see is "?" in $(ls)) may cause hash files to have newlines and report bad syntax
def process_directory(dir):
    """Recursively walk `dir`, comparing each file's md5 against the hashes
    stored in a per-directory hashes file, and (when -w is set) writing
    updated hashes back out.

    Returns True when everything was consistent, False when any
    inconsistency (altered file, missing file, bad hashes-file syntax,
    or failed write) was found.  Updates many module-level counters as a
    side effect; reads `options` and `hashfile_name` set in __main__.
    """
    global errors, changed_files, missing_files, new_files, verified_files, dirs_skipped, skipped_files, empty_files, updated_files, hashes_written, something_was_done, total_files, total_hashfiles_size, total_filesize
    consistent = True # Nothing bad found yet
    oldcwd = os.getcwd()
    def get_mtime(file):
        # Return the file's modification time as a non-negative int.
        mtime = int(os.stat(file).st_mtime) # .st_mtime adds a .0 but accessing it with [8] doesn't. weird.
        # stupidly there was once a file with an mtime of around the year 1901
        # this works around it, since my regex skills are lacking at the moment
        if mtime < 0: mtime = 0
        return mtime
    def scan_file(file):
        # Hash a single file; with -s, fold its size into the summary totals.
        global empty_files, total_filesize, total_files
        size = os.stat(file).st_size
        hash = calculate_hash(file)
        log.debug("Actual hash:%s" % hash)
        if options.summary:
            if not size:
                empty_files += 1
            else:
                total_filesize += size
            total_files += 1
        return hash
    for root, dirs, files in os.walk(dir, onerror=log.error):
        log.info("Processing directory %s" % root)
        if not files:
            log.info("No files in directory; skipping")
            continue
        # chdir so stored filenames can be used as-is (they are relative)
        os.chdir(root)
        updated = False # We haven't detected any changes
        new_hashfile = False
        root_consistent = True # this is per dir walked, not per argument
        hashes_file = os.path.join(root, hashfile_name)
        new = {}
        hashed_files = {}
        # Find a stored hashes file
        if os.path.isfile(hashes_file):
            log.info("Found existing hashes")
            if options.summary:
                total_hashfiles_size += os.stat(hashes_file).st_size
            # Iterate over stored hashes
            for hash, file, mtime, error in read_stored_hashes(hashes_file):
                # If there was an error reading the hash from the hashes file
                if error:
                    consistent = False
                    root_consistent = False
                    log.warning("Error in hashes file (%s)" % hashes_file)
                # Otherwise, the hash was read from the hashes file
                else:
                    # Add existing file to list
                    if os.path.isfile(file):
                        hashed_files[file] = {'hash': hash, 'mtime': mtime}
                    # The file is missing
                    else:
                        updated = True
                        missing_files += 1
                        if not options.ignore_missing:
                            root_consistent = False
                            consistent = False
                            log.warning("FILE MISSING: %s" % (os.path.join(root, file)))
        # If there's no hash file and we're not storing hashes, there's no need to hash all the files
        elif not options.write_hashes:
            # make sure the dir actually has files in it, not just symlinks, before throwing a warning
            has_files = False
            for file in files:
                if os.path.isfile(file):
                    has_files = True
                    break
            if has_files:
                log.warning("No hashes found and not storing hashes; skipping directory (%s)" % root)
                dirs_skipped += 1
                continue
        # Therefore, there is no hash file, and we ARE storing hashes
        else:
            new_hashfile = True
            log.debug("Hash file not found")
        # Reaching this point means the directory was actually processed
        # (not skipped), so the "nothing was done" check won't fire.
        something_was_done = True
        # Iterate over existing files
        for file in sorted(files):
            # Skip non-files
            if not os.path.isfile(file):
                log.debug("Skipping non-file: %s" % file)
                continue
            # skip stored hashes file TODO: checksum the stored hash file too?
            if file == hashfile_name:
                log.debug("Skipping hashes file")
                continue
            file_consistent = True
            log.info("CHECKING FILE\t%s" % file)
            # If we have a stored hash for the file
            if file in hashed_files:
                log.debug("Stored hash:%s\tstored mtime:%s" % (hashed_files[file]['hash'], hashed_files[file]['mtime']))
                if options.new_only:
                    log.debug("Skipping existing file")
                    skipped_files += 1
                    continue
                mtime = str(get_mtime(file))
                log.debug("mtime:%s stored mtime:%s" % (mtime, hashed_files[file]['mtime']))
                # If we trust updated files, there's no need to hash one whose mtime has changed, unless we're writing changes
                if options.trust_updated and (mtime != hashed_files[file]['mtime']) and not options.write_hashes:
                    log.debug("Not hashing updated file")
                    updated_files += 1
                    continue
                # actually scan the file
                hash = scan_file(file)
                # If the stored hash differs from the current one
                if hashed_files[file]['hash'] != hash:
                    updated = True
                    changed_files += 1
                    # If we consider updated mtimes to be innocuous
                    if options.trust_updated:
                        # If the mtime is different, it just means the file was updated
                        if hashed_files[file]['mtime'] != mtime:
                            updated_files += 1
                            log.info("Hash differs but mtime updated")
                        # Otherwise, if the mtime is the same, the file was altered while keeping the mtime the same
                        else:
                            log.warning("FILE ALTERED (%s): mtime is unchanged (%s)" % (os.path.join(root, file), mtime))
                            consistent = False
                            root_consistent = False
                            file_consistent = False
                    # Otherwise, we only care that the hash is different
                    else:
                        log.debug("Hash differs")
                        if hashed_files[file]['mtime'] != mtime:
                            updated_files += 1
                        consistent = False
                        root_consistent = False
                        file_consistent = False
                    # Warn if inconsistent
                    if file_consistent == False:
                        if os.stat(file).st_size == 0:
                            log.warning("File is now empty (%s)" % os.path.join(root, file))
                        else:
                            log.warning('Hash changed for file (%s)\tOld: %s\tNew: %s' % (os.path.join(root, file), hashed_files[file]['hash'], hash))
                    # Update existing files if we are doing so
                    if options.write_hashes:
                        # If things are consistent or we are forcing an update, do so
                        if file_consistent or options.force_write:
                            if options.force_write:
                                log.info('Forcing hash update of altered file (%s)' % hashed_files[file])
                            else:
                                log.debug("Adding new hash to list")
                            hashed_files[file] = {'hash': hash, 'mtime': mtime}
                else:
                    log.info("Hash matches")
                    verified_files += 1
            # Otherwise, it's a new file
            else:
                new_files += 1
                log.info("File is new")
                # this could go below the next check for speed, but this gives more accurate stats
                if os.stat(file).st_size == 0:
                    empty_files += 1
                if not options.write_hashes:
                    log.debug("Not writing hash")
                    continue
                updated = True
                mtime = get_mtime(file)
                # actually scan the file
                hash = scan_file(file)
                new[file] = {'hash': hash, 'mtime': mtime}
        log.debug("Finished processing directory %s" % root)
        log.debug("updated:%s\tdirectory consistent:%s\tforce-write:%s" % (updated, root_consistent, options.force_write))
        # Finished iterating over existing files; now do we need to write hashes?
        if updated or new_hashfile or options.force_write:
            log.debug("Need to write hashes")
            # Update existing files if we are doing so
            if options.write_hashes:
                if new_hashfile:
                    log.debug("Writing new hashfile:%s" % hashes_file)
                else:
                    log.debug("Updating stored hashes:%s" % hashes_file)
                # If things are consistent or we are forcing an update, do so
                if root_consistent or new_hashfile or options.force_write:
                    # merge surviving stored hashes into the new-file dict
                    new.update(hashed_files)
                    if not write_hashes(hashes_file, new):
                        log.warning("Failed to write hashes (%s)" % hashes_file)
                        consistent = False
                    else:
                        log.debug("Updating stored hashes:%s" % hashes_file)
                        hashes_written = True
                else:
                    log.warning("Files have been altered in directory (%s) but --force-update is not enabled. Not updating stored hashes." % root)
            else:
                log.debug("Not writing hashes because -w is not set")
        else:
            log.debug("No need to write hashes")
    os.chdir(oldcwd)
    return consistent
def calculate_hash(file): | |
global errors, empty_files | |
md5 = hashlib.md5() | |
try: | |
with open(file, 'rb') as f: | |
while True: | |
chunk = f.read(32768) # picked a number # good choice: <http://stackoverflow.com/questions/1131220/get-md5-hash-of-a-files-without-open-it-in-python> | |
md5.update(chunk) | |
if not chunk: | |
return md5.hexdigest() | |
except IOError, e: | |
log.error("Error opening %s: %s" % (file, e)) | |
errors += 1 | |
return None | |
def read_stored_hashes(file):
    """Generate (hash, filename, mtime, error) tuples from a hashes file.

    Each well-formed line yields (md5_hex, filename, mtime_str, False).
    A malformed line is logged, counted in the global `errors`, and
    yielded as (None, None, None, True) so the caller can flag the
    directory as inconsistent.  An unreadable file logs, counts an
    error, and yields nothing.
    """
    global errors
    try:
        with open(file, 'r') as f:
            for line in f:
                match = md5line.match(line)
                if match:
                    # If a line starts with \, the filename has escaped
                    # characters. Python won't expect that so we strip them.
                    if match.group(1):
                        name = (match.group(3).replace("\\\\", "\\")
                                .replace("\\\n", "\n"))
                    else:
                        name = match.group(3)
                    yield match.group(2), name, match.group(4), False # 2=md5sum, 4=mtime, False=no error
                # If the match fails, return an error
                else:
                    log.error("Invalid syntax in hashes file %s:%s" % (file, line))
                    errors += 1
                    yield None, None, None, True
    # "except IOError, e" is Python-2-only; "as" works on 2.6+ and 3.x
    except IOError as e:
        log.error("Error reading hashes file %s: %s" % (file, e))
        errors += 1
def write_hashes(hashfile, results):
    """Write `results` ({filename: {'hash': ..., 'mtime': ...}}) to `hashfile`.

    Lines are written in md5sum-compatible form (plus the mtime appended)
    so `read_stored_hashes` can parse them back.  Returns True on success;
    on any failure logs the error, bumps the global `errors` counter, and
    returns False.  With -s, keeps `total_hashfiles_size` accurate by
    subtracting the file's previous size before adding the new one.
    """
    global errors, total_hashfiles_size
    try:
        # Record existing hashfile size to avoid double-counting it
        if options.summary:
            oldsize = os.stat(hashfile).st_size if os.path.isfile(hashfile) else 0
        with open(hashfile, 'w') as f:
            # sorted() gives deterministic output; .iteritems() was Python-2-only
            for name, file in sorted(results.items()):
                line = ""
                # The md5sum utility will prefix a line with \ if it contains
                # certain characters. We'll do the same here for compatibilty's
                # sake. Read `info md5sum` for more info.
                if "\\" in name or "\n" in name:
                    name = (name.replace("\\", "\\\\")
                            .replace("\n", "\\\n"))
                    line = "\\"
                # Two chars follow the hash: a space separator plus the
                # text/binary flag (md5sum uses '*' for binary; Linux doesn't
                # care, so we always write the text-mode space).  The reader
                # regex `md5line` requires both characters.
                line = "%s%s  %s %s\n" % (line, file['hash'], name, file['mtime'])
                f.write(line)
        if options.summary:
            if oldsize:
                total_hashfiles_size -= oldsize
            total_hashfiles_size += os.stat(hashfile).st_size
        log.debug("Hashfile written:%s" % hashfile)
    # "except Exception, e" is Python-2-only; also report the concrete
    # exception type rather than the bare Exception class.
    except Exception as e:
        log.error("Couldn't write hashes file (%s): %s: %s" % (hashfile, type(e).__name__, e))
        errors += 1
        return False
    return True
def sizeof_fmt(num):
    """Format a byte count as a human-readable string, e.g. "3.5 MB".

    Scales by powers of 1024; anything at or above 1024 GB is reported
    in terabytes.
    """
    value = float(num)
    for suffix in ('bytes', 'KB', 'MB', 'GB'):
        if value < 1024.0:
            return "%3.1f %s" % (value, suffix)
        value /= 1024.0
    return "%3.1f %s" % (value, 'TB')
if __name__ == "__main__":
    # ---- Command-line interface ----
    parser = argparse.ArgumentParser(description="CHAngedFIleFInder: Recursively scan directories and compare stored hashes to see if files have changed")
    parser.add_argument('directory', nargs='+')
    parser.add_argument("-f", "--force-write", action='store_true', dest="force_write", help="Write updated hashes even if inconsistencies are detected")
    parser.add_argument("-i", "--ignore-missing", action='store_true', help="Ignore missing files")
    parser.add_argument("-n", "--new-only", action='store_true', dest="new_only", help="Only hash new files")
    #parser.add_argument("-o", "--hash-file", metavar="HASHFILE_NAME", dest='hashfile_name', default=".hashes", help=("Name of stored-hashes file (default: %(default)s)")) # it's safer to not have this, because if it was set to the wrong name it could clobber a real file
    hashfile_name = '.hashes'
    parser.add_argument("-s", "--summary", action="store_true", dest="summary", help=("Display summary when finished, including file sizes and hash file disk usage"))
    parser.add_argument("-t", "--trust-updated", action='store_true', dest="trust_updated", help="Trust files whose mtimes have changed (i.e. only detect corrupted files, not updated ones)")
    parser.add_argument("-v", "--verbose", action="count", dest="verbose", help="Print more output (up to -vv)")
    parser.add_argument("-w", "--write-hashes", action="store_true", dest="write_hashes", help="Write hashes to .hashes file in each directory (default false)")
    options = parser.parse_args()
    # Map -v / -vv onto logging levels; default is warnings only.
    # NOTE(review): with no -v, options.verbose is None; "None >= 2" is
    # False on Python 2 but raises TypeError on Python 3 — confirm before
    # running under Python 3.
    if options.verbose == 1:
        LOG_LEVEL = log.INFO
    elif options.verbose >=2:
        LOG_LEVEL = log.DEBUG
    else:
        LOG_LEVEL = log.WARNING
    log.basicConfig(level=LOG_LEVEL, format="%(levelname)s: %(message)s")
    log.debug("Options: %s" % options)
    if not options.directory:
        log.critical("You need to specify the directories to process, silly. :)")
        parser.print_usage()
        sys.exit(2)
    # Verify arguments are directories
    quit = False
    for dir in options.directory:
        if not os.path.isdir(dir):
            log.critical("%s is not a directory" % dir)
            quit = True
    if quit:
        sys.exit(2)
    # If -f is specified, -w should probably be implied
    if options.force_write and not options.write_hashes:
        log.warning("-f was specified without -w. Forcing hash update anyway.")
        options.write_hashes = True
    # Iterate over directories
    consistent = True
    for dir in options.directory:
        if not process_directory(dir): # Returns false when inconsistencies are found
            consistent = False
    # Catch accidental misuse
    if not something_was_done:
        log.error("Nothing was done. You probably scanned new directories without using -w.")
        sys.exit(1)
    # ---- Summary and exit status ----
    # With -s but not -v, raise the log level so the INFO summary shows.
    if options.summary and not options.verbose:
        log.getLogger().setLevel(log.INFO)
    log.debug("Options were: %s" % options)
    log.info("-----SUMMARY-----")
    log.info("Total files scanned: %s" % total_files)
    log.info("Total size of files scanned: %s" % sizeof_fmt(total_filesize))
    log.info("Total size of hash files: %s" % sizeof_fmt(total_hashfiles_size))
    log.info("Changed files:%s Empty files:%s Missing files:%s New files:%s Skipped files:%s Updated files:%s Verified files:%s" % (changed_files, empty_files, missing_files, new_files, skipped_files, updated_files, verified_files))
    if consistent:
        if not options.new_only: log.info("Consistency verified.")
        if updated_files:
            log.info("Files were updated.")
        if hashes_written:
            log.info("Hashes were stored.")
        elif changed_files or new_files:
            log.info("Changes not stored.")
        if dirs_skipped:
            log.warning("%s directories were skipped because they had no stored hashes and -w was not used." % dirs_skipped)
            sys.exit(2) # TODO: Should this really exit nonzero? I think it's important to catch mistakes that people wouldn't notice or expect
    else:
        log.warning("INCONSISTENCIES DETECTED.")
        if dirs_skipped:
            log.warning("%s directories were skipped because they had no stored hashes and -w was not used." % dirs_skipped)
        if errors:
            log.warning("%s ERRORS OCCURRED." % errors)
        if hashes_written:
            log.warning("NEW HASHES WERE WRITTEN TO DISK. THESE INCONSISTENCIES WILL NOT BE NOTICED AGAIN.")
        sys.exit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment