@dobrokot
Last active August 29, 2015 14:02
dobro_backup.py - fast incremental backup
#!/usr/bin/env python
USAGE = '''
Incremental backup program.
Usage: python "prog_name" destination_dir path1/source1 ... pathN/sourceN
The copied result will be in destination_dir/DATE_TIME/source1 ... destination_dir/DATE_TIME/sourceN
The files pathN/sourceN/hash_dir_info.txt are also created (so write access to the sources is required).
When a backup is complete, destination_dir/DATE_TIME/backup.complete is created, with timestamp info.
Duplicate files are hardlinked to the previous backup directory, to save space and time.
Deletion of old backup directories is safe.
Encryption and compression can be provided by the file system (TrueCrypt + NTFS, for example).
Hardlinks are currently made only against the most recent backup (you may manually move/copy older files into the last backup).
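Example (illustrative; the paths below are placeholders):
    python dobro_backup.py D:/backups C:/music C:/work/documents
creates something like:
    D:/backups/backup_2014-06-09_21-30-00/music/...
    D:/backups/backup_2014-06-09_21-30-00/documents/...
    D:/backups/backup_2014-06-09_21-30-00/hash_dir_info.txt
    D:/backups/backup_2014-06-09_21-30-00/backup.complete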
'''
# Copyright (c) 2014 Ivan Yanikov
# The MIT License (MIT)
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#------------------------------------------------
# known missing features and issues:
#
# todo: allow creating hardlinks to older backups, to handle a changed source set
#   ( ["music", "documents", "video"] -> ["music", "documents"] -> ["music", "documents", "video"] ).
# todo: avoid needless rehashing of the internal file hash_dir_info.txt
# todo: catch "source inside destination" errors and vice versa
# todo: allow renaming source targets
# todo: allow storing hash_dir_info.txt outside the source directories
# todo: allow excluding dirs by mask
# todo: don't spam about known unreadable dirs ( Microsoft\Windows\WER\ReportQueue )
#------------------------------------------------
import hashlib
import os, sys, shutil, stat
import time
import itertools
import re
import traceback
# Recursively enumerates files in a directory.
# Returns a list of tuples (relative_path, file_modified_timestamp, file_size).
# Cross-platform version for any OS.
def _listfiles_py(path, relpath, result):
    if path:
        path += os.sep
    try:
        files = os.listdir(path)
    except EnvironmentError:
        traceback.print_exc()
        print >> sys.stderr, 'warning: skipping "' + path.encode('UTF-8') + '"'
        return result
    for fn in files:
        fullfn = path + fn
        relpath2 = (relpath + os.sep if relpath else '') + fn
        try:
            st = os.stat(fullfn)
        except EnvironmentError:
            traceback.print_exc()
            print >> sys.stderr, 'warning: skipping "' + fullfn.encode('UTF-8') + '"'
            continue
        file_type = stat.S_IFMT(st.st_mode)
        if file_type == stat.S_IFDIR:
            _listfiles_py(fullfn, relpath2, result)
        elif file_type == stat.S_IFREG:
            result.append((relpath2, int(st.st_mtime), st.st_size))
        # soft-links are skipped
    return result
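# For illustration, with hypothetical paths and values, a call like
#   _listfiles_py(u'/home/user/music', u'', [])
# might return:
#   [(u'a.mp3', 1401625200, 5242880), (u'album/b.mp3', 1401625300, 4194304)]
# (on Windows the separator inside relative paths is os.sep, i.e. '\\').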
win32file = None
if os.name == 'nt':
    import ctypes
    import ctypes.wintypes
    from ctypes import WinError
    from ctypes.wintypes import BOOL
    if not hasattr(os, 'link'):  # Python 2 on Windows does not provide os.link for hardlinks
        CreateHardLink = ctypes.windll.kernel32.CreateHardLinkW
        CreateHardLink.argtypes = [ctypes.c_wchar_p, ctypes.c_wchar_p, ctypes.c_void_p]
        CreateHardLink.restype = BOOL
        def mk_hard_link(src, dst):
            res = CreateHardLink(dst, src, None)
            if res == 0:
                raise WinError()
    else:
        mk_hard_link = os.link
    FindFirstFile = ctypes.windll.kernel32.FindFirstFileW
    FindNextFile = ctypes.windll.kernel32.FindNextFileW
    FindClose = ctypes.windll.kernel32.FindClose
    GetLastError = ctypes.windll.kernel32.GetLastError
    # Optimization for Windows: FindFirstFileW is much faster than (os.listdir + os.stat).
    # Enumerates the immediate (non-recursive) files and subdirectories of a directory.
    # Returns a list of tuples (file_name, file_modified_timestamp, file_size, is_directory).
    def FindFilesWin32(path):
        # pylint: disable=E0611
        from exceptions import WindowsError
        out = ctypes.wintypes.WIN32_FIND_DATAW()
        fldr = FindFirstFile(unicode(path) + u'\\*', ctypes.byref(out))
        if fldr == -1:  # INVALID_HANDLE_VALUE
            raise WindowsError("can't list files in %s" % repr(path))
        result = []
        try:
            while 1:
                fn = out.cFileName
                if fn != u'.' and fn != u'..':
                    isdir = bool(out.dwFileAttributes & 0x10)  # FILE_ATTRIBUTE_DIRECTORY
                    ts = out.ftLastWriteTime
                    win_filetime = (ts.dwHighDateTime << 32) | ts.dwLowDateTime
                    secs_between_epochs = 11644473600  # seconds between 1601-01-01 and 1970-01-01
                    timestamp = win_filetime / 10000000 - secs_between_epochs
                    size = (out.nFileSizeHigh << 32) | out.nFileSizeLow
                    result.append((out.cFileName, timestamp, size, isdir))
                if not FindNextFile(fldr, ctypes.byref(out)):
                    win_error = GetLastError()
                    if win_error == 18:  # ERROR_NO_MORE_FILES
                        break
                    exc = WindowsError(win_error)  # todo: test
                    exc.filename = path
                    raise exc
        finally:
            FindClose(fldr)
        return result
    # Recursively enumerates files in a directory.
    # Returns a list of tuples (relative_path, file_modified_timestamp, file_size).
    # Specialized version for Windows.
    def ListFilesWin32(path, relpath, result):
        if path:
            path += os.sep
        try:
            files = FindFilesWin32(path)
        except EnvironmentError:
            traceback.print_exc()
            print >> sys.stderr, 'warning: skipping "' + path.encode('UTF-8') + '"'
            return result
        for (fn, ts, sz, isdir) in files:
            fullfn = path + fn
            relpath2 = (relpath + os.sep if relpath else '') + fn
            if isdir:
                ListFilesWin32(fullfn, relpath2, result)
            else:
                result.append((relpath2, ts, sz))
            # soft-links are skipped
        return result
    _listfiles = ListFilesWin32
else:  # non-Windows versions of mk_hard_link and _listfiles
    mk_hard_link = os.link
    _listfiles = _listfiles_py
# Recursively enumerates files in a directory.
# Returns a list of tuples (relative_path, file_modified_timestamp, file_size).
def listfiles(path):
    return _listfiles(unicode(path), u'', [])
def md5_for_file(filename):
    # pylint: disable=E1101
    md5 = hashlib.md5()
    with open(filename, 'rb') as f:
        while True:
            data = f.read(2**20)
            if not data:
                break
            md5.update(data)
    return md5.hexdigest()
# Read a text file and load it into a dictionary {filename -> (md5, file_size, file_modified_timestamp)}.
def load_hashes(hf_txt):
    # different systems can have different file-modified-time formats
    if not os.path.isfile(hf_txt):
        return {}
    try:
        res = {}
        with open(hf_txt, 'rb') as f:
            for l in f:
                md5, size, mtime, filename = l.rstrip('\r\n').split('\t', 3)
                res[filename.replace('/', os.sep).decode('UTF-8')] = (md5, int(size), int(mtime))
        return res
    except EnvironmentError:
        traceback.print_exc()
        return {}
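# A sample hash_dir_info.txt line, as produced by 'write_hashes' below; the four
# fields are separated by tab characters. The path and mtime here are made up;
# the md5 shown is the real hash of an empty (zero-byte) file:
#   d41d8cd98f00b204e9800998ecf8427e  0  1401625200  music/empty.mp3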
# Pretty-format a file size: 81731298312 -> '76.1GB'.
def filesize_fmt(num):
    for x in ['bytes', 'KB', 'MB', 'GB']:
        if -1024.0 < num < 1024.0:
            return "%3.1f%s" % (num, x)
        num /= 1024.0
    return "%3.1f%s" % (num, 'TB')
# Recalculates hashes of files whose size or timestamp changed (or that are new);
# hashes of unchanged files are taken from prev_info. Also calls
# 'file_processed_callback' on each file (so the caller can use the file
# while it is still fresh in the OS file-system cache).
def get_hashes(dir, prev_info, file_processed_callback):
    print >> sys.stderr, 'info: traversing', repr(dir)
    current_files = listfiles(dir)
    result = []
    for relpath, mtime, size in current_files:
        prev_file_info = prev_info.get(relpath)
        md5 = None
        if prev_file_info:
            prev_md5, prev_size, prev_mtime = prev_file_info
            if prev_size == size and prev_mtime == mtime:
                md5 = prev_md5
        result.append((relpath, md5, size, mtime))
    recalc_count = 0
    recalc_size = 0
    total_size = 0
    for relpath, md5, size, mtime in result:
        total_size += size
        if md5 is None:
            recalc_count += 1
            recalc_size += size
    print >> sys.stderr, 'info: found %d files of size %s, need to rehash %d files of size %s' % (
        len(result), filesize_fmt(total_size),
        recalc_count, filesize_fmt(recalc_size))
    tmp = []
    # todo: make this incremental, to allow restarting from an interrupted point
    for (relpath, md5, size, mtime) in result:
        if md5 is None:
            md5 = md5_for_file(os.path.join(dir, relpath))
        tmp.append((relpath, md5, size, mtime))
        if file_processed_callback:
            # right after md5_for_file the file is in the OS file cache, so copy it immediately
            file_processed_callback(relpath, md5, size, mtime)
    result = tmp
    result.sort()
    return result
def write_hashes(hashes_data_list, hf_txt):
    with open(hf_txt + '.tmp', 'wb') as f:
        for relpath, md5, size, mtime in hashes_data_list:
            print >> f, '%s\t%s\t%s\t%s' % (md5, size, mtime, relpath.encode('UTF-8').replace(os.sep, '/'))
    if os.path.exists(hf_txt):
        os.unlink(hf_txt)
    os.rename(hf_txt + ".tmp", hf_txt)
# Calls 'get_hashes' on a directory and writes the fresh hashes to a file.
# See also the 'get_hashes' function.
def hash_dir(dir, file_processed_callback=None):
    hf_txt = os.path.join(dir, 'hash_dir_info.txt')
    prev_info = load_hashes(hf_txt)
    with open(hf_txt + '.tmp', 'wb') as _f:
        # check that we can open the file for writing
        pass
    start = time.time()
    result = get_hashes(dir, prev_info, file_processed_callback)
    print >> sys.stderr, 'info: hashing', repr(dir), int(time.time() - start), 'seconds'
    write_hashes(result, hf_txt)
    return result
# Puts md5 hashes of dst/**/* into the file dst/'hash_dir_info.txt'.
# For files present in 'dst_rel_path2md5' the md5 is taken from that mapping;
# otherwise the hash is recalculated.
def write_dst_hashes(dst, dst_rel_path2md5):
    result = []
    for relpath, mtime, size in listfiles(dst):
        md5 = dst_rel_path2md5.get(relpath)
        if md5 is None:
            md5 = md5_for_file(os.path.join(dst, relpath))
        result.append((relpath, md5, size, mtime))
    result.sort()
    write_hashes(result, os.path.join(dst, 'hash_dir_info.txt'))
# Recursively deletes a file tree;
# handles some real-life problems with unremovable files.
def my_rmtree(dir):
    def remove_readonly(fn, path, _excinfo):
        os.chmod(path, stat.S_IWRITE)  # handle read-only files
        fn(path)
    shutil.rmtree(unicode(dir), onerror=remove_readonly)
# Some checks for invalid arguments and access rights.
def backup_fast_checks(dst_prev, dst_tmp, srcs):
    assert dst_prev is None or os.path.isdir(dst_prev), dst_prev
    for src in srcs:
        if not os.path.isdir(src):
            print '"%s" is not a directory!' % src
            exit(1)
    open(os.path.join(dst_tmp, "test_dir_is_writable"), "wb").close()
    os.unlink(os.path.join(dst_tmp, "test_dir_is_writable"))
    src_basenames = sorted((get_src_basename(src), src) for src in srcs)
    for src_base, srcs_grouped in itertools.groupby(src_basenames, key=lambda (base, s): base):
        srcs_grouped = list(x[1] for x in srcs_grouped)
        if len(srcs_grouped) >= 2:
            raise Exception("name '%s' is duplicated for %s" % (src_base, srcs_grouped))
# Get the basename of a path. Tolerates trailing/// slashes and a Windows drive letter.
def get_src_basename(x):
    assert os.path.normpath('xxx///') == 'xxx'
    if os.name == 'nt':
        assert os.path.normpath('x://\\//\\') == 'x:\\'
        assert os.path.normpath('xxx//\\//\\') == 'xxx'
    x = os.path.normpath(x).rstrip(os.sep)
    if os.name == 'nt' and x[-1:] == ':':
        x = x[0:-1]  # remove ':' from "drive:"
    return os.path.basename(x)
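# For example: get_src_basename('a/b/c///') == 'c',
# and on Windows get_src_basename('d:') == 'd'.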
# Convert a bare drive path to a directory path on Windows.
def win_drive_to_dir(x):
    if os.name == 'nt' and x[-1:] == ':':
        # http://stackoverflow.com/questions/7258993/pythons-os-listdir-behaviour-on-windows
        # the path 'X:' is interpreted as 'the current directory on drive X', which is not 'X:\\'
        x = x + '\\'
    return x
# Load the mapping from md5 to filename in the previous backup (to hardlink against it later).
def load_prev_dst_hashes(dst_prev):
    if dst_prev is None:
        return {}
    print >> sys.stderr, 'info: loading prev_dst="%s" hashes...' % dst_prev
    dst_prev_hash_fn = os.path.join(dst_prev, "hash_dir_info.txt")
    if not os.path.isfile(dst_prev_hash_fn):
        print >> sys.stderr, "warning: there is no '%s'" % dst_prev_hash_fn
    dst_prev_hashes = get_hashes(dst_prev, load_hashes(dst_prev_hash_fn), None)
    md5to_old = {}
    # avoid randomizing which file gets hardlinked among files with the same content: sort by name, take the last
    for (relpath, md5, _size, _mtime) in sorted(dst_prev_hashes):
        md5to_old[md5] = os.path.join(dst_prev, relpath)
    return md5to_old
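# For illustration (hypothetical values), the returned mapping might look like:
#   {'9e107d9d372bb6826bd81d3542a419d6': u'D:/backups/backup_2014-06-08_21-30-00/music/a.mp3'}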
# Create hardlinks to dst_prev, or copy srcs files into dst_tmp.
# If dst_tmp is 'x/y/z' and a source dir is 'a/b/c',
# then 'a/b/c' is copied to 'x/y/z/c'.
def backup_impl(dst_prev, dst_tmp, srcs):
    srcs = [win_drive_to_dir(src) for src in srcs]
    if not os.path.isdir(dst_tmp):
        os.makedirs(dst_tmp)
    backup_fast_checks(dst_prev, dst_tmp, srcs)
    md5to_old = load_prev_dst_hashes(dst_prev)
    dst_rel_path2md5 = {}
    for src in srcs:
        print >> sys.stderr, 'info: copying src="%s"...' % src
        src_base = get_src_basename(src)
        current_dst = os.path.join(dst_tmp, src_base)
        if os.path.exists(current_dst):
            print >> sys.stderr, 'warning: deleting files from backup', repr(current_dst)
            my_rmtree(current_dst)  # todo: don't delete everything; track files already copied to dst_tmp by a previous run (in a database)
        if not os.path.isdir(current_dst):
            os.makedirs(current_dst)
        created_dirs = set()  # don't re-check known directories
        def file_processed_callback(relpath, md5, size, _mtime):
            full_dst_fn = os.path.join(current_dst, relpath)
            full_dst_dir = os.path.dirname(full_dst_fn)
            if full_dst_dir not in created_dirs:
                if not os.path.isdir(full_dst_dir):
                    os.makedirs(full_dst_dir)
                created_dirs.add(full_dst_dir)
            try:
                if size == 0:
                    # the NTFS hardlink limit is 1023 links per file, and empty files are the most frequent duplicates
                    open(full_dst_fn, 'wb').close()
                elif md5 in md5to_old:
                    #print >> sys.stderr, 'verbose: take cached', repr(relpath) # a lot of spam
                    mk_hard_link(md5to_old[md5], full_dst_fn)
                else:
                    if dst_prev is not None:  # avoid log spam: on the first run everything is a full copy
                        print >> sys.stderr, 'verbose: full copy', repr(relpath)
                    shutil.copy(os.path.join(src, relpath), full_dst_fn)
                dst_rel_path = os.path.join(src_base, relpath)
                dst_rel_path2md5[dst_rel_path] = md5
            except EnvironmentError:
                traceback.print_exc()
                print >> sys.stderr, 'warning: skipping', repr(src) + repr(relpath)
        hash_dir(src, file_processed_callback)
    write_dst_hashes(dst_tmp, dst_rel_path2md5)
    # todo: start validation here, checking that the dst hard-drive can be read back (protection against bad blocks)
# Calls the 'backup_impl' function. See also 'backup_impl'.
# Measures time, creates the backup.complete file, moves the 'dst.tmp' directory to 'dst'.
def backup(dst_prev, dst, srcs):
    st = time.time()
    print >> sys.stderr, 'info: starting backup from "%s" to "%s" for sources=%s' % (dst_prev, dst, srcs)
    if os.path.exists(dst):
        print >> sys.stderr, 'ERROR! Directory "%s" already exists. Backup aborted, to avoid corrupting an old backup.' % dst
        exit(1)
    backup_impl(dst_prev, dst + ".tmp", srcs)
    os.rename(dst + ".tmp", dst)
    backup_complete_file = os.path.join(dst, 'backup.complete')
    with open(backup_complete_file, 'wb') as f:
        print >> f, 'last backup time: "%s"' % time.time()
    print >> sys.stderr, 'info: done backup from "%s" to "%s" for sources=%s' % (dst_prev, dst, srcs)
    print >> sys.stderr, 'info: total time: %s seconds' % (int(time.time() - st))
# Select the directory with the most recent root/*/backup.complete file.
def select_last_previous_backup(root):
    root = win_drive_to_dir(root)
    alternatives = []
    if not os.path.isdir(root):
        print '"%s" is not a directory!' % root
        exit(1)
    for dir in os.listdir(root):
        dst_prev = os.path.join(root, dir)
        if os.path.isdir(dst_prev):
            backup_complete_file = os.path.join(dst_prev, 'backup.complete')
            if os.path.isfile(backup_complete_file):
                with open(backup_complete_file) as f:
                    try:
                        m = re.match('last backup time: "(.*)"$', f.read().strip())
                        backup_time = float(m.group(1))
                        alternatives.append((backup_time, dst_prev))
                    except Exception:
                        traceback.print_exc()
                        print >> sys.stderr, 'warning: corrupted "%s"' % backup_complete_file
    dst_prev = None
    if alternatives:
        _backup_time, dst_prev = max(alternatives)
        print >> sys.stderr, 'info: directory "%s" is used as the previous backup' % dst_prev
    else:
        print >> sys.stderr, 'info: no previous backup (with a backup.complete file) found at "%s/*/backup.complete", a full copy will be performed' % root
    return dst_prev
def main():
    if len(sys.argv) < 3:
        prog_name = sys.argv[0] if sys.argv else ''
        print >> sys.stderr, USAGE.replace("prog_name", prog_name)
        exit(1)
    root = sys.argv[1]
    srcs = sys.argv[2:]
    dst_basename = time.strftime("backup_%Y-%m-%d_%H-%M-%S")
    backup(select_last_previous_backup(root), os.path.join(win_drive_to_dir(root), dst_basename), srcs)
# Compare the results of ListFilesWin32 and _listfiles_py, write the output to files, and print the timing.
def my_listdir_functions_test():
    for lf, out in [(ListFilesWin32, 'b.txt'), (_listfiles_py, 'a.txt')]:
        t = time.time()
        with open(out, 'wb') as f:
            path = u"c:"
            for relpath, mtime, size in lf(unicode(path), u'', []):
                relpath = relpath.replace('\\', '/').encode('UTF-8')
                if 'test/string/trunk/bin/backup' not in relpath:  # my development source directory
                    print >> f, '%s\t%s\t%s' % (relpath, mtime, size)
        print >> sys.stderr, 'TIME:', lf, time.time() - t
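# my_listdir_functions_test() is not called by default; to benchmark the two
# listers on Windows, one could call it below instead of main().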
if __name__ == "__main__":
    main()