@dobrokot
Last active August 29, 2015 14:02
dobro_backup.py - fast incremental backup
#!/usr/bin/env python
USAGE = '''
Incremental backup program.
Usage: python "prog_name" destination_dir path1/source1 ... pathN/sourceN
The copied result will be in destination_dir/DATE_TIME/source1 ... destination_dir/DATE_TIME/sourceN
The files pathN/sourceN/hash_dir_info.txt are also created (so write access to the sources is required).
When a backup is complete, destination_dir/DATE_TIME/backup.complete is created, with timestamp info.
Duplicate files are hardlinked to the previous backup directory, to save space and time.
Deletion of old backup directories is safe.
Encryption and compression can be provided by the file system (TrueCrypt + NTFS, for example).
Hardlinks are currently made only against the most recent backup (you may manually move/copy older files into the last backup).
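Example (illustrative; the paths below are placeholders):
    python dobro_backup.py D:/backups C:/music C:/work/documents
creates something like:
    D:/backups/backup_2014-06-09_21-30-00/music/...
    D:/backups/backup_2014-06-09_21-30-00/documents/...
    D:/backups/backup_2014-06-09_21-30-00/hash_dir_info.txt
    D:/backups/backup_2014-06-09_21-30-00/backup.complete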
'''
# Copyright (c) 2014 Ivan Yanikov
# The MIT License (MIT)
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#------------------------------------------------
# known missing features and issues:
#
# todo: allow creating hardlinks to older backups, to handle a changed source set
#   ( ["music", "documents", "video"] -> ["music", "documents"] -> ["music", "documents", "video"] ).
# todo: avoid needless rehashing of the internal file hash_dir_info.txt
# todo: catch "source inside destination" errors and vice versa
# todo: allow renaming source targets
# todo: allow storing hash_dir_info.txt outside the source directories
# todo: allow excluding dirs by mask
# todo: don't spam about known unreadable dirs ( Microsoft\Windows\WER\ReportQueue )
#------------------------------------------------
import hashlib
import os, sys, shutil, stat
import time
import itertools
import re
import traceback
# Recursively enumerates files in a directory.
# Returns a list of tuples (relative_path, file_modified_timestamp, file_size).
# Cross-platform version for any OS.
def _listfiles_py(path, relpath, result):
    if path:
        path += os.sep
    try:
        files = os.listdir(path)
    except EnvironmentError:
        traceback.print_exc()
        print >> sys.stderr, 'warning: skipping "' + path.encode('UTF-8') + '"'
        return result
    for fn in files:
        fullfn = path + fn
        relpath2 = (relpath + os.sep if relpath else '') + fn
        try:
            st = os.stat(fullfn)
        except EnvironmentError:
            traceback.print_exc()
            print >> sys.stderr, 'warning: skipping "' + fullfn.encode('UTF-8') + '"'
            continue
        file_type = stat.S_IFMT(st.st_mode)
        if file_type == stat.S_IFDIR:
            _listfiles_py(fullfn, relpath2, result)
        elif file_type == stat.S_IFREG:
            result.append((relpath2, int(st.st_mtime), st.st_size))
        # soft-links are skipped
    return result
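# For illustration, with hypothetical paths and values, a call like
#   _listfiles_py(u'/home/user/music', u'', [])
# might return:
#   [(u'a.mp3', 1401625200, 5242880), (u'album/b.mp3', 1401625300, 4194304)]
# (on Windows the separator inside relative paths is os.sep, i.e. '\\').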
win32file = None
if os.name == 'nt':
    import ctypes
    import ctypes.wintypes
    from ctypes import WinError
    from ctypes.wintypes import BOOL
    if not hasattr(os, 'link'):  # Python 2 on Windows does not provide os.link for hardlinks
        CreateHardLink = ctypes.windll.kernel32.CreateHardLinkW
        CreateHardLink.argtypes = [ctypes.c_wchar_p, ctypes.c_wchar_p, ctypes.c_void_p]
        CreateHardLink.restype = BOOL
        def mk_hard_link(src, dst):
            res = CreateHardLink(dst, src, None)
            if res == 0:
                raise WinError()
    else:
        mk_hard_link = os.link
    FindFirstFile = ctypes.windll.kernel32.FindFirstFileW
    FindNextFile = ctypes.windll.kernel32.FindNextFileW
    FindClose = ctypes.windll.kernel32.FindClose
    GetLastError = ctypes.windll.kernel32.GetLastError
    # Optimization for Windows: FindFirstFileW is much faster than (os.listdir + os.stat).
    # Enumerates the immediate (non-recursive) files and subdirectories of a directory.
    # Returns a list of tuples (file_name, file_modified_timestamp, file_size, is_directory).
    def FindFilesWin32(path):
        # pylint: disable=E0611
        from exceptions import WindowsError
        out = ctypes.wintypes.WIN32_FIND_DATAW()
        fldr = FindFirstFile(unicode(path) + u'\\*', ctypes.byref(out))
        if fldr == -1:  # INVALID_HANDLE_VALUE
            raise WindowsError("can't list files in %s" % repr(path))
        result = []
        try:
            while 1:
                fn = out.cFileName
                if fn != u'.' and fn != u'..':
                    isdir = bool(out.dwFileAttributes & 0x10)  # FILE_ATTRIBUTE_DIRECTORY
                    ts = out.ftLastWriteTime
                    win_filetime = (ts.dwHighDateTime << 32) | ts.dwLowDateTime
                    secs_between_epochs = 11644473600  # seconds between 1601-01-01 and 1970-01-01
                    timestamp = win_filetime / 10000000 - secs_between_epochs
                    size = (out.nFileSizeHigh << 32) | out.nFileSizeLow
                    result.append((out.cFileName, timestamp, size, isdir))
                if not FindNextFile(fldr, ctypes.byref(out)):
                    win_error = GetLastError()
                    if win_error == 18:  # ERROR_NO_MORE_FILES
                        break
                    exc = WindowsError(win_error)  # todo: test
                    exc.filename = path
                    raise exc
        finally:
            FindClose(fldr)
        return result
    # Recursively enumerates files in a directory.
    # Returns a list of tuples (relative_path, file_modified_timestamp, file_size).
    # Specialized version for Windows.
    def ListFilesWin32(path, relpath, result):
        if path:
            path += os.sep
        try:
            files = FindFilesWin32(path)
        except EnvironmentError:
            traceback.print_exc()
            print >> sys.stderr, 'warning: skipping "' + path.encode('UTF-8') + '"'
            return result
        for (fn, ts, sz, isdir) in files:
            fullfn = path + fn
            relpath2 = (relpath + os.sep if relpath else '') + fn
            if isdir:
                ListFilesWin32(fullfn, relpath2, result)
            else:
                result.append((relpath2, ts, sz))
            # soft-links are skipped
        return result
    _listfiles = ListFilesWin32
else:  # non-Windows versions of mk_hard_link and _listfiles
    mk_hard_link = os.link
    _listfiles = _listfiles_py
# Recursively enumerates files in a directory.
# Returns a list of tuples (relative_path, file_modified_timestamp, file_size).
def listfiles(path):
    return _listfiles(unicode(path), u'', [])
def md5_for_file(filename):
    # pylint: disable=E1101
    md5 = hashlib.md5()
    with open(filename, 'rb') as f:
        while True:
            data = f.read(2**20)
            if not data:
                break
            md5.update(data)
    return md5.hexdigest()
# Read a text file and load it into a dictionary {filename -> (md5, file_size, file_modified_timestamp)}.
def load_hashes(hf_txt):
    # different systems can have different file-modified-time formats
    if not os.path.isfile(hf_txt):
        return {}
    try:
        res = {}
        with open(hf_txt, 'rb') as f:
            for l in f:
                md5, size, mtime, filename = l.rstrip('\r\n').split('\t', 3)
                res[filename.replace('/', os.sep).decode('UTF-8')] = (md5, int(size), int(mtime))
        return res
    except EnvironmentError:
        traceback.print_exc()
        return {}
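# A sample hash_dir_info.txt line, as produced by 'write_hashes' below; the four
# fields are separated by tab characters. The path and mtime here are made up;
# the md5 shown is the real hash of an empty (zero-byte) file:
#   d41d8cd98f00b204e9800998ecf8427e  0  1401625200  music/empty.mp3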
# Pretty-format a file size: 81731298312 -> '76.1GB'.
def filesize_fmt(num):
    for x in ['bytes', 'KB', 'MB', 'GB']:
        if -1024.0 < num < 1024.0:
            return "%3.1f%s" % (num, x)
        num /= 1024.0
    return "%3.1f%s" % (num, 'TB')
# Recalculates hashes of files whose size or timestamp changed (or that are new);
# hashes of unchanged files are taken from prev_info. Also calls
# 'file_processed_callback' on each file (so the caller can use the file
# while it is still fresh in the OS file-system cache).
def get_hashes(dir, prev_info, file_processed_callback):
    print >> sys.stderr, 'info: traversing', repr(dir)
    current_files = listfiles(dir)
    result = []
    for relpath, mtime, size in current_files:
        prev_file_info = prev_info.get(relpath)
        md5 = None
        if prev_file_info:
            prev_md5, prev_size, prev_mtime = prev_file_info
            if prev_size == size and prev_mtime == mtime:
                md5 = prev_md5
        result.append((relpath, md5, size, mtime))
    recalc_count = 0
    recalc_size = 0
    total_size = 0
    for relpath, md5, size, mtime in result:
        total_size += size
        if md5 is None:
            recalc_count += 1
            recalc_size += size
    print >> sys.stderr, 'info: found %d files of size %s, need to rehash %d files of size %s' % (
        len(result), filesize_fmt(total_size),
        recalc_count, filesize_fmt(recalc_size))
    tmp = []
    # todo: make this incremental, to allow restarting from an interrupted point
    for (relpath, md5, size, mtime) in result:
        if md5 is None:
            md5 = md5_for_file(os.path.join(dir, relpath))
        tmp.append((relpath, md5, size, mtime))
        if file_processed_callback:
            # right after md5_for_file the file is in the OS file cache, so copy it immediately
            file_processed_callback(relpath, md5, size, mtime)
    result = tmp
    result.sort()
    return result
def write_hashes(hashes_data_list, hf_txt):
    with open(hf_txt + '.tmp', 'wb') as f:
        for relpath, md5, size, mtime in hashes_data_list:
            print >> f, '%s\t%s\t%s\t%s' % (md5, size, mtime, relpath.encode('UTF-8').replace(os.sep, '/'))
    if os.path.exists(hf_txt):
        os.unlink(hf_txt)
    os.rename(hf_txt + ".tmp", hf_txt)
# Calls 'get_hashes' on a directory and writes the fresh hashes to a file.
# See also the 'get_hashes' function.
def hash_dir(dir, file_processed_callback=None):
    hf_txt = os.path.join(dir, 'hash_dir_info.txt')
    prev_info = load_hashes(hf_txt)
    with open(hf_txt + '.tmp', 'wb') as _f:
        # check that we can open the file for writing
        pass
    start = time.time()
    result = get_hashes(dir, prev_info, file_processed_callback)
    print >> sys.stderr, 'info: hashing', repr(dir), int(time.time() - start), 'seconds'
    write_hashes(result, hf_txt)
    return result
# Puts md5 hashes of dst/**/* into the file dst/'hash_dir_info.txt'.
# For files present in 'dst_rel_path2md5' the md5 is taken from that mapping;
# otherwise the hash is recalculated.
def write_dst_hashes(dst, dst_rel_path2md5):
    result = []
    for relpath, mtime, size in listfiles(dst):
        md5 = dst_rel_path2md5.get(relpath)
        if md5 is None:
            md5 = md5_for_file(os.path.join(dst, relpath))
        result.append((relpath, md5, size, mtime))
    result.sort()
    write_hashes(result, os.path.join(dst, 'hash_dir_info.txt'))
# Recursively deletes a file tree;
# handles some real-life problems with unremovable files.
def my_rmtree(dir):
    def remove_readonly(fn, path, _excinfo):
        os.chmod(path, stat.S_IWRITE)  # handle read-only files
        fn(path)
    shutil.rmtree(unicode(dir), onerror=remove_readonly)
# Some checks for invalid arguments and access rights.
def backup_fast_checks(dst_prev, dst_tmp, srcs):
    assert dst_prev is None or os.path.isdir(dst_prev), dst_prev
    for src in srcs:
        if not os.path.isdir(src):
            print '"%s" is not a directory!' % src
            exit(1)
    open(os.path.join(dst_tmp, "test_dir_is_writable"), "wb").close()
    os.unlink(os.path.join(dst_tmp, "test_dir_is_writable"))
    src_basenames = sorted((get_src_basename(src), src) for src in srcs)
    for src_base, srcs_grouped in itertools.groupby(src_basenames, key=lambda (base, s): base):
        srcs_grouped = list(x[1] for x in srcs_grouped)
        if len(srcs_grouped) >= 2:
            raise Exception("name '%s' is duplicated for %s" % (src_base, srcs_grouped))
# Get the basename of a path. Tolerates trailing/// slashes and a Windows drive letter.
def get_src_basename(x):
    assert os.path.normpath('xxx///') == 'xxx'
    if os.name == 'nt':
        assert os.path.normpath('x://\\//\\') == 'x:\\'
        assert os.path.normpath('xxx//\\//\\') == 'xxx'
    x = os.path.normpath(x).rstrip(os.sep)
    if os.name == 'nt' and x[-1:] == ':':
        x = x[0:-1]  # remove ':' from "drive:"
    return os.path.basename(x)
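# For example: get_src_basename('a/b/c///') == 'c',
# and on Windows get_src_basename('d:') == 'd'.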
# Convert a bare drive path to a directory path on Windows.
def win_drive_to_dir(x):
    if os.name == 'nt' and x[-1:] == ':':
        # http://stackoverflow.com/questions/7258993/pythons-os-listdir-behaviour-on-windows
        # the path 'X:' is interpreted as 'the current directory on drive X', which is not 'X:\\'
        x = x + '\\'
    return x
# Load the mapping from md5 to filename in the previous backup (to hardlink against it later).
def load_prev_dst_hashes(dst_prev):
    if dst_prev is None:
        return {}
    print >> sys.stderr, 'info: loading prev_dst="%s" hashes...' % dst_prev
    dst_prev_hash_fn = os.path.join(dst_prev, "hash_dir_info.txt")
    if not os.path.isfile(dst_prev_hash_fn):
        print >> sys.stderr, "warning: there is no '%s'" % dst_prev_hash_fn
    dst_prev_hashes = get_hashes(dst_prev, load_hashes(dst_prev_hash_fn), None)
    md5to_old = {}
    # avoid randomizing which file gets hardlinked among files with the same content: sort by name, take the last
    for (relpath, md5, _size, _mtime) in sorted(dst_prev_hashes):
        md5to_old[md5] = os.path.join(dst_prev, relpath)
    return md5to_old
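# For illustration (hypothetical values), the returned mapping might look like:
#   {'9e107d9d372bb6826bd81d3542a419d6': u'D:/backups/backup_2014-06-08_21-30-00/music/a.mp3'}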
# Create hardlinks to dst_prev, or copy srcs files into dst_tmp.
# If dst_tmp is 'x/y/z' and a source dir is 'a/b/c',
# then 'a/b/c' is copied to 'x/y/z/c'.
def backup_impl(dst_prev, dst_tmp, srcs):
    srcs = [win_drive_to_dir(src) for src in srcs]
    if not os.path.isdir(dst_tmp):
        os.makedirs(dst_tmp)
    backup_fast_checks(dst_prev, dst_tmp, srcs)
    md5to_old = load_prev_dst_hashes(dst_prev)
    dst_rel_path2md5 = {}
    for src in srcs:
        print >> sys.stderr, 'info: copying src="%s"...' % src
        src_base = get_src_basename(src)
        current_dst = os.path.join(dst_tmp, src_base)
        if os.path.exists(current_dst):
            print >> sys.stderr, 'warning: deleting files from backup', repr(current_dst)
            my_rmtree(current_dst)  # todo: don't delete everything; track files already copied to dst_tmp by a previous run (in a database)
        if not os.path.isdir(current_dst):
            os.makedirs(current_dst)
        created_dirs = set()  # don't re-check known directories
        def file_processed_callback(relpath, md5, size, _mtime):
            full_dst_fn = os.path.join(current_dst, relpath)
            full_dst_dir = os.path.dirname(full_dst_fn)
            if full_dst_dir not in created_dirs:
                if not os.path.isdir(full_dst_dir):
                    os.makedirs(full_dst_dir)
                created_dirs.add(full_dst_dir)
            try:
                if size == 0:
                    # the NTFS hardlink limit is 1023 links per file, and empty files are the most frequent duplicates
                    open(full_dst_fn, 'wb').close()
                elif md5 in md5to_old:
                    #print >> sys.stderr, 'verbose: take cached', repr(relpath) # a lot of spam
                    mk_hard_link(md5to_old[md5], full_dst_fn)
                else:
                    if dst_prev is not None:  # avoid log spam: on the first run everything is a full copy
                        print >> sys.stderr, 'verbose: full copy', repr(relpath)
                    shutil.copy(os.path.join(src, relpath), full_dst_fn)
                dst_rel_path = os.path.join(src_base, relpath)
                dst_rel_path2md5[dst_rel_path] = md5
            except EnvironmentError:
                traceback.print_exc()
                print >> sys.stderr, 'warning: skipping', repr(src) + repr(relpath)
        hash_dir(src, file_processed_callback)
    write_dst_hashes(dst_tmp, dst_rel_path2md5)
    # todo: start validation here, checking that the dst hard-drive can be read back (protection against bad blocks)
# Calls the 'backup_impl' function. See also 'backup_impl'.
# Measures time, creates the backup.complete file, moves the 'dst.tmp' directory to 'dst'.
def backup(dst_prev, dst, srcs):
    st = time.time()
    print >> sys.stderr, 'info: starting backup from "%s" to "%s" for sources=%s' % (dst_prev, dst, srcs)
    if os.path.exists(dst):
        print >> sys.stderr, 'ERROR! Directory "%s" already exists. Backup aborted, to avoid corrupting an old backup.' % dst
        exit(1)
    backup_impl(dst_prev, dst + ".tmp", srcs)
    os.rename(dst + ".tmp", dst)
    backup_complete_file = os.path.join(dst, 'backup.complete')
    with open(backup_complete_file, 'wb') as f:
        print >> f, 'last backup time: "%s"' % time.time()
    print >> sys.stderr, 'info: done backup from "%s" to "%s" for sources=%s' % (dst_prev, dst, srcs)
    print >> sys.stderr, 'info: total time: %s seconds' % (int(time.time() - st))
# Select the directory with the most recent root/*/backup.complete file.
def select_last_previous_backup(root):
    root = win_drive_to_dir(root)
    alternatives = []
    if not os.path.isdir(root):
        print '"%s" is not a directory!' % root
        exit(1)
    for dir in os.listdir(root):
        dst_prev = os.path.join(root, dir)
        if os.path.isdir(dst_prev):
            backup_complete_file = os.path.join(dst_prev, 'backup.complete')
            if os.path.isfile(backup_complete_file):
                with open(backup_complete_file) as f:
                    try:
                        m = re.match('last backup time: "(.*)"$', f.read().strip())
                        backup_time = float(m.group(1))
                        alternatives.append((backup_time, dst_prev))
                    except Exception:
                        traceback.print_exc()
                        print >> sys.stderr, 'warning: corrupted "%s"' % backup_complete_file
    dst_prev = None
    if alternatives:
        _backup_time, dst_prev = max(alternatives)
        print >> sys.stderr, 'info: directory "%s" is used as the previous backup' % dst_prev
    else:
        print >> sys.stderr, 'info: no previous backup (with a backup.complete file) found at "%s/*/backup.complete", a full copy will be performed' % root
    return dst_prev
def main():
    if len(sys.argv) < 3:
        prog_name = sys.argv[0] if sys.argv else ''
        print >> sys.stderr, USAGE.replace("prog_name", prog_name)
        exit(1)
    root = sys.argv[1]
    srcs = sys.argv[2:]
    dst_basename = time.strftime("backup_%Y-%m-%d_%H-%M-%S")
    backup(select_last_previous_backup(root), os.path.join(win_drive_to_dir(root), dst_basename), srcs)
# Compare the results of ListFilesWin32 and _listfiles_py, write the output to files, and print the timing.
def my_listdir_functions_test():
    for lf, out in [(ListFilesWin32, 'b.txt'), (_listfiles_py, 'a.txt')]:
        t = time.time()
        with open(out, 'wb') as f:
            path = u"c:"
            for relpath, mtime, size in lf(unicode(path), u'', []):
                relpath = relpath.replace('\\', '/').encode('UTF-8')
                if 'test/string/trunk/bin/backup' not in relpath:  # my development source directory
                    print >> f, '%s\t%s\t%s' % (relpath, mtime, size)
        print >> sys.stderr, 'TIME:', lf, time.time() - t
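# my_listdir_functions_test() is not called by default; to benchmark the two
# listers on Windows, one could call it below instead of main().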
if __name__ == "__main__":
    main()