#!/usr/bin/env python
USAGE = '''
Incremental backup program
Usage: python "prog_name" destination_dir path1/source1 ... pathN/sourceN
The copied result will be in destination_dir/DATE_TIME/source1 ... destination_dir/DATE_TIME/sourceN
Files pathN/sourceN/hash_dir_info.txt are also created (write access to the sources is currently needed)
When a backup is complete, destination_dir/DATE_TIME/backup.complete is created, with timestamp info.
Hardlinks to the previous backup directory are created for duplicated files, to save space/time.
Deletion of old backup directories is safe.
Encryption and compression can be provided by the file system (TrueCrypt + NTFS, for example).
Hardlinks to older-than-last backups are not created yet (you may manually move/copy older files into the last backup)
'''
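# A hypothetical example run (all paths and the timestamp are illustrative):
#   python dobro_backup.py E:\backups C:\Users\me\Documents D:\music
# would create E:\backups\backup_2014-05-01_12-00-00\Documents and ...\music,
# hardlinking files unchanged since the last completed backup found in E:\backups.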
# Copyright (c) 2014 Ivan Yanikov
# The MIT License (MIT)
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#------------------------------------------------
# known missing features and issues:
#
# todo: allow creating hardlinks to old backups, to handle a changed source set
#       ( ["music", "documents", "video"] -> ["music", "documents"] -> ["music", "documents", "video"] ).
# todo: avoid needless rehashing of the internal file hash_dir_info.txt
# todo: catch "source inside destination" errors and vice versa
# todo: allow renaming source targets
# todo: allow storing hash_dir_info.txt outside the source directories
# todo: allow excluding dirs/masks
# todo: don't spam about known unreadable dirs ( Microsoft\Windows\WER\ReportQueue )
#------------------------------------------------
import hashlib
import os, sys, shutil, stat
import time
import itertools
import re
import traceback
# recursively enumerates files in a directory
# returns a list of tuples ( relative_path, file_modified_timestamp, file_size )
# cross-platform version for any OS
def _listfiles_py(path, relpath, result):
    if path:
        path += os.sep
    try:
        files = os.listdir(path)
    except EnvironmentError:
        traceback.print_exc()
        print >> sys.stderr, 'warning: skipping "' + path.encode('UTF-8') + '"'
        return result
    for fn in files:
        fullfn = path + fn
        relpath2 = (relpath + os.sep if relpath else '') + fn
        try:
            st = os.stat(fullfn)
        except EnvironmentError:
            traceback.print_exc()
            print >> sys.stderr, 'warning: skipping "' + fullfn.encode('UTF-8') + '"'
            continue
        file_type = stat.S_IFMT(st.st_mode)
        if file_type == stat.S_IFDIR:
            _listfiles_py(fullfn, relpath2, result)
        elif file_type == stat.S_IFREG:
            result.append((relpath2, int(st.st_mtime), st.st_size))
        # soft-links are skipped
    return result
win32file = None
if os.name == 'nt':
    import ctypes
    from ctypes import WinError
    from ctypes.wintypes import BOOL
    if not hasattr(os, 'link'):  # Python 2 on Windows does not have os.link for hardlinks
        CreateHardLink = ctypes.windll.kernel32.CreateHardLinkW
        CreateHardLink.argtypes = [ctypes.c_wchar_p, ctypes.c_wchar_p, ctypes.c_void_p]
        CreateHardLink.restype = BOOL
        def mk_hard_link(src, dst):
            res = CreateHardLink(dst, src, None)
            if res == 0:
                raise WinError()
    else:
        mk_hard_link = os.link
    FindFirstFile = ctypes.windll.kernel32.FindFirstFileW
    FindNextFile = ctypes.windll.kernel32.FindNextFileW
    FindClose = ctypes.windll.kernel32.FindClose
    GetLastError = ctypes.windll.kernel32.GetLastError
    # optimization for Windows: FindFirstFileW is much faster than (os.listdir + os.stat)
    # enumerates the immediate (non-recursive) files and subdirectories of a directory
    # returns a list of tuples ( relative_path, file_modified_timestamp, file_size, is_directory )
    def FindFilesWin32(path):
        # pylint: disable=E0611
        from exceptions import WindowsError
        out = ctypes.wintypes.WIN32_FIND_DATAW()
        fldr = FindFirstFile(unicode(path) + u'\\*', ctypes.byref(out))
        if fldr == -1:  # INVALID_HANDLE_VALUE
            raise WindowsError("can't list files in %s" % repr(path))
        result = []
        try:
            while 1:
                fn = out.cFileName
                if fn != u'.' and fn != u'..':
                    isdir = bool(out.dwFileAttributes & 0x10)  # FILE_ATTRIBUTE_DIRECTORY
                    ts = out.ftLastWriteTime
                    win_filetime = (ts.dwHighDateTime << 32) | ts.dwLowDateTime
                    secs_between_epochs = 11644473600  # seconds between 1601-01-01 and 1970-01-01
                    timestamp = win_filetime / 10000000 - secs_between_epochs
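                    # a hypothetical worked example of the FILETIME conversion:
                    # 130000000000000000 / 10**7 - 11644473600 = 1355526400 (mid-December 2012, UTC)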
                    size = (out.nFileSizeHigh << 32) | out.nFileSizeLow
                    result.append((out.cFileName, timestamp, size, isdir))
                if not FindNextFile(fldr, ctypes.byref(out)):
                    win_error = GetLastError()
                    if win_error == 18:  # ERROR_NO_MORE_FILES
                        break
                    exc = WindowsError(win_error)  # todo: test
                    exc.filename = path
                    raise exc
        finally:
            FindClose(fldr)
        return result
    # recursively enumerates files in a directory
    # returns a list of tuples ( relative_path, file_modified_timestamp, file_size )
    # specialized version for Windows.
    def ListFilesWin32(path, relpath, result):
        if path:
            path += os.sep
        try:
            files = FindFilesWin32(path)
        except EnvironmentError:
            traceback.print_exc()
            print >> sys.stderr, 'warning: skipping "' + path.encode('UTF-8') + '"'
            return result
        for (fn, ts, sz, isdir) in files:
            fullfn = path + fn
            relpath2 = (relpath + os.sep if relpath else '') + fn
            if isdir:
                ListFilesWin32(fullfn, relpath2, result)
            else:
                result.append((relpath2, ts, sz))
            # soft-links are skipped
        return result
    _listfiles = ListFilesWin32
else:  # non-Windows version of mk_hard_link and _listfiles
    mk_hard_link = os.link
    _listfiles = _listfiles_py
# recursively enumerates files in a directory
# returns a list of tuples ( relative_path, file_modified_timestamp, file_size )
def listfiles(path):
    return _listfiles(unicode(path), u'', [])
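# e.g. listfiles(u'photos') might return [(u'2014' + os.sep + u'img_001.jpg', 1400000000, 12345), ...]
# (the names and values here are purely illustrative)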
def md5_for_file(filename):
    # pylint: disable=E1101
    md5 = hashlib.md5()
    with open(filename, 'rb') as f:
        while True:
            data = f.read(2**20)
            if not data:
                break
            md5.update(data)
    return md5.hexdigest()
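# e.g. for an empty file md5_for_file returns 'd41d8cd98f00b204e9800998ecf8427e' (the MD5 of zero bytes)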
# read a text file and load it into a dictionary {filename -> (md5, file_size, file_modified_timestamp)}
def load_hashes(hf_txt):
    # different systems can have different file-modified-time formats
    if not os.path.isfile(hf_txt):
        return {}
    try:
        res = {}
        with open(hf_txt, 'rb') as f:
            for l in f:
                md5, size, mtime, filename = l.rstrip('\r\n').split('\t', 3)
                res[filename.replace('/', os.sep).decode('UTF-8')] = (md5, int(size), int(mtime))
        return res
    except EnvironmentError:
        traceback.print_exc()
        return {}
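# hash_dir_info.txt format, one file per line: md5 \t size \t mtime \t UTF-8 path with '/' separators
# e.g. (illustrative values): d41d8cd98f00b204e9800998ecf8427e 0 1400000000 photos/2014/empty.txt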
# pretty format of a file size, 81731298312 -> '76.1GB'
def filesize_fmt(num):
    for x in ['bytes', 'KB', 'MB', 'GB']:
        if num < 1024.0 and num > -1024.0:
            return "%3.1f%s" % (num, x)
        num /= 1024.0
    return "%3.1f%s" % (num, 'TB')
# recalculates the hashes of files whose size or timestamp has changed;
# also calls 'file_processed_callback' on each file (so the caller can
# read data that is still fresh in the OS file-system cache).
def get_hashes(dir, prev_info, file_processed_callback):
    print >> sys.stderr, 'info: traversing', repr(dir)
    current_files = listfiles(dir)
    result = []
    for relpath, mtime, size in current_files:
        prev_file_info = prev_info.get(relpath)
        md5 = None
        if prev_file_info:
            prev_md5, prev_size, prev_mtime = prev_file_info
            if prev_size == size and prev_mtime == mtime:
                md5 = prev_md5
        result.append((relpath, md5, size, mtime))
    recalc_count = 0
    recalc_size = 0
    total_size = 0
    for relpath, md5, size, mtime in result:
        total_size += size
        if md5 is None:
            recalc_count += 1
            recalc_size += size
    print >> sys.stderr, 'info: found %d files of size %s, need to rehash %d files of size %s' % (
        len(result), filesize_fmt(total_size),
        recalc_count, filesize_fmt(recalc_size))
    tmp = []
    # todo: make this incremental, to allow restarting from an interrupted point
    for (relpath, md5, size, mtime) in result:
        if md5 is None:
            md5 = md5_for_file(os.path.join(dir, relpath))
        tmp.append((relpath, md5, size, mtime))
        if file_processed_callback:
            # right after md5_for_file the file is in the OS file cache, so process it immediately
            file_processed_callback(relpath, md5, size, mtime)
    result = tmp
    result.sort()
    return result
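# writes the hash list to 'hf_txt' through a temporary file, so that an
# interrupted run never leaves a truncated hash_dir_info.txt behind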
def write_hashes(hashes_data_list, hf_txt):
    with open(hf_txt + '.tmp', 'wb') as f:
        for relpath, md5, size, mtime in hashes_data_list:
            print >> f, '%s\t%s\t%s\t%s' % (md5, size, mtime, relpath.encode('UTF-8').replace(os.sep, '/'))
    if os.path.exists(hf_txt):
        os.unlink(hf_txt)
    os.rename(hf_txt + ".tmp", hf_txt)
# calls 'get_hashes' on a directory and stores the fresh hashes in its hash_dir_info.txt file
# see also the 'get_hashes' function
def hash_dir(dir, file_processed_callback = None):
    hf_txt = os.path.join(dir, 'hash_dir_info.txt')
    prev_info = load_hashes(hf_txt)
    with open(hf_txt + '.tmp', 'wb') as _f:
        # check that we can open the file for writing
        pass
    start = time.time()
    result = get_hashes(dir, prev_info, file_processed_callback)
    print >> sys.stderr, 'info: hashing', repr(dir), int(time.time() - start), 'seconds'
    write_hashes(result, hf_txt)
    return result
# Puts md5 hashes of dst/**/* into the file dst/'hash_dir_info.txt'.
# For files present in 'dst_rel_path2md5' the md5 is taken from there;
# otherwise the hash is recalculated.
def write_dst_hashes(dst, dst_rel_path2md5):
    result = []
    for relpath, mtime, size in listfiles(dst):
        md5 = dst_rel_path2md5.get(relpath)
        if md5 is None:
            md5 = md5_for_file(os.path.join(dst, relpath))
        result.append((relpath, md5, size, mtime))
    result.sort()
    write_hashes(result, os.path.join(dst, 'hash_dir_info.txt'))
# Recursively deletes a file tree;
# handles some real-life problems with unremovable files
def my_rmtree(dir):
    def remove_readonly(fn, path, _excinfo):
        os.chmod(path, stat.S_IWRITE)  # handle read-only files
        fn(path)
    shutil.rmtree(unicode(dir), onerror=remove_readonly)
# basic checks for invalid arguments and access rights
def backup_fast_checks(dst_prev, dst_tmp, srcs):
    assert dst_prev is None or os.path.isdir(dst_prev), dst_prev
    for src in srcs:
        if not os.path.isdir(src):
            print '"%s" is not a directory!' % src
            exit(1)
    open(os.path.join(dst_tmp, "test_dir_is_writable"), "wb").close()
    os.unlink(os.path.join(dst_tmp, "test_dir_is_writable"))
    src_basenames = sorted([(get_src_basename(src), src) for src in srcs])
    for src_base, srcs_grouped in itertools.groupby(src_basenames, key=lambda (base, s): base):
        srcs_grouped = list(x[1] for x in srcs_grouped)
        if len(srcs_grouped) >= 2:
            raise Exception("name '%s' is duplicated for %s" % (src_base, srcs_grouped))
# get the basename of a path. Tolerates trailing/// slashes and a Windows drive letter.
def get_src_basename(x):
    assert os.path.normpath('xxx///') == 'xxx'
    if os.name == 'nt':
        assert os.path.normpath('x://\\//\\') == 'x:\\'
        assert os.path.normpath('xxx//\\//\\') == 'xxx'
    x = os.path.normpath(x).rstrip(os.sep)
    if os.name == 'nt' and x[-1:] == ':':
        x = x[0:-1]  # remove ':' from "drive:"
    return os.path.basename(x)
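# e.g. (illustrative) get_src_basename('/home/me/music///') -> 'music',
# and on Windows get_src_basename('d:') -> 'd'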
# convert a drive path to a dir path on Windows
def win_drive_to_dir(x):
    if os.name == 'nt' and x[-1:] == ':':
        # http://stackoverflow.com/questions/7258993/pythons-os-listdir-behaviour-on-windows
        # the path 'X:' is interpreted as 'the current directory on drive X', which is not 'X:\\'
        x = x + '\\'
    return x
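# e.g. win_drive_to_dir('e:') -> 'e:\\' on Windows; any other path is returned unchanged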
# load a mapping from md5 to filename in the previous backup (to hardlink such files later)
def load_prev_dst_hashes(dst_prev):
    if dst_prev is None:
        return {}
    print >> sys.stderr, 'loading prev_dst="%s" hashes...' % dst_prev
    dst_prev_hash_fn = os.path.join(dst_prev, "hash_dir_info.txt")
    if not os.path.isfile(dst_prev_hash_fn):
        print >> sys.stderr, "warning: there is no '%s'" % dst_prev_hash_fn
    dst_prev_hashes = get_hashes(dst_prev, load_hashes(dst_prev_hash_fn), None)
    md5to_old = {}
    # avoid randomization of hardlinks to files with the same content - sort by name, take the last
    for (relpath, md5, _size, _mtime) in sorted(dst_prev_hashes):
        md5to_old[md5] = os.path.join(dst_prev, relpath)
    return md5to_old
# create hardlinks into dst_prev, or copy srcs files, to dst_tmp.
# if dst_tmp is 'x/y/z' and a source dir is 'a/b/c',
# then 'a/b/c' is copied to 'x/y/z/c'
def backup_impl(dst_prev, dst_tmp, srcs):
    srcs = [win_drive_to_dir(src) for src in srcs]
    if not os.path.isdir(dst_tmp):
        os.makedirs(dst_tmp)
    backup_fast_checks(dst_prev, dst_tmp, srcs)
    md5to_old = load_prev_dst_hashes(dst_prev)
    dst_rel_path2md5 = {}
    for src in srcs:
        print >> sys.stderr, 'info: copying src="%s"...' % src
        src_base = get_src_basename(src)
        current_dst = os.path.join(dst_tmp, src_base)
        if os.path.exists(current_dst):
            print >> sys.stderr, 'warning: deleting files from backup', repr(current_dst)
            my_rmtree(current_dst)  # todo: don't delete everything; track what a previous run already copied to dst_tmp
        if not os.path.isdir(current_dst):
            os.makedirs(current_dst)
        created_dirs = set()  # don't re-check known directories
        def file_processed_callback(relpath, md5, size, _mtime):
            full_dst_fn = os.path.join(current_dst, relpath)
            full_dst_dir = os.path.dirname(full_dst_fn)
            if full_dst_dir not in created_dirs:
                if not os.path.isdir(full_dst_dir):
                    os.makedirs(full_dst_dir)
                created_dirs.add(full_dst_dir)
            try:
                if size == 0:
                    # the NTFS hard-link limit is 1023 links, and the empty file is the most frequent one
                    open(full_dst_fn, 'wb').close()
                elif md5 in md5to_old:
                    #print >> sys.stderr, 'verbose: take cached', repr(relpath)  # a lot of spam
                    mk_hard_link(md5to_old[md5], full_dst_fn)
                else:
                    if dst_prev is not None:  # there are a lot of copies on the first run
                        print >> sys.stderr, 'verbose: full copy', repr(relpath)
                    shutil.copy(os.path.join(src, relpath), full_dst_fn)
                dst_rel_path = os.path.join(src_base, relpath)
                dst_rel_path2md5[dst_rel_path] = md5
            except EnvironmentError:
                traceback.print_exc()
                print >> sys.stderr, 'warning: skipping', repr(src) + repr(relpath)
        hash_dir(src, file_processed_callback)
    write_dst_hashes(dst_tmp, dst_rel_path2md5)
    # todo: start validation here, that the dst hard-drive can be read back (protection against bad blocks).
# calls the 'backup_impl' function (see also 'backup_impl').
# Measures the time, creates the backup.complete file, and moves the 'dst.tmp' directory to 'dst'
def backup(dst_prev, dst, srcs):
    st = time.time()
    print >> sys.stderr, 'info: starting backup from "%s" to "%s" for sources=%s' % (dst_prev, dst, srcs)
    if os.path.exists(dst):
        print >> sys.stderr, 'ERROR! Directory "%s" already exists. Backup aborted, to avoid corrupting the old backup.' % dst
        exit(1)
    backup_impl(dst_prev, dst + ".tmp", srcs)
    os.rename(dst + ".tmp", dst)
    backup_complete_file = os.path.join(dst, 'backup.complete')
    with open(backup_complete_file, 'wb') as f:
        print >> f, 'last backup time: "%s"' % time.time()
    print >> sys.stderr, 'info: done backup from "%s" to "%s" for sources=%s' % (dst_prev, dst, srcs)
    print >> sys.stderr, 'info: total time: %s seconds' % (int(time.time() - st))
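# note: backup.complete holds a single line like: last backup time: "1400000000.0"
# (illustrative timestamp); select_last_previous_backup below parses this line back.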
# select the directory with the most recent root/*/backup.complete file.
def select_last_previous_backup(root):
    root = win_drive_to_dir(root)
    alternatives = []
    if not os.path.isdir(root):
        print '"%s" is not a directory!' % root
        exit(1)
    for dir in os.listdir(root):
        dst_prev = os.path.join(root, dir)
        if os.path.isdir(dst_prev):
            backup_complete_file = os.path.join(dst_prev, 'backup.complete')
            if os.path.isfile(backup_complete_file):
                with open(backup_complete_file) as f:
                    try:
                        m = re.match('last backup time: "(.*)"$', f.read().strip())
                        backup_time = float(m.group(1))
                        alternatives.append((backup_time, dst_prev))
                    except Exception:
                        traceback.print_exc()
                        print >> sys.stderr, 'warning: corrupted "%s"' % backup_complete_file
    dst_prev = None
    if alternatives:
        _backup_time, dst_prev = max(alternatives)
        print >> sys.stderr, 'info: directory "%s" is used as the previous backup' % dst_prev
    else:
        print >> sys.stderr, 'info: no previous backup (with a backup.complete file) found at "%s/*/backup.complete", a full copy will be performed' % root
    return dst_prev
def main():
    if len(sys.argv) < 3:
        prog_name = sys.argv[0] if sys.argv else ''
        print >> sys.stderr, USAGE.replace("prog_name", prog_name)
        exit(1)
    root = sys.argv[1]
    srcs = sys.argv[2:]
    dst_basename = time.strftime("backup_%Y-%m-%d_%H-%M-%S")
    backup(select_last_previous_backup(root), os.path.join(win_drive_to_dir(root), dst_basename), srcs)
# compare the results of ListFilesWin32 and _listfiles_py, write the output to files, and print the timings.
def my_listdir_functions_test():
    for lf, out in [(ListFilesWin32, 'b.txt'), (_listfiles_py, 'a.txt')]:
        t = time.time()
        with open(out, 'wb') as f:
            path = u"c:"
            for relpath, mtime, size in lf(unicode(path), u'', []):
                relpath = relpath.replace('\\', '/').encode('UTF-8')
                if 'test/string/trunk/bin/backup' not in relpath:  # my development source directory
                    print >> f, '%s\t%s\t%s' % (relpath, mtime, size)
        print >> sys.stderr, 'TIME:', lf, time.time() - t
if __name__ == "__main__":
    main()