Last active
August 29, 2015 14:02
-
-
Save dobrokot/e552315e9acfac2359e7 to your computer and use it in GitHub Desktop.
dobro_backup.py - fast incremental backup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Usage text shown when the program is invoked with too few arguments.
# Fix: removed the stray trailing apostrophes that ended three of the lines.
USAGE = '''
Incremental backup program
Usage: python "prog_name" destination_dir path1/source1 ... pathN/sourceN
Copied result will be in the destination_dir/DATE_TIME/source1 ... destination_dir/DATE_TIME/sourceN
Also there will be created files pathN/sourceN/hash_dir_info.txt (write access to source is needed now)
When backup is complete, destination_dir/DATE_TIME/backup.complete is created, with timestamp info.
Hardlinks of duplicated files created to previous directory, to save space/time.
Deletion of old backup directories is safe.
Encryption and compression can be provided by the file-system (TrueCrypt + NTFS, for example).
Hardlinks to older-than-last backups are not created now (you may manually move/copy older files to last backup)
'''
# Copyright (c) 2014 Ivan Yanikov | |
# The MIT License (MIT) | |
# Permission is hereby granted, free of charge, to any person obtaining a copy | |
# of this software and associated documentation files (the "Software"), to deal | |
# in the Software without restriction, including without limitation the rights | |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
# copies of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: | |
# The above copyright notice and this permission notice shall be included in | |
# all copies or substantial portions of the Software. | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
# THE SOFTWARE. | |
#------------------------------------------------ | |
# known missing features and issues: | |
# | |
# todo: allow to create hardlinks to old backups, to handle changed source set | |
# ( ["music", "documents", "video"] -> ["music", "documents"] -> ["music", "documents", "video"] ). | |
# todo: needless rehashes of internal file hash_dir_info.txt | |
# todo: catch "source inside destination" errors and vice-versa | |
# todo: allow to rename sources targets | |
# todo: allow to store hash_dir_info.txt outside source directories | |
# todo: allow exclude dirs/mask | |
# todo: don't spam about known unreadable dirs ( Microsoft\Windows\WER\ReportQueue ) | |
#------------------------------------------------ | |
import hashlib | |
import os, sys, shutil, stat | |
import time | |
import itertools | |
import re | |
import traceback | |
#recursively enumerates files in directory | |
#returns list of tuples ( relative_path, file_modified_timestamp, file_size ) | |
#cross-platform version for any OS | |
def _listfiles_py(path, relpath, result): | |
if path: | |
path += os.sep | |
try: | |
files = os.listdir(path) | |
except EnvironmentError: | |
traceback.print_exc() | |
print >> sys.stderr, 'warning: skipping "' + path.encode('UTF-8') + '"' | |
return result | |
for fn in files: | |
fullfn = path + fn | |
relpath2 = (relpath + os.sep if relpath else '') + fn | |
try: | |
st = os.stat(fullfn) | |
except EnvironmentError: | |
traceback.print_exc() | |
print >> sys.stderr, 'warning: skipping "' + path.encode('UTF-8') + '"' | |
continue | |
file_type = stat.S_IFMT(st.st_mode) | |
if file_type == stat.S_IFDIR: | |
_listfiles_py(fullfn, relpath2, result) | |
elif file_type == stat.S_IFREG: | |
result.append((relpath2, int(st.st_mtime), st.st_size)) | |
#soft-links are skipped | |
return result | |
win32file = None
# Platform-specific implementations of mk_hard_link and _listfiles.
# On Windows they are built with ctypes on top of kernel32; elsewhere the
# POSIX os.link and the portable Python walker are used.
if (os.name == 'nt'):
    if not hasattr(os, 'link'): # python on windows do not have os.link for hardlinks
        import ctypes
        from ctypes import WinError
        from ctypes.wintypes import BOOL
        # CreateHardLinkW(lpFileName=new link, lpExistingFileName=source, lpSecurityAttributes)
        CreateHardLink = ctypes.windll.kernel32.CreateHardLinkW
        CreateHardLink.argtypes = [ctypes.c_wchar_p, ctypes.c_wchar_p, ctypes.c_void_p]
        CreateHardLink.restype = BOOL
        def mk_hard_link(src, dst):
            # Win32 argument order is (link_to_create, existing_file) - reversed vs os.link
            res = CreateHardLink(dst, src, None)
            if res == 0:
                raise WinError()
    # NOTE(review): the names below rely on 'ctypes' imported in the guarded
    # branch above; on Python 2 for Windows os.link never exists, so the
    # branch always runs - confirm if this file is ever run on Python 3.
    FindFirstFile = ctypes.windll.kernel32.FindFirstFileW
    FindNextFile = ctypes.windll.kernel32.FindNextFileW
    FindClose = ctypes.windll.kernel32.FindClose
    GetLastError = ctypes.windll.kernel32.GetLastError
    #optimization for Windows: FindFirstFileW is much faster than (os.listdir + os.stat)
    #enumerates non-recursive immediate files and subdirectories in directory
    #returns list of tuples ( relative_path, file_modified_timestamp, file_size, is_directory )
    def FindFilesWin32(path):
        # pylint: disable=E0611
        from exceptions import WindowsError
        out = ctypes.wintypes.WIN32_FIND_DATAW()
        fldr = FindFirstFile(unicode(path) + u'\\*', ctypes.byref(out))
        if fldr == -1: #INVALID_HANDLE_VALUE
            raise WindowsError("can't list files in the %s" % repr(path))
        result = []
        try:
            while 1:
                fn = out.cFileName
                if fn != u'.' and fn != u'..':
                    isdir = bool(out.dwFileAttributes & 0x10) #FILE_ATTRIBUTE_DIRECTORY
                    ts = out.ftLastWriteTime
                    # FILETIME = 100ns ticks since 1601-01-01; convert to unix seconds
                    win_filetime = (ts.dwHighDateTime << 32) | ts.dwLowDateTime
                    secs_between_epochs = 11644473600 # Seconds between 1.1.1601 and 1.1.1970
                    timestamp = win_filetime / 10000000 - secs_between_epochs
                    size = (out.nFileSizeHigh << 32) | out.nFileSizeLow
                    result.append((out.cFileName, timestamp, size, isdir))
                if not FindNextFile(fldr, ctypes.byref(out)):
                    win_error = GetLastError()
                    if win_error == 18: #ERROR_NO_MORE_FILES:
                        break
                    exc = WindowsError(win_error) #todo: test
                    exc.filename = path
                    raise exc
        finally:
            # always release the find handle, even on error
            FindClose(fldr)
        return result
    #recursively enumerates files in directory
    #returns list of tuples ( relative_path, file_modified_timestamp, file_size )
    #specialized version for Windows.
    def ListFilesWin32(path, relpath, result):
        if path:
            path += os.sep
        try:
            files = FindFilesWin32(path)
        except EnvironmentError:
            traceback.print_exc()
            print >> sys.stderr, 'warning: skipping "' + path.encode('UTF-8') + '"'
            return result
        for (fn, ts, sz, isdir) in files:
            fullfn = path + fn
            relpath2 = (relpath + os.sep if relpath else '') + fn
            if isdir:
                ListFilesWin32(fullfn, relpath2, result)
            else:
                result.append((relpath2, ts, sz))
        #soft-links are skipped
        return result
    _listfiles = ListFilesWin32
else: # non-windows version of mk_hard_link and _listfiles
    mk_hard_link = os.link
    _listfiles = _listfiles_py
#recursively enumerates files in directory
#returns list of tuples ( relative_path, file_modified_timestamp, file_size )
def listfiles(path):
    """Public entry point: list all regular files under 'path' recursively."""
    # unicode() (Python 2) forces unicode file names out of the OS listing
    return _listfiles(unicode(path), u'', [])
def md5_for_file(filename):
    """Return the hex MD5 digest of the file's contents, read in 1 MiB chunks."""
    # pylint: disable=E1101
    digest = hashlib.md5()
    with open(filename, 'rb') as stream:
        # iter() with a sentinel stops as soon as read() returns empty bytes
        for chunk in iter(lambda: stream.read(2**20), b''):
            digest.update(chunk)
    return digest.hexdigest()
#read text file, and load it to dictionary {filename -> (md5, file_size, file_modified_timestamp)}
def load_hashes(hf_txt):
    """Parse a hash_dir_info.txt file into {unicode relpath: (md5, size, mtime)}.

    Best-effort: returns {} when the file is missing or unreadable.
    """
    #different systems can have different file-time-modified format
    if not os.path.isfile(hf_txt):
        return {}
    try:
        res = {}
        with open(hf_txt, 'rb') as f:
            for l in f:
                # line format: md5 \t size \t mtime \t relpath (UTF-8, '/'-separated)
                md5, size, mtime, filename = l.rstrip('\r\n').split('\t', 3)
                # stored '/' separators are converted back to the local os.sep
                res[filename.replace('/', os.sep).decode('UTF-8')] = (md5, int(size), int(mtime))
        return res
    except EnvironmentError:
        traceback.print_exc()
        return {}
# pretty format of file size, '81731298312' -> '76.1GB'
def filesize_fmt(num):
    """Render a byte count as a short human-readable string, e.g. '76.1GB'."""
    value = num
    for unit in ('bytes', 'KB', 'MB', 'GB'):
        if -1024.0 < value < 1024.0:
            return "%3.1f%s" % (value, unit)
        value /= 1024.0
    # anything that survived four divisions is reported in terabytes
    return "%3.1f%s" % (value, 'TB')
# recalculates hashes of files, timestamp of which changed
# also calls 'file_processed_callback' on each file (to
# have fresh data from OS file-system cache).
def get_hashes(dir, prev_info, file_processed_callback):
    """Return a sorted list of (relpath, md5, size, mtime) for files under 'dir'.

    prev_info maps relpath -> (md5, size, mtime) from an earlier run; a cached
    md5 is reused only when both size and mtime are unchanged.
    file_processed_callback(relpath, md5, size, mtime), when not None, is
    invoked immediately after each file is hashed.
    """
    print >> sys.stderr, 'info: traversing', repr(dir)
    current_files = listfiles(dir)
    result = []
    # pass 1: decide which files may reuse the cached md5 (None = must rehash)
    for relpath, mtime, size in current_files:
        prev_file_info = prev_info.get(relpath)
        md5 = None
        if prev_file_info:
            prev_md5, prev_size, prev_mtime = prev_file_info
            if prev_size == size and prev_mtime == mtime:
                md5 = prev_md5
        result.append((relpath, md5, size, mtime))
    # report how much data actually needs rehashing
    recalc_count = 0
    recalc_size = 0
    total_size = 0
    for relpath, md5, size, mtime in result:
        total_size += size
        if md5 == None:
            recalc_count += 1
            recalc_size += size
    print >> sys.stderr, 'info: found %d files of size %s, need to rehash %d files of size %s' % (
        len(result), filesize_fmt(total_size),
        recalc_count, filesize_fmt(recalc_size))
    tmp = []
    #todo: make incremental, to allow restart from interrupted point
    # pass 2: compute missing hashes and notify the callback per file
    for (relpath, md5, size, mtime) in result:
        if md5 == None:
            md5 = md5_for_file(os.path.join(dir, relpath))
        tmp.append((relpath, md5, size, mtime))
        if file_processed_callback:
            #after md5_for_file file is in the OS file cache, copy it immediately
            file_processed_callback(relpath, md5, size, mtime)
    result = tmp
    result.sort()
    return result
def write_hashes(hashes_data_list, hf_txt):
    """Write (relpath, md5, size, mtime) tuples to hf_txt, one TAB-separated
    line per file; path separators are stored as '/'. The data goes to a
    '.tmp' file first and is then renamed over the old file."""
    with open(hf_txt + '.tmp', 'wb') as f:
        for relpath, md5, size, mtime in hashes_data_list:
            print >> f, '%s\t%s\t%s\t%s' % (md5, size, mtime, relpath.encode('UTF-8').replace(os.sep, '/'))
    if os.path.exists(hf_txt):
        # on Windows os.rename cannot overwrite an existing file
        os.unlink(hf_txt)
    os.rename(hf_txt + ".tmp", hf_txt)
# call 'get_hashes' on directory, and put in file with fresh hashes
# see also 'get_hashes' function
def hash_dir(dir, file_processed_callback = None):
    """Hash all files under 'dir', refresh dir/hash_dir_info.txt, return the list."""
    hf_txt = os.path.join(dir, 'hash_dir_info.txt')
    prev_info = load_hashes(hf_txt)
    with open(hf_txt + '.tmp', 'wb') as _f:
        #check we can open the file to write
        pass
    start = time.time()
    result = get_hashes(dir, prev_info, file_processed_callback)
    print >> sys.stderr, 'info: hashing', repr(dir), int(time.time() - start), 'seconds'
    write_hashes(result, hf_txt)
    return result
# Put md5 hashes from dst/**/* to file dst/'hash_dir_info.txt'.
# for files which are in the 'dst_rel_path2md5' md5 is taken from dst_rel_path2md5.
# Otherwise hash is recalculated
def write_dst_hashes(dst, dst_rel_path2md5):
    """Write dst/hash_dir_info.txt covering every file currently under dst."""
    entries = []
    for (relpath, mtime, size) in listfiles(dst):
        known_md5 = dst_rel_path2md5.get(relpath)
        if known_md5 is None:
            # file was not produced by this run (or callback failed) - rehash it
            known_md5 = md5_for_file(os.path.join(dst, relpath))
        entries.append((relpath, known_md5, size, mtime))
    write_hashes(sorted(entries), os.path.join(dst, 'hash_dir_info.txt'))
# Recursively deletes file tree,
# handles some real-life problems with unremovable files
def my_rmtree(dir):
    """Remove 'dir' recursively, retrying once after clearing the read-only bit."""
    def _on_rm_error(func, failed_path, _excinfo):
        # e.g. read-only files on Windows: make writable, then retry the operation
        os.chmod(failed_path, stat.S_IWRITE)
        func(failed_path)
    shutil.rmtree(unicode(dir), onerror=_on_rm_error)
# some checks of invalid arguments and access rights
def backup_fast_checks(dst_prev, dst_tmp, srcs):
    """Fail fast before long work: sources are directories, dst_tmp is
    writable, and no two sources share a basename."""
    assert dst_prev == None or os.path.isdir(dst_prev), dst_prev
    for src in srcs:
        if not os.path.isdir(src):
            print '"%s" is not a directory!' % src
            exit(1)
    # probe write access by creating and deleting a marker file
    open(os.path.join(dst_tmp, "test_dir_is_writable"), "wb").close()
    os.unlink(os.path.join(dst_tmp, "test_dir_is_writable"))
    # two sources with the same basename would land in the same target subdir
    get_src_basenames = sorted([(get_src_basename(src), src) for src in srcs])
    for src_base, srcs_grouped in itertools.groupby(get_src_basenames, key = lambda (base,s): base):
        srcs_grouped = list(x[1] for x in srcs_grouped)
        if len(srcs_grouped) >= 2:
            raise Exception("name '%s' duplicated for %s" % (src_base, srcs_grouped))
# get basename of file. Tolerate trailing /// slashes and Windows drive letter.
def get_src_basename(x):
    """Basename of a source path; tolerates trailing slashes and bare 'X:' drives."""
    # sanity-check the normpath behaviour this function relies on
    assert os.path.normpath('xxx///') == 'xxx'
    if os.name == 'nt':
        assert os.path.normpath('x://\\//\\') == 'x:\\'
        assert os.path.normpath('xxx//\\//\\') == 'xxx'
    normalized = os.path.normpath(x).rstrip(os.sep)
    if os.name == 'nt' and normalized.endswith(':'):
        normalized = normalized[:-1] #remove ':' from "drive:"
    return os.path.basename(normalized)
#convert drive path to dir path on Windows
def win_drive_to_dir(x):
    """On Windows, turn a bare drive 'X:' into 'X:\\'; otherwise return x unchanged."""
    # http://stackoverflow.com/questions/7258993/pythons-os-listdir-behaviour-on-windows
    # path 'X:' means 'current directory on drive X', which is not the root 'X:\\'
    if os.name == 'nt' and x.endswith(':'):
        return x + '\\'
    return x
#load mapping from md5 to filename in previous backup (to hardlink them in future)
def load_prev_dst_hashes(dst_prev):
    """Return {md5: full path inside previous backup}, or {} when dst_prev is None."""
    print >> sys.stderr, 'loading prev_dst="%s" hashes...' % dst_prev
    if (dst_prev == None):
        return {}
    dst_prev_hash_fn = os.path.join(dst_prev, "hash_dir_info.txt")
    if not os.path.isfile(dst_prev_hash_fn):
        # missing info file is tolerated: get_hashes below rehashes everything
        print >> sys.stderr, "warning: there is no '%s'" % dst_prev_hash_fn
    dst_prev_hashes = get_hashes(dst_prev, load_hashes(dst_prev_hash_fn), None)
    md5to_old = {}
    # avoid randomization of hardlinks to files with same content - sort by name, select last
    for (relpath, md5, _size, _mtime) in sorted(dst_prev_hashes):
        md5to_old[md5] = os.path.join(dst_prev, relpath)
    return md5to_old
# create hardlinks to dst_prev or copy srcs files to dst_tmp.
# if dst_tmp is 'x/y/z' and sourcedir is 'a/b/c',
# then 'a/b/c' is copied to 'x/y/z/c'
def backup_impl(dst_prev, dst_tmp, srcs):
    """Copy each source tree into dst_tmp, hardlinking files whose md5 already
    exists in the previous backup dst_prev (None means no previous backup).
    Finishes by writing dst_tmp/hash_dir_info.txt."""
    srcs = [win_drive_to_dir(src) for src in srcs]
    if not os.path.isdir(dst_tmp):
        os.makedirs(dst_tmp)
    backup_fast_checks(dst_prev, dst_tmp, srcs)
    md5to_old = load_prev_dst_hashes(dst_prev)
    # maps relpath inside dst_tmp -> md5, filled by the per-file callback
    dst_rel_path2md5 = {}
    for src in srcs:
        print >> sys.stderr, 'info: copying src="%s"...' % src
        src_base = get_src_basename(src)
        current_dst = os.path.join(dst_tmp, src_base)
        if os.path.exists(current_dst):
            # leftover from an interrupted run - start this source from scratch
            print >> sys.stderr, 'warning: deleting files from backup', repr(current_dst)
            my_rmtree(current_dst) #todo: don't delete all files, track already copied to dst_tmp in previous run to BD
        if not os.path.isdir(current_dst):
            os.makedirs(current_dst)
        created_dirs = set() # don't check known directories
        # invoked by hash_dir for every hashed file, while it is hot in the OS cache;
        # closes over src, src_base, current_dst of the current loop iteration
        def file_processed_callback(relpath, md5, size, _mtime):
            full_dst_fn = os.path.join(current_dst, relpath)
            full_dst_dir = os.path.dirname(full_dst_fn)
            if full_dst_dir not in created_dirs:
                if not os.path.isdir(full_dst_dir):
                    os.makedirs(full_dst_dir)
                created_dirs.add(full_dst_dir)
            try:
                if size == 0:
                    #NTFS hard-links limit = 1023, empty file is most frequent
                    open(full_dst_fn, 'wb').close()
                elif md5 in md5to_old:
                    #print >> sys.stderr, 'verbose: take cached', repr(relpath) #lot of spam
                    mk_hard_link(md5to_old[md5], full_dst_fn)
                else:
                    if dst_prev != None: #a lot of copies, when run first time
                        print >> sys.stderr, 'verbose: full copy', repr(relpath)
                    shutil.copy(os.path.join(src, relpath), full_dst_fn)
                dst_rel_path = os.path.join(src_base, relpath)
                dst_rel_path2md5[dst_rel_path] = md5
            except EnvironmentError:
                # best effort: an uncopyable file is reported and skipped
                traceback.print_exc()
                print >> sys.stderr, 'warning: skipping', repr(src) + repr(relpath)
        hash_dir(src, file_processed_callback)
    write_dst_hashes(dst_tmp, dst_rel_path2md5)
    #todo: start validation here, that dst hard-drive can be read back (protection against bad-blocks).
# calls 'backup_impl' function. See also 'backup_impl'.
# Measures time, creates backup.complete file, moves 'dst.tmp' directory to 'dst'
def backup(dst_prev, dst, srcs):
    """Run one backup into 'dst' (must not exist yet).

    Work happens in dst+'.tmp'; the final rename plus the backup.complete
    marker file mark the backup as finished successfully.
    """
    st = time.time()
    print >> sys.stderr, 'info: starting backup from "%s" to "%s" for sources=%s' % (dst_prev, dst, srcs)
    if os.path.exists(dst):
        print >> sys.stderr, 'ERROR! There is already directory "%s". Backup aborted, to avoid old backup corruption.' % dst
        exit(1)
    backup_impl(dst_prev, dst + ".tmp", srcs)
    os.rename(dst + ".tmp", dst)
    backup_complete_file = os.path.join(dst, 'backup.complete')
    with open(backup_complete_file, 'wb') as f:
        print >> f, 'last backup time: "%s"' % time.time()
    print >> sys.stderr, 'info: done backup from "%s" to "%s" for sources=%s' % (dst_prev, dst, srcs)
    print >> sys.stderr, 'info: total time: %s seconds' % (int(time.time() - st))
# select directory with most recent root/*/backup.complete file.
def select_last_previous_backup(root):
    """Return the subdirectory of 'root' with the newest backup.complete file,
    or None when no completed backup exists."""
    root = win_drive_to_dir(root)
    alternatives = []
    if not os.path.isdir(root):
        print '"%s" is not a directory!' % root
        exit(1)
    for dir in os.listdir(root):
        dst_prev = os.path.join(root, dir)
        if os.path.isdir(dst_prev):
            backup_complete_file = os.path.join(dst_prev, 'backup.complete')
            if os.path.isfile(backup_complete_file):
                with open(backup_complete_file) as f:
                    try:
                        m = re.match('last backup time: "(.*)"$', f.read().strip())
                        backup_time = float(m.group(1))
                        alternatives.append((backup_time, dst_prev))
                    except:
                        # deliberately broad: any parse failure (no regex match ->
                        # AttributeError, bad float -> ValueError) disqualifies
                        # this directory instead of aborting
                        traceback.print_exc()
                        print >> sys.stderr, 'warning: corrupted "%s"' % backup_complete_file
    dst_prev = None
    if alternatives:
        # max over (timestamp, path) tuples picks the most recent backup
        _backup_time, dst_prev = max(alternatives)
        print >> sys.stderr, 'info: directory "%s" is used as previous backup' % dst_prev
    else:
        print >> sys.stderr, 'info: no previous (with backup.complete file) backup found at "%s/*/backup.complete", full copy will be performed' % root
    return dst_prev
def main():
    """Entry point: argv = [prog, destination_root, src1, ..., srcN]."""
    if len(sys.argv) < 3:
        prog_name = sys.argv[0] if sys.argv else ''
        print >> sys.stderr, USAGE.replace("prog_name", prog_name)
        exit(1)
    root = sys.argv[1]
    srcs = sys.argv[2:]
    # each run writes into its own timestamped directory under root
    dst_basename = time.strftime("backup_%Y-%m-%d_%H-%M-%S")
    backup(select_last_previous_backup(root), os.path.join(win_drive_to_dir(root), dst_basename), srcs)
#compare results of ListFilesWin32 and _listfiles_py, write output to file, and print time.
def my_listdir_functions_test():
    """Developer-only check (Windows): dump both listing implementations of c:
    to b.txt/a.txt for diffing, and print how long each took."""
    for lf, out in [(ListFilesWin32, 'b.txt'), (_listfiles_py, 'a.txt')]:
        t = time.time()
        with open(out, 'wb') as f:
            path = u"c:"
            for relpath, mtime, size in lf(unicode(path), u'', []):
                # normalize separators so outputs of both versions can be diffed
                relpath = relpath.replace('\\', '/').encode('UTF-8')
                if 'test/string/trunk/bin/backup' not in relpath: #my development source directory
                    print >> f, '%s\t%s\t%s' % (relpath, mtime, size)
        print >> sys.stderr, 'TIME:', lf, time.time() - t
# script entry point
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment