Skip to content

Instantly share code, notes, and snippets.

@apirogov
Created September 7, 2022 18:08
Show Gist options
  • Save apirogov/c2253daf105d813349c6ae471e97a2d7 to your computer and use it in GitHub Desktop.
Save apirogov/c2253daf105d813349c6ae471e97a2d7 to your computer and use it in GitHub Desktop.
Script to fix moved/renamed files before running rsync to avoid useless retransmissions
#!/usr/bin/env python3
"""
Fix moved/renamed files before running rsync to avoid useless retransmissions.
Copyright (C) 2022 Anton Pirogov, licensed under the MIT License
"""
from itertools import chain
from typing import List, Dict, Optional
from dataclasses import dataclass
from pathlib import Path
from datetime import datetime
from shlex import quote
import argparse
import subprocess
# Name of the checksum program that hashsum() invokes, either directly or
# through the remote shell command.
HASHSUM_TOOL = "sha256sum" # NOTE: must be available on both SRC and DEST host
# Module-wide verbosity flag; main() overwrites it from the -v/--verbose option.
verbose: bool = False
def arg_parser():
    """Build the command-line parser.

    Options: -v/--verbose and -s/--script (boolean flags), -e/--rsh
    (default "ssh"), -f/--filter-args (default ""), followed by the two
    positional arguments SRC and DEST.
    """
    cli = argparse.ArgumentParser(description=__doc__, prog=Path(__file__).name)

    # Both switches are plain on/off flags that default to off.
    switch = {"action": "store_true", "default": False}
    cli.add_argument("-v", "--verbose", **switch)
    cli.add_argument(
        "-s",
        "--script",
        help="Just output script executing detected moves ('dry run')",
        **switch,
    )

    cli.add_argument(
        "-e", "--rsh", default="ssh", help="rsync remote shell command to use"
    )
    cli.add_argument(
        "-f", "--filter-args", default="", help="rsync args affecting file list"
    )

    for positional in ("SRC", "DEST"):
        cli.add_argument(positional)
    return cli
def main():
    """Command-line entry point.

    Parses the arguments, computes the moves/copies that bring already
    transferred files in DEST to their new paths, prints each resulting
    shell command and, unless --script was given, executes it.

    Exits through ``parser.error`` (usage message, exit status 2) when SRC
    or DEST ends with a slash.
    """
    global verbose
    parser = arg_parser()
    args = parser.parse_args()
    verbose = args.verbose

    # Validate explicitly instead of using `assert`: assertions are removed
    # when Python runs with -O, which would silently accept bad input.
    for label, value in (("SRC", args.SRC), ("DEST", args.DEST)):
        if value.endswith("/"):
            parser.error(f"{label} must not have trailing slash!")

    if verbose:
        print("---- find moved/renamed files: ----")
    moves = compute_moves(args)

    if verbose:
        print("-------- required actions: --------")
        for cmd, src, trg in moves:
            print(cmd, quote(src), quote(trg))
        print("-------- perform actions: --------")

    for move in moves:
        cmd = dst_cmd(args, *move)
        print(cmd)  # always show cmd! if something goes wrong, it helps debugging
        if not args.script:
            run(cmd)

    if verbose:
        print("-----------------------------------")
# ----
@dataclass
class FileInfo:
    """One entry of an rsync file listing, plus an optional content key."""

    # parsed from output
    filename: str
    permissions: str
    size: int
    timestamp: datetime

    # for candidates:
    # additionally computed hashsum, or the size if it is unique in file set
    # (defaults to None, i.e. "not determined yet")
    hashsum: Optional[str] = None

    @property
    def is_dir(self) -> bool:
        """True when the permission string starts with "d"."""
        # startswith() also copes with an empty permission string,
        # where indexing [0] would raise IndexError.
        return self.permissions.startswith("d")
def run(cmd, check=True) -> bytes:
    """Execute `cmd` through the shell and hand back its captured stdout.

    When the module-level `verbose` flag is set, the command is echoed
    first. `check` is forwarded to `subprocess.run`.
    """
    if verbose:
        print(f"run: {cmd}")
    completed = subprocess.run(
        cmd,
        shell=True,
        check=check,
        capture_output=True,
    )
    return completed.stdout
def rsync_list_cmd(args, dir: str) -> str:
    """Return the shell command that lists rsync candidate files below `dir`.

    `args.filter_args` is inserted verbatim because it is a string of
    several rsync options. The remote-shell command and the directory are
    shell-quoted, so values containing spaces or quote characters reach
    rsync as single arguments.
    """
    return (
        f"rsync {args.filter_args} -e {quote(args.rsh)} "
        f"--list-only {quote(dir)}"
    )
def rsync_list(args, dir) -> Dict[str, FileInfo]:
    """Return a FileInfo for each rsync candidate below `dir`, keyed by listed path.

    Runs the command built by `rsync_list_cmd` and parses each output line
    as "<permissions> <size> <date> <time> <path>". Lines with fewer than
    five whitespace-separated fields (e.g. blank lines) are skipped. A line
    whose size or timestamp cannot be parsed still raises ValueError, so an
    unexpected listing format is not silently ignored.
    """
    listing = run(rsync_list_cmd(args, dir)).decode("utf-8")

    ret = {}
    for line in listing.splitlines():
        fields = line.split(maxsplit=4)
        if len(fields) < 5:
            continue
        permissions, size, date, time, path = fields

        # rsync groups digits with "," or ".", depending on the locale;
        # drop either separator before converting.
        sz = int(size.replace(",", "").replace(".", ""))
        timestamp = datetime.strptime(f"{date} {time}", '%Y/%m/%d %H:%M:%S')
        ret[path] = FileInfo(Path(path).name, permissions, sz, timestamp, hashsum=None)
    return ret
def hashsum(args, path, is_src: bool=True) -> Optional[str]:
    """Compute hashsum of a file (at source or target).

    `path` is a path as listed by rsync, i.e. it starts with the name of
    the SRC directory. On the source side it is therefore resolved against
    the parent of SRC; on the target side against DEST itself. This holds
    whether the respective root is local or remote ("host:dir").

    Returns the first whitespace-separated token printed by HASHSUM_TOOL,
    or None when the command produced no output.
    """
    from pathlib import PurePosixPath  # remote paths are always POSIX-style

    root = args.SRC if is_src else args.DEST
    is_local = ":" not in root.split("/")[0]

    if is_local:
        base = Path(root).parent if is_src else Path(root)
        cmd = f"{HASHSUM_TOOL} {quote(str(base / path))}"
    else:
        # Split on the first colon only; the remote directory may contain colons.
        remote_host, remote_dir = root.split(":", 1)
        base = PurePosixPath(remote_dir)
        if is_src:
            base = base.parent
        remote_cmd = f"{HASHSUM_TOOL} {quote(str(base / path))}"
        cmd = f"{args.rsh} {remote_host} {quote(remote_cmd)}"

    # check=False: a failing hash command yields empty output instead of raising.
    ret = run(cmd, check=False)
    if not ret:
        return None
    return ret.decode("utf-8").split()[0]
def dst_cmd(args, cmd, src_path, trg_path) -> str:
    """Build the shell command that applies one move/copy inside DEST.

    `cmd` is the program to run (e.g. "mv" or "cp"). `src_path` and
    `trg_path` are paths as listed by rsync for the destination, so they
    are resolved against DEST itself. The command first creates the target
    directory and then runs `cmd -n` (no-clobber). For a remote DEST
    ("host:dir") the whole command is quoted and passed to `args.rsh`.
    """
    from pathlib import PurePosixPath  # remote paths are always POSIX-style

    root = args.DEST
    is_local = ":" not in root.split("/")[0]
    remote_host, remote_dir = (None, None)
    if not is_local:
        # Split on the first colon only; the remote directory may contain colons.
        remote_host, remote_dir = root.split(":", 1)

    def fullpath(path):
        # Listed paths are relative to DEST, not to DEST's parent.
        if is_local:
            return str(Path(root) / path)
        return str(PurePosixPath(remote_dir) / path)

    src, trg = fullpath(src_path), fullpath(trg_path)
    trgdir = trg.rpartition("/")[0] or "."
    shell_cmd = f"mkdir -p {quote(trgdir)} && {cmd} -n {quote(src)} {quote(trg)}"

    if is_local:
        return shell_cmd
    return f"{args.rsh} {remote_host} {quote(shell_cmd)}"
# ----
def keep_with_hashsum(info_dict):
    """Return a new dict holding only the entries whose `hashsum` is truthy."""
    kept = {}
    for path, info in info_dict.items():
        if info.hashsum:
            kept[path] = info
    return kept
def classify_paths(src_files, dst_files):
    """Split all known paths into (added, modified, removed) sets.

    added:    present only in `src_files`
    removed:  present only in `dst_files`
    modified: present in both, but differing in file/dir kind, in size, or
              (when both sides carry a hashsum) in hashsum
    """
    src_paths, dst_paths = set(src_files), set(dst_files)
    added = src_paths - dst_paths
    removed = dst_paths - src_paths

    modified = set()
    for path in src_paths & dst_paths:
        ours, theirs = src_files[path], dst_files[path]

        # file -> dir or dir -> file means the entity itself changed
        kind_changed = ours.is_dir != theirs.is_dir
        # A differing size could still be part of a linear/circular move.
        # For big files an accidental size match is very unlikely, so size is
        # almost as good as a hashsum; small files are transferred fast anyway.
        size_changed = ours.size != theirs.size
        # a differing hashsum means a different file with very high likelihood
        hash_changed = (
            bool(ours.hashsum)
            and bool(theirs.hashsum)
            and ours.hashsum != theirs.hashsum
        )

        if size_changed or kind_changed or hash_changed:
            modified.add(path)

    return (added, modified, removed)
def hash_to_path_dict(fileinfos: Dict[str, FileInfo]) -> Dict[str, List[str]]:
    """Invert `fileinfos` into a hashsum -> list-of-paths mapping.

    Entries whose hashsum is None are left out. Paths sharing a hashsum
    appear in the iteration order of `fileinfos`.
    """
    by_hash = {}
    for path, info in fileinfos.items():
        digest = info.hashsum
        if digest is not None:
            by_hash.setdefault(digest, []).append(path)
    return by_hash
def resolve_moves(src_files, dst_files, added, modified, removed):
    """Turn path classifications into an ordered list of [cmd, from, to] steps.

    A hashsum that belongs both to a new/modified source path and to a
    removed/modified target path is treated as a moved file. The returned
    list holds the direct moves first, then the preparatory moves (to
    "_tmp" / "_old" names), then the final moves and copies.
    """
    MV_CMD, CP_CMD = "mv", "cp"

    # reverse lookup of paths by hashsum
    src_by_hash = hash_to_path_dict(src_files)
    dst_by_hash = hash_to_path_dict(dst_files)

    # hashsums of new or modified paths in source ...
    wanted = {src_files[p].hashsum for p in added.union(modified)}
    # ... and of removed or modified paths in target
    available = {dst_files[p].hashsum for p in removed.union(modified)}

    direct, before, after = [], [], []
    # hashsums in both sets: content exists in target under a different path
    for digest in wanted.intersection(available):
        origin = dst_by_hash[digest][0]  # any file with that hashsum is ok
        candidates = src_by_hash[digest]
        target, duplicates = candidates[0], candidates[1:]

        if target not in dst_files:
            direct.append([MV_CMD, origin, target])
        elif target in src_files:
            # target path is occupied in the destination: go through a temp name
            parked = target + "_tmp"
            before.append([MV_CMD, origin, parked])
            after.append([MV_CMD, parked, target])

        for dup in duplicates:
            if dup in dst_files:
                before.append([MV_CMD, dup, dup + "_old"])
            after.append([CP_CMD, target, dup])

    # return sequence of shell commands in correct order
    return direct + before + after
def compute_moves(args):
    """Return the [cmd, from, to] steps that replay source moves/renames in DEST.

    Lists the rsync candidates of SRC and of the SRC-named directory inside
    DEST, gives each entry a content key (its size when that size belongs
    to a single path, otherwise a real hashsum for entries that look
    added/removed/modified), and passes the keyed entries to
    `resolve_moves`.
    """
    # Name of the SRC directory as it appears inside DEST. A remote SRC
    # without any slash ("host:dir") has its name after the colon.
    src_name = args.SRC.split("/")[-1]
    if "/" not in args.SRC and ":" in args.SRC:
        src_name = args.SRC.split(":", 1)[1]

    # get list of candidates both locally and on remote
    DST = f"{args.DEST}/{src_name}"
    src_files, dst_files = rsync_list(args, args.SRC), rsync_list(args, DST)

    # figure out which file size is unique
    sizes = {}  # size -> set of paths having that size
    for path, info in chain(src_files.items(), dst_files.items()):
        sizes.setdefault(info.size, set()).add(path)

    # use file size as hashsum replacement if it is unique
    unique_sizes = {size for size, names in sizes.items() if len(names) < 2}
    for info in chain(src_files.values(), dst_files.values()):
        if info.size in unique_sizes:
            info.hashsum = str(info.size)

    # Compute real hashsums for files that might have been moved: those that
    # exist only in src/dest or that look modified (diff. size).
    c_add, c_mod, c_rem = classify_paths(src_files, dst_files)
    _hash_candidates(args, src_files, c_add | c_mod, is_src=True)
    _hash_candidates(args, dst_files, c_rem | c_mod, is_src=False)

    # now do re-classification for remaining candidates with hashsum
    src_files, dst_files = keep_with_hashsum(src_files), keep_with_hashsum(dst_files)
    add, mod, rem = classify_paths(src_files, dst_files)
    return resolve_moves(src_files, dst_files, add, mod, rem)


def _hash_candidates(args, files, paths, is_src):
    """Fill in `hashsum` for `paths` in `files`, skipping dirs and keyed entries."""
    for path in paths:
        info = files[path]
        if info.is_dir or info.hashsum:
            # Diagnostic only: keep it out of non-verbose stdout so that the
            # --script output stays a clean list of commands.
            if verbose:
                print(f"{path}: skip (dir or unique file size)")
            continue
        info.hashsum = hashsum(args, path, is_src=is_src)
# ----
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
@apirogov
Copy link
Author

apirogov commented Sep 7, 2022

Usage:

I have something like this in my bashrc:

# Example wrapper: first replay moves/renames on the backup target, then run the actual rsync.
function updatebackup() {
  LOCAL_DATA_DIR=~/myfiles
  # NOTE(review): `ssh -p` takes a port number; confirm that $BACKUP_HOST really holds the port.
  RSH_CMD="ssh -p $BACKUP_HOST"
  # Options passed both to rsync-prepare (via -f) and to the real rsync run.
  RSYNC_FILT_ARGS="-aup -FF"
  RSYNC_ARGS="--info=progress2 --delete"
  # NOTE(review): REMOTE_DATA_DIR is not set in this snippet; presumably defined elsewhere.
  rsync-prepare -v -e "$RSH_CMD" -f "$RSYNC_FILT_ARGS" $LOCAL_DATA_DIR $REMOTE_DATA_DIR
  rsync -v -e "$RSH_CMD" $RSYNC_ARGS $RSYNC_FILT_ARGS  $LOCAL_DATA_DIR $REMOTE_DATA_DIR
}

@gbabin
Copy link

gbabin commented Nov 25, 2022

Paths are broken if source is remote and destination is local.

@apirogov
Copy link
Author

Ah well, I did not test that combination (because I only use it for local -> remote).

I'd love to see that fleshed out into a proper and tested CLI tool, but currently I have no spare time to fix that script or polish it.

If you do some work based on it, would be great if you upload the improved version and link it here :)

@gbabin
Copy link

gbabin commented Dec 3, 2022

I created a fork at https://github.com/gbabin/rsync-prelude and made a few changes.

@openandclose
Copy link

openandclose commented Feb 17, 2023

For me the code is difficult, so I added a new test to make sure.
https://github.com/openandclose/rsync-prelude/tree/myfork

@alexandervlpl
Copy link

This is useful, but there is a production-ready tool built exactly to minimize "useless retransmissions": BorgBackup. It even works on a sub-file level! You can move, rename, AND change only 3 bytes in a 10GB file and Borg will only copy the one chunk that changed and nothing else. There's nothing better. 🤩

@apirogov
Copy link
Author

My use-case for this is simply syncing the contents of my laptop hard-drive to an external harddrive connected to a raspberry pi in my home network, which I do sporadically and run it by hand. I don't want to overengineer a complex personal backup system I become dependent on and have to maintain or fix when something does not work, I want something dead-simple here.

If you need serious backup with a way to revert to past old states, sure - other tools are definitely better suited.
For everyone else, also see this discussion.

@Konfekt
Copy link

Konfekt commented May 17, 2024

@alexandervlpl I was under the impression that borg works with repositories, so that you need to mount them to access the files, whereas rsync simply syncs files, readily accessible

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment