@charles-dyfis-net
Created June 17, 2020 16:28
Borg reflink deduplication helper scripts

Long ago, I had to restore a Borg backup of a filesystem with heavily deduplicated content -- which, without that deduplication, wouldn't have come close to fitting on the target drive.

To fix this problem, I threw together some scripts that use reflinks to deduplicate the restored content -- more quickly than running bees over it could have done.

(This was a NixOS system, so many files live in a read-only, world-readable store under /nix/store; I added some logic to hardlink those files together when their entire contents match exactly, instead of using reflinks at all. Pull that logic out if you're not comfortable doing that.)

This is what I still have of them. YMMV.
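
Neither script modifies the restored tree directly: the Python script below only prints shell commands on stdout, to be reviewed and then fed to a shell. Its output is a mix of two kinds of commands -- an ln -f when an entire file under the store matches the previous one exactly, and an xfs_io dedupe when only a leading run of chunks matches. The paths and the 1048576-byte length below are invented purely to illustrate the shape of that output:

ln -f -- nix/store/aaaaaaaa-hello-2.10/bin/hello nix/store/bbbbbbbb-hello-2.10/bin/hello
xfs_io -c 'dedupe nix/store/cccccccc-glibc-2.27/lib/libc.so.6 0 0 1048576' home/alice/libc-copy.so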

#!/usr/bin/env python
import errno
import json
import os
import stat
import sys
try:
    from shlex import quote
except ImportError:
    from pipes import quote

def quoteList(argList):
    return ' '.join(quote(str(s)) for s in argList)

if len(sys.argv) > 1:
    input_file = open(sys.argv[1], 'r')
else:
    input_file = sys.stdin
MIN_CHUNK_SIZE=4096
inode = None
filename = ''
last_filename = ''
last_chunkdef = []
last_inode = None
def log(s):
    if 'verbose' in os.environ:
        sys.stderr.write("%s: %s\n" % (filename, s))
        sys.stderr.flush()
for line in input_file:
    line = line.rstrip()
    filename, chunkdef_str = line.split('\t')
    chunkdef = json.loads(chunkdef_str)
    # empty file, ignore this
    if chunkdef == []:
        log("skipping; empty")
        continue
    try:
        # already hardlinked, nothing to do
        inode = os.stat(filename).st_ino
        if inode == last_inode:
            log("already hardlinked")
            continue
    except OSError as e:
        # not restored yet, or deleted
        if e.errno == errno.ENOENT:
            log("ENOENT")
            continue
        else:
            raise  # unknown error, rethrow
    if last_inode is not None:
        # entire file is an exact match, hardlink it (THIS IS ONLY SAFE FOR CONTENT IN THE STORE)
        if chunkdef == last_chunkdef and filename.startswith('nix/store'):
            cmd = quoteList(['ln', '-f', '--', last_filename, filename])
            sys.stdout.write(cmd)
            sys.stdout.write('\n')
            sys.stdout.flush()
        elif 'hardlink_only' in os.environ:
            log("reflinking disabled")
        else:
            # add up the sizes of the leading chunks shared with the previous file
            identical_prefix = 0
            for idx in range(min([len(chunkdef), len(last_chunkdef)])):
                if chunkdef[idx] == last_chunkdef[idx]:
                    identical_prefix += chunkdef[idx][1]
                else:
                    break  # the shared prefix ends at the first mismatching chunk
            # round down to a multiple of the minimum chunk size
            identical_prefix = (identical_prefix // MIN_CHUNK_SIZE) * MIN_CHUNK_SIZE
            if identical_prefix:
                xfs_io_cmd = quoteList(['dedupe', last_filename, 0, 0, identical_prefix])
                cmd = quoteList(['xfs_io', '-c', xfs_io_cmd, filename])
                sys.stdout.write(cmd)
                sys.stdout.write('\n')
                sys.stdout.flush()
    last_inode = inode
    last_chunkdef = chunkdef
    last_filename = filename

# FIXME: Reverse order so the content which can't contain a literal tab (the JSON) goes before the content that can (the filename).
# FIXME: Also add a NUL after that name?
# Note that the sort part of this is *much* faster than the jq logic; yes, even in stream mode.
# Also note that this *absolutely requires* jq 1.6 to work.
build_file_chunk_map_from_archive_dump() {
  jq -nc --stream 'fromstream(2 | truncate_stream(inputs | select(.[0][0] == "_items")))' \
    | jq -r 'select(.chunks? != null) | [.path, (.chunks | tojson)] | @tsv' \
    | sort -t $'\t' -k2,2
}
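
For context, here is a rough sketch of how these pieces might be wired together. The filenames, the restore-target path, and the borg invocation are my assumptions -- the gist doesn't record the exact commands -- so treat this as illustrative only: dump the archive's item metadata as JSON, turn it into the sorted path/chunk-list TSV, run the Python script from the root of the restored tree so its relative paths resolve, and review the generated commands before handing them to a shell.

# dump archive item metadata (JSON containing an "_items" key) -- assumed invocation
borg debug dump-archive /path/to/repo::my-archive archive-dump.json

# build the sorted path<TAB>chunk-list map; assumes the function above lives in chunkmap.sh
. ./chunkmap.sh
build_file_chunk_map_from_archive_dump <archive-dump.json >chunkmap.tsv

# generate ln/xfs_io commands from the root of the restored tree, then inspect and run them
# (set verbose=1 in the environment to get per-file progress on stderr)
cd /mnt/restore-target
verbose=1 python /path/to/make-dedupe-commands.py chunkmap.tsv >dedupe-commands.sh
less dedupe-commands.sh   # sanity-check before running
sh dedupe-commands.sh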