Skip to content

Instantly share code, notes, and snippets.

@Lukey3332
Last active July 15, 2021 18:07
Show Gist options
  • Save Lukey3332/203ea6f30d48323e7bd1d05c16b5da9c to your computer and use it in GitHub Desktop.
Save Lukey3332/203ea6f30d48323e7bd1d05c16b5da9c to your computer and use it in GitHub Desktop.
git annex incremental sync
#!/bin/bash
# Copyright (c) 2021 Lukas Straub <lukasstraub2@web.de>
# Stop at the first failing command; a pipeline counts as failed when any of
# its stages fails.
set -e -o pipefail

# Extra option handed to the incremental sync pass. It stays empty unless
# --fast is given as the first argument.
case "$1" in
  --fast) contentof="--content-of=inc-sync" ;;
  *) contentof= ;;
esac

# All state is kept below .git/, so the current directory has to be the top
# of the work tree.
[[ -d .git ]] || {
  echo "Must be run at the root of a git-annex repo."
  exit 1
}

# Directory holding the per-remote sync markers and the cached config logs.
mkdir -p .git/inc-sync

# Bring the git branches up to date first, without touching file content.
git annex sync -J4 --no-content
# First check which remotes are actually available right now
#
# Results, read by the rest of the script:
#   uuids    - annex UUIDs of the remotes that will be synced
#   commits  - git-annex branch commits recorded after the last successful
#              sync of those remotes (only remotes with a usable record)
#   fullsync - 1 when at least one available remote has no usable record
fullsync=0
uuids=()
commits=()
# The awk stage below emits three lines per remote: name, type, uuid.
while read -r remote; do
  read -r type
  read -r uuid
  if [ -z "$uuid" ]; then
    continue
  fi
  file=".git/inc-sync/${uuid}"
  # No way to test special-remotes, assume they are available
  if [ "$type" = "git" ]; then
    # stdin is redirected so the fetch can never consume the remote list
    # this loop is reading from.
    if ! git fetch "$remote" </dev/null >/dev/null 2>&1; then
      continue
    fi
  fi
  uuids+=("$uuid")
  # `read` fails on an empty or truncated state file; handle that like a
  # missing record instead of letting `set -e` abort the whole script. An
  # empty commit id is equally useless and is not recorded either.
  commit=
  if [ -f "$file" ] && read -r commit <"$file" && [ -n "$commit" ]; then
    commits+=("$commit")
  else
    fullsync=1
  fi
done < <(
  git remote | git annex info --batch --fast |
    awk '/^uuid: |^remote: |^type: /{
      # Collect the value after each "<field>: " prefix; once all three
      # fields of a remote were seen, print them in a fixed order.
      lines[$1] = substr($0, length($1) + 2)
      n++
      if (n == 3) {
        print lines["remote:"] "\n" lines["type:"] "\n" lines["uuid:"]
        n = 0
      }
    }'
)
if [ ${#uuids[@]} -eq 0 ]; then
  echo "No remotes available"
  exit 0
fi
# Without any recorded commit there is nothing to diff against.
if [ ${#commits[@]} -eq 0 ]; then
  fullsync=1
fi
# Check if anything relevant to preferred content expressions changed
#
# Every log listed below is read from the git-annex branch and compared with
# the copy cached in .git/inc-sync/. Logs that do not exist on the branch are
# skipped. Any difference, including a missing cache file, wipes the recorded
# state and forces a full sync.
checklogs=("numcopies.log" "trust.log" "group.log" "preferred-content.log"
  "required-content.log" "group-preferred-content.log" "transitions.log")
printf '%s\n' "${checklogs[@]}" |
  awk '{print "git-annex:" $0 " .git/inc-sync/" $0}' |
  git cat-file --batch='%(rest)' --buffer |
  awk '{
    # Header line printed by cat-file: either the cache path that was passed
    # along as %(rest), or "<object> missing".
    file = $1
    if ($2 == "missing") next
    # The blob follows the header. Read it as one paragraph, which also
    # drops its trailing newline (assumes the log ends with a newline and
    # contains no empty lines).
    stored = ""
    RS = ""; getline stored
    # Read the cached copy line by line and join the lines again. This
    # yields the file content without its final newline on every awk, with
    # no dependence on slurp-mode RS or on substr() with a start index of 0.
    RS = "\n"
    cached = ""; sep = ""
    while ((getline line < file) > 0) {
      cached = cached sep line
      sep = "\n"
    }
    close(file)
    if (stored != cached) exit 1
  }' || {
  rm -f .git/inc-sync/*
  fullsync=1
} # Something changed, do a full sync
# Sync content with every available remote until the git-annex branch stops
# changing.
#
# The outer loop is there since we process remotes sequentially and not all at
# once for each key. Say we process remote A first, then remote B. If the
# preferred-content expression for a key in remote A depends on whether it is
# present on remote B and we copy/drop the key from remote B, we need to
# reevaluate remote A again. So as long as the git-annex branch changes (due
# to location logs) in an iteration, the loop repeats.
#
# Globals:   uuids (read), currentcommit (read)
# Arguments: extra options for `git annex sync` selecting the keys to process
sync_until_stable() {
  local prevcommit="" commit uuid
  while
    commit="$(git rev-parse git-annex)"
    [ "$prevcommit" != "$commit" ]
  do
    prevcommit="$commit"
    for uuid in "${uuids[@]}"; do
      # Make location logs more similar by setting a set timestamp
      # This way, git can dedup most of them
      GIT_ANNEX_VECTOR_CLOCK="$(date '+%s')"
      export GIT_ANNEX_VECTOR_CLOCK
      if git annex sync --only-annex --content "$@" --no-pull "$uuid"; then
        # Remote synced successfully, record the tip of the git-annex
        # branch where it last synced successfully
        echo "$currentcommit" >".git/inc-sync/${uuid}"
      fi
    done
  done
}

# Remove the temporary key links again: unstage and delete them, then remove
# the directory if it is empty. Note that `git reset --hard` also discards any
# other uncommitted change to tracked files.
remove_key_links() {
  git reset --hard
  rmdir inc-sync >/dev/null 2>&1 || true
}

if ((fullsync)); then
  currentcommit="$(git rev-parse git-annex)"
  echo "Falling back to full sync."
  sync_until_stable --all
  # Cache the current config logs so the next run can detect changes to them.
  printf '%s\n' "${checklogs[@]}" |
    awk '{print "git-annex:" $0 " .git/inc-sync/" $0}' |
    git cat-file --batch='%(rest)' --buffer |
    awk '{
      # Header line: the cache path, or "<object> missing".
      file = $1
      if ($2 == "missing") next
      # The blob follows; paragraph mode reads it up to the empty line.
      RS = ""; getline stored
      print stored > file; close(file)
      RS = "\n"
    }'
else
  # Incremental sync: Only check the keys that were added / whose location
  # changed since the last successful sync.
  lowestcommit="$(git merge-base --octopus "${commits[@]}")"
  currentcommit="$(git rev-parse git-annex)"
  # Did the tip of the git-annex branch actually change since the last time we
  # synced? If not, we don't need to sync, since none of the location logs
  # changed (and no files were added) and thus the preferred-content
  # expressions were already satisfied the last time
  if [ "$lowestcommit" != "$currentcommit" ]; then
    # Ignore ctrl+c
    trap "" INT
    # git-annex-sync doesn't have --batch and even if it did, --batch can't
    # work with keys anyway. As a workaround, link the keys to sync in a
    # directory and use --content-of= to sync only the keys within that
    # directory.
    mkdir inc-sync
    # From here on, clean up even when a later step fails. Otherwise the
    # leftover directory would make every following run die at the mkdir
    # above. The trap is only installed after mkdir succeeded, so a
    # directory this run did not create is never touched.
    trap remove_key_links EXIT
    # Location logs live at xxx/yyy/<key>.log (plus <key>.log.met for
    # metadata): strip the 8 character directory prefix and the suffix to
    # get the key, then emit "<key> <file>" lines for fromkey.
    git diff-tree -r --name-only "$lowestcommit" git-annex |
      awk '/^...\/...\/.*\.log$/ {
             key = substr($0, 9, length($0) - 12)
             print key " inc-sync/" key
           }
           /^...\/...\/.*\.log\.met$/ {
             key = substr($0, 9, length($0) - 16)
             print key " inc-sync/" key
           }' |
      git annex fromkey --force --batch
    git add inc-sync
    # contentof is empty unless --fast was given; pass it only when set.
    sync_until_stable ${contentof:+"$contentof"}
    trap - EXIT
    remove_key_links
  fi
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment