-
-
Save Lukey3332/203ea6f30d48323e7bd1d05c16b5da9c to your computer and use it in GitHub Desktop.
git annex incremental sync
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Copyright (c) 2021 Lukas Straub <lukasstraub2@web.de>
#
# Incremental content sync for git-annex.
#
# Usage: run from the root of a git-annex repository.
#   --fast  pass --content-of=inc-sync to `git annex sync` during an
#           incremental sync, restricting it to the temporary inc-sync
#           directory that holds only the keys that changed.
#
# State is kept in .git/inc-sync/: one file per remote UUID holding the
# git-annex branch commit recorded at its last successful sync, plus cached
# copies of the configuration logs that can affect preferred content.
set -o pipefail
set -e

# Optional extra argument for the incremental `git annex sync` call.
contentof=
if [ "$1" = "--fast" ]; then
  contentof="--content-of=inc-sync"
fi

# All paths below are relative, so the working directory must be the repo root.
if ! [ -d .git ]; then
  echo "Must be run at the root of a git-annex repo." >&2
  exit 1
fi

mkdir -p .git/inc-sync

# Sync branches only; content is transferred selectively further down.
git annex sync -J4 --no-content
# First check which remotes are actually available right now
#
# Outputs of this section:
#   uuids    - UUIDs of the remotes considered available
#   commits  - for each such remote with a state file, the commit stored in it
#   fullsync - set to 1 when any available remote has no usable state file
fullsync=0
uuids=()
commits=()
# The awk filter at the bottom prints three lines per remote (name, type,
# uuid); each loop iteration consumes one such triple.
while read -r remote; do
  read -r type
  read -r uuid
  if [ -z "$uuid" ]; then
    continue
  fi
  file=".git/inc-sync/${uuid}"
  # No way to test special-remotes, assume they are available
  if [ "$type" = "git" ]; then
    # A successful fetch is used as the availability probe for git remotes.
    if ! git fetch "$remote" >/dev/null 2>&1; then
      continue
    fi
  fi
  uuids+=("$uuid")
  # An empty or unreadable state file is treated like a missing one (forcing
  # a full sync) instead of letting the failing read abort under `set -e`.
  if [ -f "$file" ] && read -r commit <"$file" && [ -n "$commit" ]; then
    commits+=("$commit")
  else
    fullsync=1
  fi
done < <(git remote | git annex info --batch --fast |
  awk '/^uuid: |^remote: |^type: /{
    lines[$1] = substr($0, length($1) + 2)
    n++
    if (n == 3) {
      print lines["remote:"] "\n" lines["type:"] "\n" lines["uuid:"]
      n = 0
    }
  }')

if [ ${#uuids[@]} -eq 0 ]; then
  echo "No remotes available"
  exit 0
fi
# Without any recorded commit there is nothing to diff against.
if [ ${#commits[@]} -eq 0 ]; then
  fullsync=1
fi
# Check if anything relevant to preferred content expressions changed
#
# Each log on the git-annex branch is compared with the copy cached in
# .git/inc-sync/. The first awk turns a log name into
# "git-annex:<log> .git/inc-sync/<log>"; `git cat-file --batch=%(rest)` then
# prints the cache path as the header line followed by the blob contents, or
# "<object> missing" when the log does not exist on the branch.
checklogs=("numcopies.log" "trust.log" "group.log" "preferred-content.log"
  "required-content.log" "group-preferred-content.log" "transitions.log")
printf '%s\n' "${checklogs[@]}" |
  awk '{print "git-annex:" $0 " .git/inc-sync/" $0}' |
  git cat-file --batch='%(rest)' --buffer |
  awk '{
    file = $1
    if ($2 == "missing") next
    # Reset both buffers so a failed getline cannot reuse the previous log.
    stored = ""; cached = ""
    # Paragraph mode: read the blob up to the blank line that follows it.
    RS = ""; getline stored
    # RS="^$" reads the whole cache file as a single record.
    RS = "^$"; getline cached <file; close(file)
    # Drop the trailing newline that print added when the cache was written.
    # awk strings are 1-indexed; a start of 0 is not portable and can return
    # one character fewer, which would make this comparison always fail.
    cached = substr(cached, 1, length(cached) - 1)
    if (stored != cached) exit 1
    RS = "\n"
  }' || {
  # Something changed, do a full sync
  rm -f .git/inc-sync/*
  fullsync=1
}
# Sync content with every available remote until the git-annex branch settles.
# Arguments: extra options for `git annex sync` (e.g. --all, --content-of=DIR)
# Globals:   uuids (read), currentcommit (read)
#
# The outer loop is there since we process remotes sequentially and not all at
# once for each key. Say we process remote A first, then remote B. If the
# preferred-content expression for a key in remote A depends on whether it is
# present on remote B and we copy/drop the key from remote B, we need to
# reevaluate remote A again. That's what the outer loop does: as long as the
# git-annex branch changes (due to location logs) in each iteration, it repeats.
sync_remotes_until_stable() {
  local prevcommit="" commit uuid
  while commit="$(git rev-parse git-annex)"; [ "$prevcommit" != "$commit" ]; do
    prevcommit="$commit"
    for uuid in "${uuids[@]}"; do
      # Make location logs more similar by setting a set timestamp.
      # This way, git can dedup most of them.
      GIT_ANNEX_VECTOR_CLOCK=$(date '+%s')
      export GIT_ANNEX_VECTOR_CLOCK
      if git annex sync --only-annex --content "$@" --no-pull "$uuid"; then
        # Remote synced successfully, record the tip of the git-annex
        # branch (as captured before syncing) for the next incremental run.
        echo "$currentcommit" >".git/inc-sync/${uuid}"
      fi
    done
  done
}

if ((fullsync)); then
  currentcommit="$(git rev-parse git-annex)"
  echo "Falling back to full sync."
  sync_remotes_until_stable --all

  # Cache the current configuration logs so the next run can detect changes.
  printf '%s\n' "${checklogs[@]}" |
    awk '{print "git-annex:" $0 " .git/inc-sync/" $0}' |
    git cat-file --batch='%(rest)' --buffer |
    awk '{
      file = $1
      if ($2 == "missing") next
      RS = ""; getline stored
      print stored >file; close(file)
      RS = "\n"
    }'
else
  # Incremental sync: Only check the keys that were added / whose location
  # changed since the last successful sync.
  lowestcommit="$(git merge-base --octopus "${commits[@]}")"
  currentcommit="$(git rev-parse git-annex)"
  # Did the tip of the git-annex branch actually change since the last time we
  # synced? If not, we don't need to sync, since none of the location logs
  # changed (and no files were added) and thus the preferred-content
  # expressions were already satisfied the last time.
  if [ "$lowestcommit" != "$currentcommit" ]; then
    # Ignore ctrl+c
    trap "" INT
    # git-annex-sync doesn't have --batch and even if it did, --batch can't
    # work with keys anyway. As a workaround, link the keys to sync in a
    # directory and use --content-of= to sync only the keys within that
    # directory.
    # -p: tolerate a directory left behind by an earlier aborted run.
    mkdir -p inc-sync
    # Changed paths look like "xxx/yyy/<key>.log" or "xxx/yyy/<key>.log.met".
    # The key starts at column 9 (after the 8-character directory prefix);
    # the suffix (4 or 8 characters) is cut off. Output: "<key> inc-sync/<key>".
    git diff-tree -r --name-only "$lowestcommit" git-annex |
      awk '
        /^...\/...\/.*\.log$/ {
          key = substr($0, 9, length($0) - 12)
          print key " inc-sync/" key
        }
        /^...\/...\/.*\.log\.met$/ {
          key = substr($0, 9, length($0) - 16)
          print key " inc-sync/" key
        }' |
      git annex fromkey --force --batch
    git add inc-sync
    sync_remotes_until_stable ${contentof:+"$contentof"}
    # Drop the temporary links again. NOTE: `git reset --hard` also discards
    # any other uncommitted changes to tracked files.
    git reset --hard
    rmdir inc-sync >/dev/null 2>&1 || true
  fi
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment