Skip to content

Instantly share code, notes, and snippets.

@mark-kubacki
Last active August 14, 2018 22:00
Show Gist options
  • Save mark-kubacki/12af660744f8f98695bdf44af798d42f to your computer and use it in GitHub Desktop.
Save mark-kubacki/12af660744f8f98695bdf44af798d42f to your computer and use it in GitHub Desktop.
file-level deduplication in O(n) utilizing reflinks
#!/bin/bash
# Attribute every regular file with its hash (aka "digest" here), stored in an
# extended attribute, so that a follow-up deduplication pass can find candidate
# duplicates without re-reading file contents.
#
# Scope: regular files larger than 1 MiB below the current directory, skipping
# every path that contains ".git".
#
# The digest is only valid as long as the file's contents do not change until
# the deduplication has finished. If they do change, the byte-wise 'cmp' in the
# deduplication script fails and no duplicate with unexpected content is made.
# You can safely run this script repeatedly: annotated files are skipped.
#
# Environment:
#   label   name of the extended attribute to write (default: user.digest)
#
# Progress output (stdout):
#   .   file already carries the attribute, skipped
#   +   file has been hashed and annotated
#
# Exit status: 0 on success, 2 if a required tool is missing; any other
# failing command aborts the script (set -e).
set -euo pipefail

for req in b2sum getfattr setfattr; do
  if ! command -v "${req}" &>/dev/null; then
    >&2 printf "command not found: %s\n" "${req}"
    exit 2
  fi
done

: "${label:=user.digest}"

# Filenames arrive NUL-delimited and are read with an empty IFS, so names with
# leading/trailing blanks, backslashes or newlines are passed on unmodified.
while IFS= read -r -d '' -u3 F; do
  if getfattr --only-values -n "${label}" "${F}" &>/dev/null; then
    # has been annotated, skip
    printf "."
    continue
  fi

  # Hash via stdin: when given a filename containing a backslash or newline,
  # b2sum prefixes its output line with '\', which would corrupt the digest.
  digest="$(b2sum --binary --length 64 <"${F}")"
  # Output is "<hex digest> *-"; keep only the part before the first space.
  digest="${digest%% *}"

  setfattr -n "${label}" -v "${digest}" "${F}"
  printf "+"
done 3< <(find . ! -path "*.git*" -type f -size +1M -print0)

printf "\n"
exit 0
#!/bin/bash
# Replace duplicate files by reflink copies of the first file seen with the
# same digest, so that duplicates share their data blocks.
#
# Works on regular files larger than 1 MiB below the current directory
# (skipping every path that contains ".git") which carry a digest in the
# extended attribute named by $label. Candidates are confirmed byte-wise with
# 'cmp' before anything is replaced, so a stale or colliding digest never
# leads to a wrong replacement.
#
# Environment:
#   label   name of the extended attribute holding the digest
#           (default: user.digest)
#
# Progress output:
#   .   (stdout) file has no digest attribute, skipped
#   !   (stdout) digest attribute is empty, skipped
#   °   (stdout) first file with this digest; remembered as the original
#   +   (stdout) duplicate replaced by a reflink copy of the original
#   ~   (stderr) "<file>~" already exists, file skipped
#   C   (stderr) same digest but 'cmp' did not report identical contents
#
# After the pass the script waits for ENTER and then removes the attribute
# from all remembered originals.
set -euo pipefail

: "${label:=user.digest}"

# In case of a collision the infringing file gets skipped.
# map: digest → filename
declare -A flist=()

# Filenames arrive NUL-delimited and are read with an empty IFS, so names with
# leading/trailing blanks, backslashes or newlines are passed on unmodified.
while IFS= read -r -d '' -u3 F; do
  # One getfattr call both detects a missing attribute and fetches its value.
  if ! digest="$(getfattr --only-values -n "${label}" "${F}" 2>/dev/null)"; then
    printf "."
    continue
  fi
  if [[ -z "${digest}" ]]; then
    printf "!"
    continue
  fi

  # The digest is read from an attribute anybody with write access can set.
  # Test membership via ${...+x} because '[[ -v array[key] ]]' evaluates the
  # subscript a second time.
  if [[ -z "${flist[${digest}]+x}" ]]; then
    # new digest, original content, no duplicate
    flist["${digest}"]="${F}"
    printf "°"
    continue
  fi

  original="${flist[${digest}]}"
  if cmp --quiet "${original}" "${F}"; then
    tmp="${F}~"
    if [[ -e "${tmp}" || -L "${tmp}" ]]; then
      >&2 printf "~"
      continue
    fi
    # Now you'd lock the original and $F to prevent a write-after-cmp race
    # condition, but that's not possible in BASH.
    printf "+"
    # 1. Share the original's data blocks (fails if reflinks are unsupported).
    # 2. Copy the duplicate's OWN mode, ownership, timestamps and xattrs onto
    #    the new file without touching its contents, so that replacing the
    #    duplicate does not silently change its permissions or owner.
    # 3. Put the new file in place of the duplicate.
    if ! cp --reflink=always "${original}" "${tmp}" \
      || ! cp -a --attributes-only "${F}" "${tmp}" \
      || ! mv "${tmp}" "${F}"; then
      # Do not leave a stray "<file>~" behind; it would block later runs.
      rm -f -- "${tmp}"
      >&2 printf "\nfailed to replace %s with a reflink copy\n" "${F}"
      exit 1
    fi
  else
    >&2 printf "C"
  fi
done 3< <(find . ! -path "*.git*" -type f -size +1M -print0)

printf "\n"
>&2 printf "Done deduplicating. Press ENTER to de-annotate any originals."
read -r _

# The ${flist[@]+...} guard keeps 'set -u' quiet on older bash versions when
# no file carried a digest and the map is therefore empty.
for F in ${flist[@]+"${flist[@]}"}; do
  if [[ ! -e "${F}" ]]; then
    continue
  fi
  setfattr --remove="${label}" "${F}" >/dev/null
done
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment