Last active
August 14, 2018 22:00
-
-
Save mark-kubacki/12af660744f8f98695bdf44af798d42f to your computer and use it in GitHub Desktop.
file-level deduplication in O(n) utilizing reflinks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Probe whether the filesystem that holds the current directory understands
# reflinks, by cloning a downloaded file and looking for shared extents.
#
# Exit status:
#   0  the clone shares extents with its source (reflinks work)
#   1  cloning failed or no shared extents were reported
#   2  a required command is missing
set -eu

# The idea is to create a large file that will be written to disk and not into the filesystem tree.
# Reflinks work if a copy shares disk space.
# Do not use 'fallocate' or the like, because some filesystems will compress it and store the result in their tree.

# 'filefrag' is from ext2fstools or a similarly named package.
for req in filefrag curl; do
  if ! command -v "${req}" &>/dev/null; then
    >&2 printf "command not found: %s\n" "${req}"
    exit 2
  fi
done

# Work inside a private scratch directory below the current directory: the
# probe still runs on the filesystem under test, but it can neither overwrite
# nor delete a pre-existing file that happens to be called 'sendmail'.
workdir="$(mktemp -d ./reflink-probe.XXXXXX)"

function cleanup {
  rm -rf -- "${workdir:?}"
}
# Installed before the download so that a partial download is removed as well.
trap cleanup EXIT

# This gets us a "large" file of about 4 MiB. ('largefile' is usually a blocksize of 1 MiB)
curl --fail --silent --show-error --location \
  --output "${workdir}/sendmail" \
  https://s.blitznote.com/debs/ubuntu/amd64/sendmail

# With --reflink=always 'cp' fails rather than falling back to a plain copy.
# Under 'set -e' that failure would end the script before the verdict is
# printed, so it is handled explicitly here.
if ! cp --reflink=always -- "${workdir}/sendmail" "${workdir}/sendmail~"; then
  >&2 printf "NO, reflinks don't work here\n"
  exit 1
fi

if filefrag -se "${workdir}/sendmail~" | grep -q -F 'shared'; then
  printf "YES, the filesystem seems to understand reflinks\n"
  exit 0
fi

>&2 printf "NO, reflinks don't work here\n"
exit 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Attribute every regular file with its hash (aka "digest" here)
# assuming that its contents won't change until the deduplication has finished.
# Else a 'cmp' in the next script will fail and no dud with unexpected content will be made.
# You can safely run this script repeatedly.
#
# Environment:
#   label  name of the extended attribute that stores the digest
#          (default: user.digest)
#
# Progress output, one character per file:
#   .  file already carries the attribute and is skipped
#   +  digest computed and stored
#
# Exit status: 2 if a required command is missing; otherwise 0, unless a
# command fails and 'set -e' aborts the run.
set -euo pipefail

for req in b2sum getfattr setfattr; do
  if ! command -v "${req}" &>/dev/null; then
    >&2 printf "command not found: %s\n" "${req}"
    exit 2
  fi
done

: "${label:=user.digest}"

# File names arrive NUL-delimited and are read with an empty IFS, so names
# with leading/trailing whitespace, backslashes or newlines survive unchanged.
while IFS= read -r -d '' -u3 F; do
  if getfattr --only-values -n "${label}" "${F}" &>/dev/null; then
    # has been annotated, skip
    printf "."
    continue
  fi

  # Hash via stdin: when given a file name containing a backslash or newline,
  # b2sum escapes the name and prefixes the whole output line with '\', which
  # would end up inside the stored digest.
  digest="$(b2sum --binary --length 64 <"${F}")"
  # Keep only the hex digest, i.e. everything before the first space.
  digest="${digest%% *}"

  setfattr -n "${label}" -v "${digest}" "${F}"
  printf "+"
# NOTE(review): "*.git*" skips every path that contains ".git" anywhere, not
# only the contents of .git directories — confirm that this is intended.
done 3< <(find . ! -path "*.git*" -type f -size +1M -print0)

printf "\n"
exit 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Replace files whose content duplicates an earlier file by a reflinked copy
# of that earlier file. Only files annotated with a digest in the extended
# attribute named by $label are considered; files sharing a digest are
# compared with 'cmp' before anything gets replaced.
#
# Environment:
#   label  name of the extended attribute that stores the digest
#          (default: user.digest)
#
# Progress output, one character per file:
#   .  file carries no digest attribute, skipped
#   !  digest attribute is empty, skipped
#   °  first file seen with this digest; remembered as the original
#   +  duplicate replaced by a reflinked copy of the original
#   ~  (stderr) a file named "<file>~" is in the way, skipped
#   C  (stderr) same digest but 'cmp' did not report equal content, skipped
set -euo pipefail

: "${label:=user.digest}"

# In case of a collision the infringing file gets skipped.
# map: digest → filename
declare -A flist=()

# File names arrive NUL-delimited and are read with an empty IFS, so names
# with leading/trailing whitespace, backslashes or newlines survive unchanged.
while IFS= read -r -d '' -u3 F; do
  # A single getfattr call both tests for the attribute and fetches its value.
  if ! digest="$(getfattr --only-values -n "${label}" "${F}" 2>/dev/null)"; then
    printf "."
    continue
  fi
  if [[ -z "${digest}" ]]; then
    printf "!"
    continue
  fi

  # Membership test through ${…+set} instead of [[ -v flist[…] ]]: with -v the
  # expanded subscript is evaluated a second time, and the digest is read from
  # a file attribute that whoever owns the file can set freely.
  if [[ -z "${flist[${digest}]+set}" ]]; then
    # new digest, original content, no duplicate
    flist["${digest}"]="${F}"
    printf "°"
    continue
  fi

  original="${flist[${digest}]}"
  if ! cmp --quiet "${original}" "${F}"; then
    >&2 printf "C"
    continue
  fi

  dud="${F}~"
  if [[ -e "${dud}" ]]; then
    >&2 printf "~"
    continue
  fi

  # Now you'd lock the original and $F to prevent a write-after-cmp race condition,
  # but that's not possible in BASH.
  printf "+"

  # Step 1: clone the data of the original. The restrictive umask keeps the
  #         dud private until it receives its final mode in step 2.
  # Step 2: copy mode, ownership, timestamps and extended attributes from the
  #         file being replaced, so that the duplicate keeps its own metadata
  #         rather than inheriting that of the original.
  # On failure the dud is removed; otherwise it would make every later run
  # skip this file with '~'.
  if ! (umask 077 && cp --reflink=always "${original}" "${dud}") \
    || ! cp -a --attributes-only "${F}" "${dud}"; then
    rm -f "${dud}"
    >&2 printf "\ncannot replace '%s' by a reflink of '%s'\n" "${F}" "${original}"
    exit 1
  fi
  mv "${dud}" "${F}"
# NOTE(review): "*.git*" skips every path that contains ".git" anywhere, not
# only the contents of .git directories — confirm that this is intended.
done 3< <(find . ! -path "*.git*" -type f -size +1M -print0)

printf "\n"
>&2 printf "Done deduplicating. Press ENTER to de-annotate any originals."
# Under 'set -e' an end-of-file on stdin ends the script here, leaving the
# annotations in place.
read -r _

for F in "${flist[@]}"; do
  [[ -e "${F}" ]] || continue
  setfattr --remove="${label}" "${F}" >/dev/null
done

exit 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment