Last active
April 5, 2024 14:53
-
-
Save donnaken15/f95e8a143bb330fcf7d6268a4d6929e8 to your computer and use it in GitHub Desktop.
Group duplicate files into hardlinks, detecting duplicates by matching SHA-256 hash and file size.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/zsh
# dedupe - replace duplicate copies of the given files with hardlinks.
# Usage: dedupe file1 file2 [file3 ...]

# A single file has nothing to be compared against; say so before the usage text.
if (( $# == 1 )); then
  echo you must specify more than one file to be deduped
  echo
fi

# Fewer than two files: print the usage text and stop with a failure status.
if (( $# < 2 )); then
  echo "dedupe [input files]"
  echo "- replace multiple unchanging"
  echo copies of the same files with
  echo hardlinks to save space
  # NOTE(review): `return` (not `exit`) is kept from the original; presumably so
  # the file can also be sourced or autoloaded as a function - confirm.
  return 1
fi
# Parallel arrays filled by the main loop: hashset holds the hash of every
# distinct file seen so far, baseset holds the path of the first file that
# produced the hash at the same index.
declare -a hashset baseset
hashset=()
baseset=()
# Author's note: they felt something might still be missing from this script,
# even right after finishing it.
# Shorthand for the null device, used to silence command lookups.
NUL=/dev/null
# Fallback path converter, called the same way as the real converters
# (`<cmd> -u <path>`): $1 is the mode flag and is ignored, $2 is the path,
# which is printed back unchanged.
# printf is used instead of echo so that paths containing backslashes or
# looking like echo options (e.g. "-n") are not altered.
wpathfail() { printf '%s\n' "$2"; }
# Print the size of file $1 in bytes (uses GNU stat's -c format option).
# `--` stops option parsing so a path starting with "-" is treated as a file.
fsize() { stat -c '%s' -- "$1"; }
# Print the last line of df's report, i.e. the grand-total row: the source
# column (shown as "total") and the available space, in 1 MiB blocks (not kB).
dsize() {
  df --block-size=M --total --output=source,avail | tail -n 1
}
# Pick the command used to convert arguments to Unix-style paths: cygpath if
# present, otherwise wslpath, otherwise the pass-through fallback wpathfail.
wpath=wpathfail
if command -v cygpath >/dev/null 2>&1; then
  wpath=cygpath
elif command -v wslpath >/dev/null 2>&1; then
  wpath=wslpath
else
  # No converter available: keep the fallback and warn on stderr.
  echo No path converter utility found. 1>&2
fi
# Main pass: hash every argument; the first file seen with a given hash becomes
# the "base", and every later file with the same hash and size is replaced by a
# hardlink to that base. Afterwards print free space before/after and counts.
copycount=0
before=$(dsize)
for ff in "$@"; do
  # Convert the argument to a Unix-style path (unchanged with the fallback).
  f=$("$wpath" -u "$ff")

  # Only regular files can be hashed and hardlinked; directories are rejected.
  if [[ ! -f "$f" ]]; then
    echo "$f is not a file." 1>&2
    continue
  fi

  # sha256sum prints "<hash>  <name>". A file that cannot be hashed must not be
  # recorded at all: adding it to baseset without a matching hashset entry
  # would shift the two arrays apart and pair later duplicates with the wrong
  # base file.
  if ! sum=$(sha256sum -- "$f"); then
    echo "$f could not be hashed, skipping." 1>&2
    continue
  fi
  hash=${sum%% *}
  # GNU sha256sum starts the line with a backslash when the file name had to
  # be escaped; strip only that marker, never a real hash digit.
  hash=${hash#\\}

  # zsh subscript flags: (I) yields the index of the matching element, or 0
  # when there is none; (e) compares as a plain string. Indexes are 1-based.
  check=${hashset[(Ie)$hash]:-0}
  if (( check == 0 )); then
    # First occurrence of this content: remember it as the base for its hash.
    hashset+=("$hash")
    baseset+=("$f")
    continue
  fi

  base=${baseset[$check]}

  # Same path given twice, or already hardlinked together: nothing to do
  # (ln -f would refuse with "are the same file").
  if [[ "$base" -ef "$f" ]]; then
    continue
  fi

  # Extra safety net on top of the hash: sizes must be known and identical.
  fsz=$(fsize "$f")
  bsz=$(fsize "$base")
  if [[ -n "$fsz" && "$fsz" == "$bsz" ]]; then
    # Count and report the duplicate only if the link was actually created.
    if ln -f -- "$base" "$f"; then
      copycount=$((copycount + 1))
      echo "[${hash:0:15}] $(basename -- "$base") <- $f"
    else
      echo "could not link $f to $base" 1>&2
    fi
  else
    echo "$base and $(basename -- "$f") have matching hashes but different size!!!!" 1>&2
  fi
done
after=$(dsize)

# Summary: free space as reported by dsize before and after, then the counts.
echo Free space:
echo "Before: $before"
echo "After : $after"
uniqcount=${#hashset[@]}
echo "Found $uniqcount unique files, $copycount duplicates"
return 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment