Last active
June 10, 2024 11:21
-
-
Save donnaken15/f95e8a143bb330fcf7d6268a4d6929e8 to your computer and use it in GitHub Desktop.
Replace duplicate files with hardlinks. Duplicates are detected by comparing BLAKE2 hashes (SHA-256 as a fallback) and file sizes. Intended primarily for Windows via Cygwin/MSYS2.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/zsh
# dedupe: replace duplicate copies of files with hardlinks.
#
# Argument check: at least two file paths are required. With exactly one
# argument an extra explanation is printed first; with fewer than two the
# usage text is printed and the script exits with status 1.
if [ $# -eq 1 ]; then
  echo you must specify more than one file to be deduped
  echo
fi
if [ $# -lt 2 ]; then
  echo "dedupe [input files]"
  echo "- replace multiple unchanging"
  echo copies of the same files with
  echo hardlinks to save space
  echo "- as of now, it is recommended to"
  echo perform this script only on files
  echo that exist on a singular drive
  exit 1
fi
declare -a hash_params
#----------------------------------------------------------------------------------------
# User configuration.
#
# Each pass processes the entire list of arguments provided.
passes=4
# pass 1, 2: standard, plainly hardlink exact copies of files
# pass 3, 4: check inputs that are already (or just became) hardlinks and merge
#            separate copies of the same file that each have their own group of
#            hardlinks, so that those groups end up as one
#
# If a pass does not link any file, the script stops running further passes.
#
# Hash program prefix: "${hash}sum" is the command that gets executed.
# blake2: security up there with sha-3, plus performance.
hash=b2
# Extra arguments for the hash program: -b reads in binary mode, -l 80 asks
# b2sum for an 80-bit digest (original note: being conservative for big files).
hash_params=(-b -l 80)
#
#----------------------------------------------------------------------------------------
#hash2=sha256 # not using right now
# Sink for unwanted command output.
NUL=/dev/null
# Validate the configuration and the external tools it relies on.

# A non-positive (or non-numeric) pass count falls back to 4.
[ "$passes" -gt 0 ] || {
  echo "Invalid config: passes = ${passes}." 1>&2
  echo "Setting to 4." 1>&2
  passes=4
}
# The group-merge step of the later passes calls fsutil.exe; without it only
# a single plain pass is run.
command -v fsutil.exe >/dev/null 2>&1 || {
  echo "Cannot find fsutil, setting passes to 1." 1>&2
  passes=1
}
# Preferred hash program missing: fall back to sha256sum. The parameter list
# must be reset as well, because sha256sum has no -l (digest length) option
# and would reject the b2sum-specific arguments.
command -v "${hash}sum" >/dev/null 2>&1 || {
  echo "Cannot find ${hash}sum as a hashing program." 1>&2
  hash=sha256
  hash_params=(-b)
  #hash2=sha384
} # && which ${hash2}sum;
# Neither the configured nor the fallback program exists: give up.
command -v "${hash}sum" >/dev/null 2>&1 || {
  echo "Fallback hashing program ${hash}sum does not exist. Aborting..." 1>&2
  exit 1
}
# Parallel arrays filled by the main loop: hashset holds the digest of every
# distinct file seen so far, baseset holds the path of the first file that
# produced the digest at the same index.
hashset=()
#hashset2=()
baseset=()
echo "dedupe ($# files)"
#profile=(date +%s%N)
#profileend() { echo profile script: $(($(($($profile) - $1)) / 1000000000.0)) }
# Pass-through stand-in for a path converter: ignores the conversion flag and
# prints the path unchanged.
# Arguments: $1 - conversion flag (ignored), $2 - path
#            or a single argument, which is taken to be the path
# Outputs:   the path, followed by a newline
# printf is used instead of echo so that backslashes in Windows-style paths
# are never interpreted as escape sequences.
wpathfail() {
  if [ $# -lt 2 ]; then
    printf '%s\n' "$1"
  else
    printf '%s\n' "$2"
  fi
}
# Print the free space summed over all filesystems reported by df, in units
# of 1 MiB with an "M" suffix (the grand-total line added by --total is the
# last line of df's output, hence the tail).
dsize() { df --output=avail --block-size=M --total | tail -n 1; }
# stat invocations stored as arrays; callers append a path and run the result.
# Storing the command this way does not avoid the extra process that each
# command substitution needs.
fsize=(stat -c%s) # file size in bytes
ctime=(stat -c%W) # creation (birth) time, seconds since the epoch
inode=(stat -c%i) # inode number
# Path converter used when cygpath itself is not on PATH but wslpath is:
# the path is first run through `wsl2exe cygpath -w`, and that result is
# handed to wslpath.
# Arguments: $1 - option forwarded to wslpath, $2 - path to convert
#            or a single argument, which is taken to be the path (the script
#            also calls its converter without a flag)
# Outputs:   whatever wslpath prints
wsl_path() {
  if [ $# -lt 2 ]; then
    wslpath "$(wsl2exe cygpath -w "$1")"
  else
    wslpath "$1" "$(wsl2exe cygpath -w "$2")"
  fi
}
# Choose the path converter command stored in $wpath: cygpath when it can be
# found, otherwise the wsl_path wrapper when wslpath exists. With neither
# available the script cannot continue.
wpath=wpathfail # placeholder; always replaced by the assignment below
wpath=$(command -v cygpath 2>/dev/null) || {
  if command -v wslpath >/dev/null 2>&1; then
    wpath=wsl_path
  else
    # wtf to do here
    echo "No path converter utility found." 1>&2
    exit 1
  fi
}
# Build two parallel arrays from the output of `mount`: every line of the form
# "X: on /cygdrive/x ...", "X: on /mnt/x ..." or "X: on /x ..." (optionally
# with a backslash after the drive letter) contributes one entry to each.
dls=() # drive letters, e.g. C:
mps=() # mount points, lower-cased, e.g. /cygdrive/c
# The loop reads from a process substitution so that the array appends happen
# in the current shell rather than in a pipeline stage.
while read -r drive mount_pt; do
  dls+=("$drive")
  mps+=("${mount_pt:l}")
done < <(mount | sed -n 's/^\(\w:\)\\\?\son\s\(\/cygdrive\/\w\|\/mnt\/\w\|\/\w\).*/\1 \2/p')
# Print the drive letter whose mount point is a prefix of the given path.
# Globals:   mps (read) - lower-cased mount points
#            dls (read) - drive letters, same order as mps
# Arguments: $1 - absolute path; it is lower-cased before comparing
# Outputs:   the drive letter on stdout, or "Unknown drive (...)" on stderr
# Returns:   0 when a mount point matched, 1 otherwise
mntpt() {
  local wanted idx
  wanted=${1:l}
  # Counted loop instead of {1..N}: a brace range would run backwards over
  # "1 0" when the mount table is empty and match an empty prefix.
  for (( idx = 1; idx <= ${#mps[@]}; idx++ )); do
    if [[ $wanted == "${mps[idx]}"/* ]]; then
      echo "${dls[idx]}"
      return 0
    fi
  done
  echo "Unknown drive ($1)" 1>&2
  return 1
}
# Run-wide state for the main loop.
# fsutil is the exit status of the lookup: 0 means fsutil.exe is available.
command -v fsutil.exe >/dev/null 2>&1
fsutil=$?
# ANSI escape sequences; $'\e' supplies the ESC byte that must precede "[".
err=$'\e[91;1m' # bright red, bold: error messages
total=0         # number of files successfully linked
copycount=0     # number of duplicates found
before=$(dsize) # free space before any linking, for the final report
rc=$'\e[0m'     # reset colours
# Main dedupe loop.
#
# Every pass walks all command-line arguments. The first file seen with a
# given digest becomes the "base" for that digest; any later file with the
# same digest, the same size, the same drive and a different inode is replaced
# by a hardlink (the file with the later creation time is the one replaced).
# From pass 3 on, when fsutil.exe is available, every other name that is
# hardlinked to the file about to be replaced is re-linked as well, so that
# separate hardlink groups of identical content are merged.
# A pass that links nothing ends the loop early.
#
# Colour sequences are defined here with a real ESC byte so that coloured
# output works regardless of how err/rc were set earlier in the file.
err=$'\e[91;1m'
rc=$'\e[0m'
c_pass=$'\e[92;1m'
c_hash=$'\e[95;1m'
c_name=$'\e[97m'
c_arrow=$'\e[93;1m'
c_path=$'\e[96;1m'
cr=$'\r'
for (( pass = 1; pass <= passes; pass++ )); do
  # some stuff doesn't get hardlinked for some reason but running a second time works (old comment)
  batched=0
  for ff in "$@"; do
    # WHY DOES PATH TOOL CHANGE DRIVE LETTER SOMETIMES?!?!?!?!!!
    f="$("$wpath" -u "$ff")"
    if [ ! -e "$f" ] || [ -d "$f" ]; then
      [ "$pass" -eq 1 ] && printf '%s\n' "${err}$f is not a file.${rc}" 1>&2
      continue
    fi
    # prof=$($profile)
    # profileend $prof
    f=$(realpath "$f")
    digest=$("${hash}sum" "${hash_params[@]}" "$f" | cut -d ' ' -f 1)
    # digest2=$(${hash2}sum -b "${f}" | cut -d ' ' -f 1) # fallback if collision just because
    # An empty digest means hashing failed; recording it would misalign
    # hashset and baseset, so the file is skipped instead.
    if [ -z "$digest" ]; then
      [ "$pass" -eq 1 ] && printf '%s\n' "${err}$f could not be hashed.${rc}" 1>&2
      continue
    fi
    # zsh arrays are 1-based; (Ie) yields the index of an exact match, 0 if none.
    check=${hashset[(Ie)$digest]:-0}
    # check2=${hashset2[(Ie)$digest2]:-0} # -a $check -eq $check2
    if [ "$check" -eq 0 ]; then
      # First file with this digest: remember it as the base and move on.
      hashset+=("$digest")
      baseset+=("$f")
      continue
    fi
    # FORGOT, NEED TO DISABLE DIFFERENT DRIVES CONFLICTING IN THIS PLAIN ARRAY
    base=$(realpath "${baseset[$check]}")
    if [ "$base" = "$f" ]; then
      [ "$pass" -eq 1 ] &&
        printf '%s\n' "${err}$(basename "$f") cannot be linked to itself.${rc}" 1>&2
      continue
    fi
    # Both files must resolve to the same known drive.
    d=
    bd=
    if ! { d=$(mntpt "$f") && bd=$(mntpt "$base") && [ "$d" = "$bd" ]; }; then
      [ "$pass" -eq 1 ] &&
        printf '%s\n' "${err}Drives do not match for ${d}\\...\\$(basename "$f") and ${bd}\\...\\$(basename "$base")${rc}" 1>&2
      continue
    fi
    # Same inode: the two names already share their data.
    if [ "$("${inode[@]}" "$f")" = "$("${inode[@]}" "$base")" ]; then
      [ "$pass" -eq 1 ] &&
        printf '%s\n' "${err}$(basename "$f") is already hardlinked.${rc}" 1>&2
      continue
    fi
    # Equal digests with different sizes are reported and never linked.
    size_f=$("${fsize[@]}" "$f")
    size_base=$("${fsize[@]}" "$base")
    if [ "$size_f" != "$size_base" ]; then
      printf '%s\n' "${err}[${hashset[$check]:0:15}, ${size_f}] $(basename "$f") and [${digest:0:15}, ${size_base}] $(basename "$base") have matching hashes but different size!!!!${rc}" 1>&2
      continue
    fi
    [ "$copycount" -eq 0 ] && echo Deduping...
    copycount=$((copycount + 1))
    # absurd but muh archives/history reasons:
    # the file created later is replaced by a link to the other one.
    if [ "$("${ctime[@]}" "$f")" -gt "$("${ctime[@]}" "$base")" ]; then
      # newer file
      target="$f"
      source="$base"
    else
      # older file
      target="$base"
      source="$f"
    fi
    # [ $pass -eq 4 ] && { # just give up
    #   swap="$target"
    #   target="$base"
    #   source="$swap"
    # }
    [ "$batched" -eq 0 ] && [ "$passes" -gt 1 ] && echo "${c_pass}Pass ${pass}${rc}"
    if [ "$pass" -gt 2 ] && [ "$fsutil" -eq 0 ]; then
      # Group merge: fsutil wants a Windows path; forward slashes from the
      # mixed-style path are turned into doubled backslashes.
      target="$("$wpath" -m "$target" | sed 's/\//\\\\/g')"
      merged=0
      if hl_list=$(fsutil.exe hardlink list "$target"); then
        # fsutil prints one drive-less path per line, terminated by CR LF.
        while IFS= read -r hl; do
          link_path=$("$wpath" -u "$d${hl%$cr}") &&
            [ "$("$wpath" -u "$target")" != "$("$wpath" -u "$link_path")" ] &&
            ln -f "$source" "$link_path" &&
            merged=$((merged + 1)) &&
            batched=$((batched + 1)) &&
            printf '%s\n' "${c_hash}[${digest:0:15}]${rc} ${c_name}$(basename "$source")${rc} ${c_arrow}<-${rc} ${c_path}$("$wpath" "$link_path")${rc} (group merge)"
        done <<< "$hl_list"
      fi
      [ "$merged" -gt 0 ] ||
        printf '%s\n' "${err}Cannot find hardlinks for $target.${rc}" 1>&2
    fi
    if ln -f "$source" "$target"; then
      #${digest2:0:7}
      printf '%s\n' "${c_hash}[${digest:0:15}]${rc} ${c_name}$(basename "$base")${rc} ${c_arrow}<-${rc} ${c_path}$("$wpath" "$f")${rc}"
      total=$((total + 1))
      batched=$((batched + 1))
    fi
  done
  [ "$batched" -eq 0 ] && break
done
# Final report: free space before and after, then the counters collected by
# the main loop (distinct digests, duplicates found, files linked).
after=$(dsize)
echo Free space:
echo "Before: $before"
echo "After : $after"
uniqcount=${#hashset[@]}
echo "Found $uniqcount unique files, $copycount duplicates, $total total"
#read -rsn
exit 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example logs:
![Deduping my xampp folder, finds multiple copies in a row](https://private-user-images.githubusercontent.com/20864393/338148543-814fd7dd-2c74-40dd-adae-1d57365e9407.png?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3MjE3NzczNDUsIm5iZiI6MTcyMTc3NzA0NSwicGF0aCI6Ii8yMDg2NDM5My8zMzgxNDg1NDMtODE0ZmQ3ZGQtMmM3NC00MGRkLWFkYWUtMWQ1NzM2NWU5NDA3LnBuZz9YLUFtei1BbGdvcml0aG09QVdTNC1ITUFDLVNIQTI1NiZYLUFtei1DcmVkZW50aWFsPUFLSUFWQ09EWUxTQTUzUFFLNFpBJTJGMjAyNDA3MjMlMkZ1cy1lYXN0LTElMkZzMyUyRmF3czRfcmVxdWVzdCZYLUFtei1EYXRlPTIwMjQwNzIzVDIzMjQwNVomWC1BbXotRXhwaXJlcz0zMDAmWC1BbXotU2lnbmF0dXJlPTFmZTk0MzJlOGVlYWNiNDA3ZWUyMTE1NWY0YWFkNDc5MjI3OTg1YTU4NTZmM2JjNzJjMjRhNzU5YjBkZjhiNWYmWC1BbXotU2lnbmVkSGVhZGVycz1ob3N0JmFjdG9yX2lkPTAma2V5X2lkPTAmcmVwb19pZD0wIn0.FIXkM1V_tX8vSN49H2w8fXlL3gXk_Ci7Oq2XLkGDrk8)
![Encountering already deduped files](https://private-user-images.githubusercontent.com/20864393/338149304-497dee1b-b179-4bc8-a3b5-ca491217bbd2.png?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3MjE3NzczNDUsIm5iZiI6MTcyMTc3NzA0NSwicGF0aCI6Ii8yMDg2NDM5My8zMzgxNDkzMDQtNDk3ZGVlMWItYjE3OS00YmM4LWEzYjUtY2E0OTEyMTdiYmQyLnBuZz9YLUFtei1BbGdvcml0aG09QVdTNC1ITUFDLVNIQTI1NiZYLUFtei1DcmVkZW50aWFsPUFLSUFWQ09EWUxTQTUzUFFLNFpBJTJGMjAyNDA3MjMlMkZ1cy1lYXN0LTElMkZzMyUyRmF3czRfcmVxdWVzdCZYLUFtei1EYXRlPTIwMjQwNzIzVDIzMjQwNVomWC1BbXotRXhwaXJlcz0zMDAmWC1BbXotU2lnbmF0dXJlPTkyMDNjN2ZlYTIxZDQ4YTA2MWNjMTU2MzRhN2JiMGY4YTU1MjVlNzQ5NDJiMzMwYjIxNzJjNGEyNDgwMWI4NTQmWC1BbXotU2lnbmVkSGVhZGVycz1ob3N0JmFjdG9yX2lkPTAma2V5X2lkPTAmcmVwb19pZD0wIn0.fHBarLCtwACl3PhMPlSBHozDE93JX_AYRIsTSxaG96w)
![Three passes](https://private-user-images.githubusercontent.com/20864393/338160647-19b63f99-c3ee-4af3-8390-327056255a33.png?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3MjE3NzczNDUsIm5iZiI6MTcyMTc3NzA0NSwicGF0aCI6Ii8yMDg2NDM5My8zMzgxNjA2NDctMTliNjNmOTktYzNlZS00YWYzLTgzOTAtMzI3MDU2MjU1YTMzLnBuZz9YLUFtei1BbGdvcml0aG09QVdTNC1ITUFDLVNIQTI1NiZYLUFtei1DcmVkZW50aWFsPUFLSUFWQ09EWUxTQTUzUFFLNFpBJTJGMjAyNDA3MjMlMkZ1cy1lYXN0LTElMkZzMyUyRmF3czRfcmVxdWVzdCZYLUFtei1EYXRlPTIwMjQwNzIzVDIzMjQwNVomWC1BbXotRXhwaXJlcz0zMDAmWC1BbXotU2lnbmF0dXJlPTE5MjAzZGI2MWY0MTdlZmQ4ZGEwOGQ3YzQ2NmNhY2JmYTM5ZDU0MzBjYmM5ZThlYjcyNDVlNDQ4OGQ2NjhmZDQmWC1BbXotU2lnbmVkSGVhZGVycz1ob3N0JmFjdG9yX2lkPTAma2V5X2lkPTAmcmVwb19pZD0wIn0.ZSVszF-j9kzMPJfr0-Gt3fLq_AKQu3UYfq1loRn82wA)