@donnaken15
Last active June 10, 2024 11:21
Group duplicates of files into hardlinks, check similarity using BLAKE2/SHA256 and matching file size, primarily for Windows/Cygwin/MSYS2
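A minimal usage sketch (assuming the script is saved as dedupe.zsh and run from a Cygwin/MSYS2 shell; the paths are hypothetical):

  zsh dedupe.zsh /cygdrive/c/music/track.flac /cygdrive/c/backup/track.flac /cygdrive/c/old/track.flac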
#!/bin/zsh
[ $# -eq 1 ] && {
echo you must specify more than one file to be deduped
echo
}
[ $# -lt 2 ] && {
echo "dedupe [input files]"
echo "- replace multiple unchanging"
echo copies of the same files with
echo hardlinks to save space
echo "- as of now, it is recommended to"
echo run this script only on files
echo that exist on a single drive
exit 1
}
declare -a hash_params
#----------------------------------------------------------------------------------------
# whatever you want to configure
# each pass processes the entire list of arguments provided
passes=4
# pass 1, 2: standard, plainly hardlink exact copies of files
# pass 3, 4: check inputs that are already (or just became) hardlinks, and merge separate copies
# of the same file that each have their own group of hardlinks into a single group
# it's too stupid to explain what happened when it gets to this point
#
# if a subsequent pass doesn't process any more files, exit the script
# BLAKE2: security on par with SHA-3, with better performance
hash=b2
hash_params=(-b -l 80) # binary mode, 80-bit digest; being conservative for big files
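# i.e. with these defaults each file is hashed with, roughly: b2sum -b -l 80 <file> (per the invocation later in the script)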
#
#----------------------------------------------------------------------------------------
#hash2=sha256 # not using right now
NUL=/dev/null
[ $passes -gt 0 ] || {
echo "Invalid config: passes = ${passes}." 1>&2
echo "Setting to 4." 1>&2
passes=4
}
which fsutil.exe 2>$NUL >$NUL || {
echo "Cannot find fsutil, setting passes to 1."
passes=1
}
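# without fsutil.exe the group-merge passes below cannot enumerate existing hardlink groups, so only one plain pass runs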
{ which ${hash}sum } 2>$NUL >$NUL || {
echo "Cannot find ${hash}sum as a hashing program." 1>&2
hash=sha256
#hash2=sha384
} # && which ${hash2}sum;
{ which ${hash}sum } 2>$NUL >$NUL || {
echo "Fallback hashing program ${hash}sum does not exist. Aborting..." 1>&2
exit 1
}
hashset=()
#hashset2=()
baseset=()
echo dedupe \($# files\)
#profile=(date +%s%N)
#profileend() { echo profile script: $(($(($($profile) - $1)) / 1000000000.0)) }
wpathfail() { echo $2; }
dsize() { df --output=avail --block-size=M --total | tail -1; } # prints total available space in MB
fsize=(stat -c%s) # attempts to reduce the number of new processes, though command substitution still spawns one
ctime=(stat -c%W) # creation time
inode=(stat -c%i) # inode number; hardlinks to the same data share one inode
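# these arrays are expanded later via command substitution, e.g. $($fsize "$f") runs 'stat -c%s' on the file and captures its size in bytes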
wsl_path() {
wslpath $1 "$(wsl2exe cygpath -w "$2")"
}
wpath=wpathfail
wpath=$(which cygpath 2>$NUL) || {
which wslpath 2>$NUL >$NUL && wpath=wsl_path || {
# wtf to do here
echo "No path converter utility found." 1>&2
exit 1
}
}
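# prefer cygpath (Cygwin/MSYS2); otherwise fall back to wslpath through the wsl_path wrapper above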
dls=() # drive letters
mps=() # mount points
(mount | sed -n 's/^\(\w:\)\\\?\son\s\(\/cygdrive\/\w\|\/mnt\/\w\|\/\w\).*/\1 \2/p') |
while read -r lt; do
l=(${(@s: :)lt}) # absurd
dls+=(${l[1]}) # C:
mps+=(${l[2]:l}) # /cygdrive/c/
done
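# illustrative: a mount line like 'C: on /cygdrive/c type ntfs (...)' yields dls+=(C:) and mps+=(/cygdrive/c)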
mntpt() {
for i in {1..${#mps}}; do
l=${1:l} && [ -z "${l%%${mps[$i]}/*}" ] && echo "${dls[$i]}" && return 0
done
echo "Unknown drive ($1)" 1>&2
return 1
}
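# illustrative: with the mount table parsed above, mntpt /cygdrive/c/some/file would print 'C:'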
which fsutil.exe 2>$NUL >$NUL
fsutil=$?
err=""
total=0
copycount=0
before=$(dsize)
rc=""
for pass in {1..$passes}; do
# some stuff doesn't get hardlinked for some reason but running a second time works (old comment)
batched=0
for ff in "$@"; do
# WHY DOES PATH TOOL CHANGE DRIVE LETTER SOMETIMES?!?!?!?!!!
f="$($wpath -u "$ff")"
[ ! -e "$f" -o -d "$f" ] && {
[ $pass -eq 1 ] && echo "${err}$f is not a file." 1>&2
continue
}
# prof=$($profile)
# profileend $prof
f=$(realpath "$f")
test=$(${hash}sum ${hash_params} "${f}" | cut -d ' ' -f 1)
# test2=$(${hash2}sum -b "${f}" | cut -d ' ' -f 1) # fallback if collision just because
# NOTE: zsh array indexes start from 1
check=${hashset[(Ie)$test]:-0}
# check2=${hashset2[(Ie)$test2]:-0} # -a $check -eq $check2
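# first time this hash is seen: record it and keep this path as the base that later duplicates are linked to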
[ $check -eq 0 ] && {
hashset+=($test)
baseset+=("$f")
continue
# baseset+=("$(realpath "$f")")
}
# TODO: prevent files on different drives from conflicting in this plain array
base=$(realpath "${baseset[$check]}")
[ "$base" = "$f" ] && {
[ $pass -eq 1 ] && \
echo "${err}$(basename $f) cannot be linked to itself." 1>&2
continue
}
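# hardlinks cannot cross filesystems, so both paths must resolve to the same drive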
{
d=$(mntpt "$f") && \
bd=$(mntpt "$base") && \
[ $d = $bd ]
} || {
[ $pass -eq 1 ] && echo "${err}Drives do not match for ${d}\\...\\$(basename "$f") and ${bd}\\...\\$(basename "$base")" 1>&2
continue
}
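# identical inode numbers mean the two paths already refer to the same data on disk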
[ $($inode "$f") = $($inode "$base") ] && {
[ $pass -eq 1 ] && \
echo "${err}$(basename "$f") is already hardlinked." 1>&2
continue
}
{
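# guard against hash collisions (the digest is only 80 bits): a real duplicate must also match in byte size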
[ $($fsize "$f") = $($fsize "$base") ] || {
echo "${err}[${hashset[$check]:0:15}, $($fsize "$f")] $(basename "$f") and [${test:0:15}, $($fsize "$base")] $(basename "$base") have matching hashes but different size!!!!" 1>&2
continue
}
[ $copycount -eq 0 ] && echo Deduping...
copycount=$(($copycount + 1))
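# note: ln -f SOURCE TARGET replaces TARGET with a hardlink to SOURCE, so the newer copy is the one chosen to be replaced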
[ $($ctime "$f") -gt $($ctime "$base") ] && { # absurd but muh archives/history reasons
# echo newer file
target="$f"
source="$base"
} || {
# echo older file
target="$base"
source="$f"
}
# [ $pass -eq 4 ] && { # just give up
# swap="$target"
# target="$base"
# source="$swap"
# }
[ $batched -eq 0 -a $passes -gt 1 ] && echo "Pass ${pass}${rc}"
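# passes 3+ use 'fsutil.exe hardlink list' to enumerate every existing hardlink of the target so an entire link group can be re-pointed at the chosen source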
[ $pass -gt 2 ] && {
[ $fsutil -eq 0 ] && {
target="$($wpath -m "$target" | sed 's/\//\\\\/g')"
fsutil.exe hardlink list "$target" >$NUL && {
ffs=0
fsutil.exe hardlink list "$target" | while read -r hl; do
test2=$($wpath -u "$d${hl[0,-2]}") &&
[ ! "$($wpath -u "$target")" = "$($wpath -u "$test2")" ] &&
ln -f $source $test2 && ffs=$(($ffs + 1)) &&
batched=$(($batched + 1)) &&
echo "[${test:0:15}]${rc} $(basename "$source")${rc} <-${rc} $($wpath "$test2")${rc} (group merge)"
done
[ $ffs -gt 0 ]
} || echo "${err}Cannot find hardlinks for $target.${rc}" 1>&2
}
}
ln -f "$source" "$target"
} && {
#${test2:0:7}
echo "[${test:0:15}]${rc} $(basename "$base")${rc} <-${rc} $($wpath "$f")${rc}"
total=$(($total + 1))
batched=$(($batched + 1))
}
done
[ $batched -eq 0 ] && break
done
after=$(dsize)
echo Free space:
echo Before: $before
echo After : $after
uniqcount=${#hashset[@]}
echo Found $uniqcount unique files, $copycount duplicates, $total hardlinked
#read -rsn
exit 0
@donnaken15 (Author) commented:
Example logs:
- Deduping my xampp folder, finds multiple copies in a row
- Encountering already deduped files
- Three passes
