Skip to content

Instantly share code, notes, and snippets.

@netvor
Created March 26, 2010 16:21
Show Gist options
  • Save netvor/345065 to your computer and use it in GitHub Desktop.
Save netvor/345065 to your computer and use it in GitHub Desktop.
#!/bin/bash
#Setup: configuration, counters, working directory, the file descriptors for the
#old and new hash lists, and the temp file used for MP3 processing.

#Leave empty if not verbose
VERBOSE=
#Don't change
SECONDENTRYSUFFIX=__without_tags__
#Where to store the data files. A subdir /detailed/ and /duplicates/ must exist
OUTDIR=/dir/to/store/output/in/
#Counters
NUMFILES=0
NUMHASHES=0
DATESTART=$(date '+%s')
#Change these for your situation
#Stop if the start directory is unreachable, otherwise find would scan whatever
#directory the script happened to be started from
cd /dir/to/start/find/in/ || {
  echo "Cannot change to the start directory" >&2
  exit 1
}
#Kept as one string on purpose: it is expanded unquoted so the shell globs it
FILES="* .[^.]*"
#Move old .hashes.txt out of the way and open it for reading on file descriptor 8
if [[ -e "$OUTDIR/duplicates/.hashes.txt" ]]; then
  mv -- "$OUTDIR/duplicates/.hashes.txt" "$OUTDIR/duplicates/.hashes.old.txt" || exit 1
elif [[ ! -e "$OUTDIR/duplicates/.hashes.old.txt" ]]; then
  #First run: there is no previous list at all, so start from an empty one
  : > "$OUTDIR/duplicates/.hashes.old.txt" || exit 1
fi
exec 8<"$OUTDIR/duplicates/.hashes.old.txt" || exit 1
#Open new .hashes.txt on file descriptor 9
exec 9>"$OUTDIR/duplicates/.hashes.txt" || exit 1
#Open a temp file for the MP3's
TMP=$(mktemp) || exit 1
#Remove the temp file when the script exits, even if it stops early
trap 'rm -f -- "$TMP"' EXIT
#Decode a name as printed by stat's %N: drop the first and last character (the
#surrounding quotes) and expand the backslash escapes. Writes the result to
#stdout without a trailing newline.
# $1 - quoted and escaped name
#NOTE(review): this assumes %N wraps the name in one quote character on each side
#and uses backslash escapes; newer coreutils may quote shell-style -- confirm.
unquote_name() {
  local inner=${1:1:$((${#1}-2))}
  #The text is still used as the printf format so escapes are expanded as before,
  #but every % is doubled so a percent sign in a filename is not read as a
  #conversion, and -- keeps a name starting with a dash from being taken as an option
  printf -- "${inner//%/%%}"
}

#Read the next record of the old hash list (fd8) into oDI oM oS oH oN.
#Returns non-zero at end of file and leaves the current record untouched.
advance_old() {
  local di m s h n
  read -r -u8 di m s h n || return 1
  oDI=$di
  oM=$m
  oS=$s
  oH=$h
  oN=$n
  [[ $VERBOSE ]] && printf 'Advancing to %s...' "$oN"
  return 0
}

#Go through the records. The input to this loop is given at the end of the loop, in a subshell.
while read -r nDI nM nS nN
do
  [[ $VERBOSE ]] && printf "Processing file %s..." "$nN"
  #advance the record from fd8 until it is no longer behind the new record
  while [[ $oDI < $nDI ]] && advance_old
  do
    :
  done
  #Check if old hash is valid (quoted so the right-hand side is compared literally)
  if [[ "$nDI $nM $nS" == "$oDI $oM $oS" ]]
  then
    [[ $VERBOSE ]] && echo "Skipping, equal to $oN."
    #print OLD hash with the NEW name to fd9
    printf '%s\t%s\t%12s\t%s\t%s\n' "$nDI" "$nM" "$nS" "$oH" "$nN" >&9
    ((NUMFILES++))
    #there may be a "second entry" (specifically, if the file is .mp3), so advance the old file by one line and check for it
    if advance_old
    then
      #Check if this line has the same (DI)(M) as the previous. Not (S), since that is recomputed.
      #The name must also carry the second-entry suffix: a hard link to the same
      #file has the same (DI)(M) but is an ordinary record, not a second entry
      if [[ "$nDI $nM" == "$oDI $oM" && $oN == *"$SECONDENTRYSUFFIX" ]]
      then
        [[ $VERBOSE ]] && echo "Valid second entry found!"
        #print OLD hash with the NEW UPDATED name and OLD size to fd9
        printf '%s\t%s\t%12s\t%s\t%s%s\n' "$nDI" "$nM" "$oS" "$oH" "$nN" "$SECONDENTRYSUFFIX" >&9
        ((NUMFILES++))
      else
        [[ $VERBOSE ]] && echo " ."
      fi
    fi
  else
    [[ $VERBOSE ]] && echo "Computing."
    #Compute hash of file
    # Since the file is quoted and escaped, decode it to get the "real" filename, then via xargs to cat and sha1sum
    # The benefit of piping through cat is that there is a fallback: if the file can't be read it will get a default checksum=sha1sum("")
    nH=$(unquote_name "$nN" | xargs -0 cat | sha1sum | head -c40)
    #print new entry to fd9
    printf '%s\t%s\t%12s\t%s\t%s\n' "$nDI" "$nM" "$nS" "$nH" "$nN" >&9
    ((NUMHASHES++))
    ((NUMFILES++))
    #Check if file is MP3
    if [[ $nN =~ \.[Mm][Pp]3\'$ ]]
    then
      [[ $VERBOSE ]] && printf " --> Found MP3 file..."
      #make a temp copy of the file
      unquote_name "$nN" | xargs -0rI '////' cp "////" "$TMP"
      #Strip tags, suppressing output if not verbose.
      #An explicit if/else so id3v2 runs exactly once even when it fails
      if [[ $VERBOSE ]]
      then
        id3v2 -D "$TMP"
      else
        id3v2 -D "$TMP" >/dev/null
      fi
      #get the new checksum
      mH=$(sha1sum "$TMP" | head -c40)
      #get the new size
      mS=$(stat --printf='%s' "$TMP")
      #print new entry to fd9
      printf '%s\t%s\t%12s\t%s\t%s%s\n' "$nDI" "$nM" "$mS" "$mH" "$nN" "$SECONDENTRYSUFFIX" >&9
      ((NUMHASHES++))
      ((NUMFILES++))
    fi
  fi
#Use BASH-only syntax to now call the INPUT to the while-read loop as a subshell
done < <(
  #Execute find, print out the "detailed report", then select regular files
  #($FILES is intentionally unquoted so that its glob patterns are expanded)
  find -H $FILES -fls "$OUTDIR/detailed/find-ls.root.txt" -type f -print0 |
  #Call stat on the files, fetching the following data:
  # Device number and inode number (DI)
  # Modification time, as seconds since epoch (M)
  # Size in bytes (S)
  # Filename, quoted and escaped (N)
  #-r: do not run stat at all when find printed no files
  xargs -0 -r stat --printf='%010d.%010i\t%012Y\t%s\t%N\n' |
  #Sort the output, effectively by (DI)
  sort
#End the subshell
)
#Cleanup: close fd9 (and fd8, the old hash list) and remove the tempfile
#A redirection on its own line has no command to apply to and leaves the shell's
#descriptors open; exec makes the redirections take effect in the current shell
exec 9>&- 8<&-
#-f: no complaint if the temp file is already gone
rm -f -- "$TMP"
#Phase 2: sort the final .hashes.txt by size and hash and print out duplicates
#Filter: reads .hashes.txt records on stdin, writes the groups of duplicates
#(separated by an empty line) to stdout
list_duplicates() {
  #Throw away (DI)(M), keep (S)(H)(N)
  cut -c36- |
  #Sort the files, biggest first
  sort -rn |
  #Duplicates are found when two lines are equal in the first 54 characters (12 for (S), 40 for (H) and the tab after each)
  uniq -w54 --all-repeated=separate
}
#The output redirection comes first so that duplicates.txt is still rewritten
#when .hashes.txt cannot be opened
list_duplicates > "$OUTDIR/duplicates/duplicates.txt" < "$OUTDIR/duplicates/.hashes.txt"
#Phase 3: write out statistics
#Format one statistics line on stdout.
# $1 - timestamp text
# $2 - number of files processed
# $3 - number of hashes calculated
# $4 - elapsed time in seconds
format_statistics() {
  printf '%s ::%11d files processed,%11d hashes calculated,%6d seconds\n' \
    "$1" "$2" "$3" "$4"
}
DATESTOP=$(date '+%s')
DATENOW=$(date)
#The redirect target is quoted: an unquoted path containing whitespace makes
#bash reject the redirection as ambiguous
format_statistics "$DATENOW" "$NUMFILES" "$NUMHASHES" "$((DATESTOP - DATESTART))" \
  >> "$OUTDIR/duplicates/.statistics.txt"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment