Created
July 23, 2020 13:01
-
-
Save agrajag9/b9c56a7df4f732d5d2121da26b34d52a to your computer and use it in GitHub Desktop.
Find duplicate files by MD5
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
#
# Find duplicate files by MD5.
#
# Hashes every regular file directly inside the current directory (no
# recursion), groups the files by MD5 digest and, for each digest shared by
# more than one file, reports the oldest and the newest file by modification
# time.
#
# Intermediate results are kept as CSV files in a fresh temporary directory
# whose path is printed at start-up:
#   md5_mtime_fullpath.csv  one "md5,mtime,fullpath" row per hashed file
#   count_md5.csv           one "count,md5" row per duplicated digest
#
# Limitation: file names containing newlines are not supported, because the
# file list is processed line by line.

set -u

# file_md5 FILE -- print the MD5 hex digest of FILE.
# Prefers the BSD/macOS `md5 -q`; falls back to coreutils `md5sum`.
if command -v md5 >/dev/null 2>&1; then
  file_md5() { md5 -q "$1"; }
else
  # Hash stdin so md5sum never prints (or backslash-escapes) the file name.
  file_md5() { md5sum < "$1" | cut -d' ' -f1; }
fi

# file_mtime FILE -- print FILE's modification time in seconds since the epoch.
# BSD stat has no -c option, so a successful probe means GNU coreutils stat.
if stat -c %Y / >/dev/null 2>&1; then
  file_mtime() { stat -c %Y "$1"; }
else
  file_mtime() { stat -f %m "$1"; }
fi

# hash_files DIR -- print one "md5,mtime,fullpath" row for every regular file
# directly inside DIR. Files that cannot be hashed or stat'ed are skipped with
# a warning on stderr, so they never show up as bogus empty-digest duplicates.
hash_files() {
  find "$1" -mindepth 1 -maxdepth 1 -type f |
    while IFS= read -r hf_path; do
      hf_md5=$(file_md5 "$hf_path" 2>/dev/null)
      hf_mtime=$(file_mtime "$hf_path" 2>/dev/null)
      if [ -z "$hf_md5" ] || [ -z "$hf_mtime" ]; then
        printf 'warning: skipping unreadable file: %s\n' "$hf_path" >&2
        continue
      fi
      printf '%s,%s,%s\n' "$hf_md5" "$hf_mtime" "$hf_path"
    done
}

# duplicate_hashes CSV -- print "count,md5" for every digest that occurs on
# more than one row of the "md5,mtime,fullpath" file CSV, sorted by digest.
duplicate_hashes() {
  cut -d, -f1 "$1" | sort | uniq -c | awk '$1 > 1 { print $1 "," $2 }'
}

# report_duplicates HASHES DUPES -- for each digest listed in the "count,md5"
# file DUPES, print the oldest and the newest matching path from HASHES.
report_duplicates() {
  cut -d, -f2 "$2" |
    while IFS= read -r rd_md5; do
      # Anchor on the first field so a digest that merely appears inside a
      # path cannot match. Rows become "mtime,fullpath", sorted ascending by
      # mtime: the first row is the oldest file and the last row the newest.
      rd_rows=$(grep "^${rd_md5}," "$1" | cut -d, -f2- | sort -t, -k1,1n)
      # The path is everything after the first comma, so commas in file
      # names survive.
      rd_oldest=$(printf '%s\n' "$rd_rows" | head -n 1 | cut -d, -f2-)
      rd_newest=$(printf '%s\n' "$rd_rows" | tail -n 1 | cut -d, -f2-)
      printf 'Oldest with MD5 %s is %s\n' "$rd_md5" "$rd_oldest"
      printf 'Newest with MD5 %s is %s\n' "$rd_md5" "$rd_newest"
    done
}

# main -- scan the current directory and print the duplicate report.
# Returns non-zero if the temporary directory cannot be created.
main() {
  # mktemp gives an unpredictable, freshly created directory instead of the
  # guessable /tmp/dupes.<pid>. It is left in place so the CSVs can be
  # inspected afterwards.
  work_dir=$(mktemp -d /tmp/dupes.XXXXXX) || {
    printf 'error: cannot create temporary directory\n' >&2
    return 1
  }
  hashes_csv=$work_dir/md5_mtime_fullpath.csv
  dupes_csv=$work_dir/count_md5.csv

  printf 'Storing tmp csvs in %s/\n' "$work_dir"

  printf 'Hashing files... '
  hash_files "$(pwd)" > "$hashes_csv"
  printf 'Done!\n'

  printf 'Identify duplicate hashes... '
  duplicate_hashes "$hashes_csv" > "$dupes_csv"
  printf 'Done!\n'

  dupe_count=$(awk 'END { print NR }' "$dupes_csv")
  printf '%s duplicate MD5s found!\n' "$dupe_count"

  report_duplicates "$hashes_csv" "$dupes_csv"
}

main
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment