Skip to content

Instantly share code, notes, and snippets.

@jhead
Last active August 29, 2015 14:16
Show Gist options
  • Save jhead/5c90a7fc881831e8f620 to your computer and use it in GitHub Desktop.
#!/bin/bash
#
# dedupe.sh — estimate the storage that deduplicating identical files
# under a set of server<N> directories would reclaim.
#
# Usage: ./dedupe.sh [path]
#
# Scans [path]/server<N> directories for regular files smaller than
# $MAX_SIZE (skipping extensions matched by $FILTER), hashes them with
# sha1sum, tallies duplicate hashes, and reports potential savings.

set -u

if [ $# -lt 1 ]; then
  echo "Usage: ./dedupe.sh [path]" >&2
  exit 1
fi

BASE=$1
MAX_SIZE="25M"              # only consider files smaller than this
FILTER='\.(mca|log|gz)$'    # extensions to exclude from hashing

# All later paths are relative to $BASE, so a failed cd must be fatal.
cd "$BASE" || { echo "Cannot cd to $BASE" >&2; exit 1; }

# Unpredictable temp names, and trap-based cleanup so the temp files are
# removed on every exit path, not just the success path.
hash_output=$(mktemp /tmp/dedupe.XXXXXX) || exit 1
dupe_output=$(mktemp /tmp/dedupe-dupes.XXXXXX) || exit 1
trap 'rm -f "$hash_output" "$dupe_output"' EXIT

## Get list of files
echo -n "Finding files... "
# Glob the server<N> directories instead of parsing ls output.
server_dirs=()
for d in server*/; do
  [[ "$d" =~ ^server[0-9]+/$ ]] && server_dirs+=("${d%/}")
done
if [ ${#server_dirs[@]} -eq 0 ]; then
  # The original fell through to `find` with no path argument here,
  # accidentally scanning all of $BASE; fail explicitly instead.
  echo "no server<N> directories found in $BASE" >&2
  exit 1
fi
# Read the file list into an array so paths containing spaces survive.
# Paths containing backslashes are dropped (sha1sum escapes them,
# which would corrupt the hash-list parsing below).
mapfile -t files < <(find "${server_dirs[@]}" -type f -size -"$MAX_SIZE" \
  | grep -v '\\' | grep -Ev "$FILTER")
file_count=${#files[@]}
echo "Done! Found $file_count."

## Compute hashes
echo "Computing hashes... "
hash_count=0
for file in "${files[@]}"; do
  [ -e "$file" ] || continue
  sha1sum "$file" 2>/dev/null >> "$hash_output"
  hash_count=$((hash_count + 1))
  ## Display a 30-column progress bar every 10 files
  if [ $((hash_count % 10)) -eq 0 ] && [ "$file_count" -gt 0 ]; then
    progress=$((hash_count * 30 / file_count))
    bar=$(printf '%*s' "$progress" '' | tr ' ' '=')
    printf '\r[%-30s] %d / %d' "$bar" "$hash_count" "$file_count"
  fi
done
echo

## Find duplicates: count occurrences of each hash, keep count > 1.
echo -n "Finding duplicates... "
awk '{ print $1 }' "$hash_output" | sort | uniq -c | sort -n \
  | awk '$1 > 1 { print $1" "$2 }' > "$dupe_output"
dupe_count=$(wc -l < "$dupe_output")
echo "Done! Found $dupe_count."

## Calculate storage savings
echo -n "Performing calculations... "
total_sum=0   # bytes needed if each duplicate set kept one copy
dupe_sum=0    # bytes currently used by all copies
while read -r count checksum; do
  # First hash-list entry for this checksum. sha1sum output is
  # "<hash>  <path>", so strip through the two-space gap rather than
  # taking awk's $2, which truncates paths containing spaces.
  entry=$(grep -m1 "^$checksum" "$hash_output")
  file=${entry#*  }
  # Paths in the hash list are relative to $BASE, which we are already
  # inside — do NOT re-prefix $BASE (that broke for relative paths).
  [ -e "$file" ] || continue
  size=$(wc -c < "$file")
  total_sum=$((total_sum + size))
  dupe_sum=$((dupe_sum + count * size))
done < "$dupe_output"

# Savings = space used by the extra copies beyond the first of each set.
savings_bytes=$((dupe_sum - total_sum))
savings=$((savings_bytes / 1024 / 1024))
# du -sk reports KiB, so compare like units: the original divided an MB
# figure by a KiB figure, understating the percentage ~1024-fold.
total_dir_size=$(du -sk . | awk '{ print $1 }')
if [ "$total_dir_size" -gt 0 ]; then
  perc=$((savings_bytes / 1024 * 100 / total_dir_size))
else
  perc=0
fi
echo "Deduplicated savings: $savings MB ($perc%)"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment