#!/bin/bash
# TODO: skip tiny files (so small they couldn't be photos)
# TODO: make sure symlinks and other file system oddities are handled
# TODO: look at parallelization for a perf boost
#
# Constants
#
CHAR_COUNT=12
BLOCK_COUNT=6
SKIP_SIZE=3 # Every new block is sampled by skipping this many blocks to the next position
COMPUTE_FULL_HASH=false # Set `true` to trade processing speed for fewer false positives
DEFAULT_PATTERN=".*\.(jpg|png|gif|mov|avi|mkv|jpeg)$"
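# With these defaults, the quick hash reads one 512-byte block at block
# offsets 3, 6, 9, 12, 15, and 18: about 3 KB sampled from the first
# ~9.5 KB of each file.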
#
# Parameters
#
if [ -z "$1" ]
then
  PATTERN="$DEFAULT_PATTERN"
else
  PATTERN=$1
fi
#
# Introduction
#
echo "This script will get the hash of $BLOCK_COUNT 512-byte blocks for each file it processes"
echo "The first $CHAR_COUNT chars of this hash are used to rename the file"
echo ""
#
# Get list and count of files. Confirm with the user before proceeding.
#
files=$(find . -maxdepth 1 -type f | grep -Ei "$PATTERN")
count=$(printf '%s' "$files" | grep -c '^') # Counts matching lines; 0 when nothing matched (`wc -l` would report 1)
echo "Found $count files that match the pattern $PATTERN"
read -rp "Rename all? <Y/n> " prompt
if [[ $prompt == "n" || $prompt == "N" || $prompt == "NO" || $prompt == "no" ]]
then
  exit 0
fi
echo ""
#
# For every file, compute a hash and rename
#
IFS=$'\n' # make newlines the only iteration separator: http://askubuntu.com/questions/344407/how-to-read-complete-line-in-for-loop-with-spaces
for f in $files
do
  # Hash the full file
  if [ "$COMPUTE_FULL_HASH" = true ] ; then
    hash=$(md5 -q "$f")
  # Hash an assortment of bytes
  else
    # Naive: just grab a contiguous chunk of N blocks. But this could be all empty space or all metadata. Too many false positives.
    # bytes=$(dd if="$f" bs=512 count=$BLOCK_COUNT skip=$SKIP_START_BLOCKS 2> /dev/null)
    # Skip along the file, sampling bytes as we go
    bytes=""
    for (( i=1; i<=BLOCK_COUNT; i++ )); do
      bytes+=$(dd if="$f" bs=512 count=1 skip=$(( i * SKIP_SIZE )) 2> /dev/null)
    done
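    # Caveat: command substitution strips NUL bytes and trailing whitespace, so
    # $bytes is a lossy sample of the file rather than its exact on-disk bytes.
    # Identical files still always produce identical samples, but files shorter
    # than SKIP_SIZE*512 bytes all produce an empty sample and would collide
    # with each other (see the tiny-files TODO above).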
    hash=$(md5 <<< "$bytes")
  fi
  shortHash=$(echo "$hash" | cut -c1-$CHAR_COUNT)
  ext="${f##*.}"
  # If you've already run this script on some of these files, we shouldn't duplicate them.
  if [[ $f == *"$shortHash"* ]]
  then
    echo "Skipping file. Name already contains the hash of its contents: $f"
    continue
  fi
  newName="$shortHash.$ext"
  # If a file with this name already exists, increment a number until it does not.
  # This is a likely duplicate, and the whole reason for running this script.
  i=0
  while [ -f "$newName" ]; do
    i=$(( i + 1 ))
    newName="$shortHash ($i).$ext"
  done
  echo "$newName <- $f"
  mv "$f" "$newName"
done
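For reference, a run looks something like this (hypothetical filenames and hashes):

$ ./renameToHash.sh
This script will get the hash of 6 512-byte blocks for each file it processes
The first 12 chars of this hash are used to rename the file

Found 2 files that match the pattern .*\.(jpg|png|gif|mov|avi|mkv|jpeg)$
Rename all? <Y/n> y

1be6aa7a2e2a.jpg <- ./IMG_0001.jpg
1be6aa7a2e2a (1).jpg <- ./IMG_0001 copy.jpg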
You might be able to make use of GNU parallel, if it's already available on the system, to speed up mass renames or calculating large numbers of hashes. Something that finds every file name and hashes them all in parallel, then iterates through the results and renames the files, may yield much better speeds on server systems.
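A rough sketch of that idea, assuming GNU parallel and the macOS md5 tool (md5sum on Linux), and skipping the "(n)" collision numbering the script does:

find . -maxdepth 1 -type f -print0 \
  | parallel -0 md5 -r \
  | while read -r hash file; do
      # md5 -r prints "hash filename"; build the new name from the
      # first 12 hash chars plus the original extension
      newName="${hash:0:12}.${file##*.}"
      [ -e "$newName" ] || mv "$file" "$newName"
    done

The hashing fans out across all cores; only the cheap mv calls stay serial.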
It would also be useful to scan text files and rewrite references to the renamed files, e.g. in CSS files, HTML files, etc.
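That could bolt onto the end of the rename loop. A minimal sketch, assuming the $f and $newName variables from the script and BSD sed (GNU sed drops the ''), and glossing over sed-metacharacter escaping in filenames:

old=${f#./} # strip the leading ./ that find adds
grep -rlF --include='*.html' --include='*.css' -- "$old" . \
  | while IFS= read -r doc; do
      sed -i '' "s|$old|$newName|g" "$doc"
    done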
The first use-case (duplicate removal) can be covered by a shell one-liner like this:
#!/bin/sh -eu
find "${1:-.}" -type f ! -empty -print0 | xargs -0 md5 -r | \
awk '$1 in a{sub("^.{33}","");printf "%s\0",$0}a[$1]+=1{}' | \
xargs -0 rm -v --
@artyom, dupes can be removed as a one-liner, a good tip. I wanted something faster and with a confirmation step, though. Some notes on doing it with a single command as you describe:
- That does a full md5 hash, which is slow for a lot of photos, or large files like videos. Unbeatable accuracy though.
- With false positives being a possibility given the quick hashing heuristic, I'd rather do a manual verification before I delete. I use this for photos and they have a lot of sentimental value for me.
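The two can be combined: use the short hash only to surface candidate duplicates, then confirm with a full hash before deleting anything by hand. A hypothetical sketch, run after this script has renamed everything (so suspected duplicates share a name prefix):

for f in *\ \(*\).*; do # files this script renamed to "hash (n).ext"
  [ -e "$f" ] || continue
  original="${f%% (*}.${f##*.}" # the "hash.ext" sibling it collided with
  if [ -e "$original" ] && [ "$(md5 -q "$f")" = "$(md5 -q "$original")" ]; then
    echo "confirmed duplicate: $f == $original"
  fi
done

It only prints; the actual rm stays a manual step, which is the point.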
@gravypod better performance sounds awesome. This took about 30 minutes to process 12,000
photos and videos directly on an SD card, so it could use a boost. Want to elaborate a bit?
Enhancement ideas are open to suggestions; see the TODOs at the top of the script.

Pass an extended regex as the first argument to choose which files to rename, e.g.:

renameToHash.sh asfd.jpg

With no argument, the default pattern (common photo and video extensions) is used.