Skip to content

Instantly share code, notes, and snippets.

@wbob
Created January 25, 2019 10:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wbob/394acb854d2701c6391911a8dd85209a to your computer and use it in GitHub Desktop.
Save wbob/394acb854d2701c6391911a8dd85209a to your computer and use it in GitHub Desktop.
shopware-media-cleanup

Requirements

Tested with bash 4.3 on ubuntu trusty/xenial.

Description

Cleanup Shopware media/image/ directory of misplaced files. By default only folders exceeding the 3 directory levels are scanned.

  • copy the script to your home folder
  • navigate to a shoproot and into media/image/
  • make sure config.php exists, or is a symlink to the config_$env.php that contains the valid database credentials
  • run with ~/media-cleanup.sh | tee ~/media-cleanup.log
  • check the log or output, then use --real-run for execution

optional:

  • for big result logs cleanup-stats.sh ~/media-cleanup.log gives an overview
  • if you want to scan the whole of media/image/ use the basedir or all policies. Review the log and script beforehand, as thumbnail heuristics are error prone and other filetypes than JPG, PNG and GIF are not accounted for
#!/bin/bash
#
# usage: cleanup-stats.sh media-cleanup.log
#
# Print an overview of a media-cleanup.sh log: the number of lines, and per
# media type ("image: " / "thumb: " lines) how many entries are correct, how
# many are misplaced and would be moved, and how many are misplaced duplicates
# of a file that already exists at the proper target.

# Summarise one log file on stdout.
# Arguments: $1 - path to the log written by media-cleanup.sh
# Returns:   0 on success, 1 if the log is not a readable file,
#            2 if no log was given
cleanup_stats() {
  local log=${1-}

  if [[ -z "$log" ]]; then
    echo "usage: cleanup-stats.sh media-cleanup.log" >&2
    return 2
  fi
  if [[ ! -f "$log" || ! -r "$log" ]]; then
    echo "cleanup-stats.sh: cannot read '$log'" >&2
    return 1
  fi

  # One pass over the log instead of one grep pipeline per counter. The log is
  # fed through stdin so an unusual file name is never interpreted by awk.
  awk '
    # count one line for the given media type ("images" or "thumbs")
    function tally(kind) {
      seen[kind]++
      if ($0 ~ / correct/) {
        correct[kind]++
      }
      if ($0 ~ /mismatch/) {
        # "proper" only appears when the file already exists at its target
        if ($0 ~ /proper/) {
          duplicate[kind]++
        } else {
          moved[kind]++
        }
      }
    }
    /image: / { tally("images") }
    /thumb: / { tally("thumbs") }
    END {
      printf "total: %d\n", NR
      printf "count_images: %d\n", seen["images"]
      printf "count_thumbs: %d\n", seen["thumbs"]
      printf "count_images_correct: %d\n", correct["images"]
      printf "count_thumbs_correct: %d\n", correct["thumbs"]
      printf "count_images_misplaced_moved: %d\n", moved["images"]
      printf "count_images_misplaced_duplicate: %d\n", duplicate["images"]
      printf "count_thumbs_misplaced_moved: %d\n", moved["thumbs"]
      printf "count_thumbs_misplaced_duplicate: %d\n", duplicate["thumbs"]
    }
  ' < "$log"
}

cleanup_stats "$@"
#!/bin/bash
#
# Cleanup shopware media/image/ directory for misplaced files. Either obvious
# violators (more than 3 levels deep), the image base directory or everything.
#
# Under certain circumstances[1] from 5.1 (SW-12620) until 5.2.8 the media
# fallback would move ("migrate") directories and files to unreachable paths.
# This was fixed in SW-16467 (https://github.com/shopware/shopware/commit/737ebc79)
# and a config setting since 5.3 (liveMigration -> false)
#
# [1] the unfortunate case was a mail scanner/crawler requesting all the parent
# directories of imagelinks contained in a just sent out newsletter, causing
# the immediate disappearance of those images. Look out for these log entries:
# core.ERROR: Legacy media url detected.
# {"requestedUrl":"/media/image/19/85/90",
# "redirectedTo":"https://example.com/media/image/e2/13/9b/90"}
#
# As thumbnail detection "in hindsight" is not accurate, use at your own risk.
# Only the database can really tell source image and thumbnail apart.
# Print the usage text when the script is started without arguments (the run
# then continues with the defaults), or announce a real run when the second
# argument asks for one.
if [[ -n "$*" ]]; then
  if [[ "$2" == '--real-run' ]]; then
    echo "real-run. cleaning up.."
  fi
else
  cat <<'EOF'
Choose search policy: exceeding (default), basedir, all (risky)
Usage: ~/bin/media-cleanup.sh {policy} (--real-run) | tee ~/media-cleanup.log

Default is report mode only, use --real-run to execute changes.

EOF
fi
# safety check: all paths handled below are relative to media/image/, so
# refuse to run from any other working directory. The refusal is reported on
# stderr and with a non-zero status so callers can tell nothing was done.
if [[ "$PWD" != *media/image ]]; then
  echo "Script not run from media/image/, please navigate to that directory. Exiting.." >&2
  exit 1
fi
# thumbnail detection
#resolutions="57x57 140x140 200x200 600x600 800x800 1280x1280 1920x1920"
if [ -z "$resolutions" ]; then
  # Print one key of the "db" section of the shop's config.php (nothing if the
  # key is not set). The key is handed over as an argument and the array keys
  # are quoted strings, so PHP never has to resolve bare constants.
  # PHP diagnostics are discarded on purpose: a failed lookup leaves the
  # value empty and is caught by the check on $resolutions that follows.
  read_db_setting() {
    php --run '
      $loader = require("../../autoload.php");
      $config = include("../../config.php");
      if (isset($config["db"][$argv[1]])) {
        echo $config["db"][$argv[1]];
      }
    ' -- "$1" 2>/dev/null
  }

  # read db credentials
  for var in username dbname password; do
    export "$var=$(read_db_setting "$var")"
  done
  dbhost=$(read_db_setting host)
  dbport=$(read_db_setting port)

  # host and port are only passed on when the config defines them
  mysql_opts=(-u "$username" -B -N)
  if [ -n "$dbhost" ]; then
    mysql_opts+=(-h "$dbhost")
  fi
  if [ -n "$dbport" ]; then
    mysql_opts+=(-P "$dbport")
  fi

  # query for all possible thumbnail dimensions, add non-explicit thumbnail
  # sizes. concat_ws keeps a separator between the stored sizes and the added
  # defaults (otherwise e.g. "800x800" + "57x57" is read as "800x8005") and
  # skips a NULL thumbnail_size.
  # The password is passed via MYSQL_PWD so that special characters survive
  # and it does not show up in the argument list.
  resolutions=$(MYSQL_PWD="$password" mysql "${mysql_opts[@]}" -e \
    "select concat_ws(';', thumbnail_size, '57x57', '140x140') from s_media_album_settings group by thumbnail_size;" \
    "$dbname" |
    grep -o -P "\d{2,4}x\d{2,4}" | sort -n | uniq | tr "\r\n" " ")
fi
# safety check: 140x140 is part of every successful lookup (and of the
# commented example value), so its absence means no usable list of thumbnail
# resolutions is available and thumbnails could not be told apart.
if [[ "$resolutions" != *140x140* ]]; then
  echo '' >&2
  echo 'Database fetch failed, define path to config_$env.php or set $resolutions manually (see the commented example in the thumbnail detection section)' >&2
  exit 1
fi
# Build the "grep -E" alternations used for thumbnail detection, one per file
# type, e.g. "_57x57\.jpg|_140x140\.jpg".
# Globals: resolutions (read, whitespace separated list of WIDTHxHEIGHT),
#          thumbjpg, thumbpng, thumbgif (written)
build_thumb_patterns() {
  local entry
  # split the list on whitespace regardless of the caller's IFS
  local IFS=$' \t\n'
  # start from scratch so values inherited from the environment or from an
  # earlier call never leak into the patterns
  thumbjpg=""
  thumbpng=""
  thumbgif=""
  for entry in $resolutions; do
    thumbjpg+="_${entry}\.jpg|"
    thumbpng+="_${entry}\.png|"
    thumbgif+="_${entry}\.gif|"
  done
  # trim the right pipe, an empty trailing alternative would match everything
  thumbjpg=${thumbjpg%"|"}
  thumbpng=${thumbpng%"|"}
  thumbgif=${thumbgif%"|"}
}
build_thumb_patterns
# Turn the path read from stdin into its md5 directory prefix: the first six
# hex digits of the md5 sum as three two-character levels, each followed by a
# slash ("xx/yy/zz/"). A level named "ad" is replaced by "g0", the blacklist
# for adblockers, see
# engine/Shopware/Bundle/MediaBundle/Strategy/Md5Strategy.php#L36
md5strategy() {
  md5sum \
    | cut -c1-6 \
    | sed -e 's|..|&/|g' -e 's|ad/|g0/|g'
}
# Decide whether a file is a generated thumbnail rather than a source image.
#
# A candidate must end in _<width>x<height>[@2x].<ext>. It counts as a
# thumbnail when it is an @2x variant with a three character extension, or
# when its size suffix is one of the known resolutions AND file(1) reports the
# marker looked for per type (gd-jpeg creator comment for jpg,
# "non-interlaced" for png, "version 87a" for gif).
# The file command introduces I/O and is less confident on png files. Other
# media files than jpg/png are not tested.
#
# Globals:   fullpath (read when no argument is given),
#            thumbjpg, thumbpng, thumbgif (read, "grep -E" alternations)
# Arguments: $1 - optional path to check, defaults to $fullpath
# Returns:   0 if the file is considered a thumbnail, 1 otherwise
# Stdin is not read.
function isthumbnail() {
  local path=${1:-$fullpath}

  # the dot before the extension is matched literally
  if ! printf '%s\n' "$path" | grep -q -P '_\d{2,4}x\d{2,4}(@2x)?\.\w{3,4}$'; then
    return 1
  fi

  if printf '%s\n' "$path" | grep -q -P '@2x\.\w{3}$'; then
    return 0
  fi

  # the path is quoted for file(1) so names with whitespace or glob
  # characters are inspected as exactly one file
  if printf '%s\n' "$path" | grep -q -E "$thumbjpg" \
    && file -- "$path" | grep -q 'CREATOR: gd-jpeg'; then
    return 0
  fi
  if printf '%s\n' "$path" | grep -q -E "$thumbpng" \
    && file -- "$path" | grep -q "non-interlaced"; then
    return 0
  fi
  if printf '%s\n' "$path" | grep -q -E "$thumbgif" \
    && file -- "$path" | grep -q "version 87a"; then
    return 0
  fi

  return 1
}
# do the actual work
#
# find emits one record per file: "<dir>|<dir>/<name>|<name>". For each file
# the md5 target directory is computed from its virtual path and compared with
# the directory it was found in. Misplaced files are reported and, with
# --real-run as second script argument, moved to their target or deleted when
# the target already holds a file of that name.
# The words "mismatch" and "proper" in the report lines are what
# cleanup-stats.sh counts, keep them stable.
# IFS is deliberately left at "|" after the loop header: unquoted expansions
# further down then never split on whitespace in file names.
IFS="|"
while read -r origpath fullpath filename
do
  # skip records without a directory part
  if [ -z "$origpath" ]; then
    continue
  fi

  # is it a thumbnail or a sourceimage, assign path
  if echo "$fullpath" | isthumbnail; then
    mediapath="media/image/thumbnail/$filename"
    mediatype="thumb"
  else
    mediapath="media/image/$filename"
    mediatype="image"
  fi

  md5path=$(printf '%s' "$mediapath" | md5strategy)
  # without a target directory every check below would act on the current
  # directory and the file could be deleted without ever being moved
  if [ -z "$md5path" ]; then
    echo "cannot compute target path for $fullpath, skipping" >&2
    continue
  fi

  # does the found path equal the should-be path by name (and level)
  if [ "$origpath/" = "./$md5path" ]; then
    #echo "$mediatype: $fullpath correct"
    continue
  fi

  echo -n "$mediatype: $fullpath mismatch, should be at $md5path"
  if [ ! -f "$md5path$filename" ]; then
    # moving the file, verbose and non-overwriting/update-only
    if [ "$2" = '--real-run' ]; then
      echo -n " move: "
      if mkdir -p "$md5path" && mv -v -u "$fullpath" "$md5path"; then
        # mv -u may skip the move; the misplaced file is only deleted at its
        # origin once a file really exists at the target
        if [ -f "$md5path$filename" ]; then
          rm -f "$fullpath"
        fi
      else
        # never delete the only copy after a failed mkdir or mv
        echo "failed, keeping $fullpath"
      fi
    else
      echo
    fi
  else
    echo -n ", but exists at proper target already. real-run for delete. "
    if [ "$2" = '--real-run' ]; then
      rm -v "$fullpath"
    else
      echo
    fi
  fi
# define scope of cleanup, for the parentheses in the options see stackoverflow.com/a/34503049
done < <(
  case "$1" in
    (exceeding|"")
      # find obvious violators exceeding the 3-folder deep md5 structure
      # this is the default
      find . -mindepth 5 -type f -printf "%h|%h/%f|%f\n"
      ;;
    (basedir)
      # check only media/image/ and media/image/thumbnail/
      find . -maxdepth 2 -type f -printf "%h|%h/%f|%f\n"
      ;;
    (all)
      # find everything
      find . -type f -printf "%h|%h/%f|%f\n"
      ;;
    (*)
      # e.g. --real-run given as first argument: say so instead of silently
      # scanning nothing
      echo "Unknown search policy '$1'. Choose search policy: exceeding (default), basedir, all (risky)" >&2
      ;;
  esac)
# this deletes hollow directory trees. The split md5 hash directory structure
# can make up to 64GB at 4096 bytes per dir: 4096*4096*4096
#
# Arguments: $1 - "--real-run" to delete, anything else only reports the
#                 number of currently empty directories below the working
#                 directory
cleanup_empty_dirs() {
  local emptydirs
  if [ "${1-}" = '--real-run' ]; then
    echo "deleting emptydirs"
    # depth-first: a directory is tested after its children were handled, so
    # a parent that became empty is removed in the same single pass, however
    # deep the tree is
    find . -mindepth 1 -depth -type d -empty -delete
  else
    emptydirs=$(find . -mindepth 1 -type d -empty | wc -l)
    echo "count of emptydirs: $emptydirs"
  fi
}
cleanup_empty_dirs "$2"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment