Skip to content

Instantly share code, notes, and snippets.

@benlk

benlk/cleanup.sh Secret

Created January 14, 2015 04:58
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save benlk/0fe0c4c1d35db356b3d9 to your computer and use it in GitHub Desktop.
Save benlk/0fe0c4c1d35db356b3d9 to your computer and use it in GitHub Desktop.
Script used to clean Joomla posts
#!/bin/bash
#
# Usage:
# cleanup.sh
# This shell script uses sed and regular expressions to clean up a lot of HTML in posts.
# It reads in.txt, writes the cleaned HTML to out.txt, and uses .tmp.txt as a scratch file that is overwritten on every run.
#
# Revision 27
# in.txt is the only input this script reads; without it there is nothing to
# clean, so say so and stop with a non-zero status.
[ -f "in.txt" ] || {
  printf '\e[0;31mERR\e[0m: "in.txt" does not exist \n'
  printf '\e[0;31mERR\e[0m: Please save your HTML in in.txt \n'
  exit 1
}
# Remove all the newlines, to make cleanup easier.
# Copies in.txt to .tmp.txt with backspace, form feed, newline, carriage
# return, tab, vertical tab and bell characters deleted, so the whole post
# ends up on a single line for the sed pass that follows.
flatten_input() {
  # The set is given to tr without surrounding [ ]: tr takes brackets
  # literally, so wrapping the set in them would also delete every "[" and
  # "]" found in the post.
  tr -d '\b\f\n\r\t\v\a' < in.txt > .tmp.txt
}
flatten_input
# Clean-up pass: one sed program turns the flattened HTML in .tmp.txt into out.txt.
# Comments for this section are inline, inside the sed program.
# Newer versions of sed should be okay with the commented lines.
# In the event your version of sed does not like this, remove all lines starting with a #
# NOTE: the program is a single-quoted shell string, so a comment inside it must
# not contain an apostrophe; a literal one inside a rule is spelled '"'"'.
# The rules run top to bottom and later rules rely on what earlier ones did,
# so keep the order when adding new ones.
#
# clean_html: reads HTML on stdin, writes the cleaned HTML on stdout.
clean_html() {
  # The first expression replaces non-breaking spaces with spaces. It is
  # written with $'...' so that the two UTF-8 bytes of U+00A0 reach sed as
  # literal bytes instead of relying on an invisible character in this file.
  sed -E -e $'s/\xc2\xa0/ /g' -e '
# Get rid of other whitespace
# ([[:space:]] is used because a backslash-s inside [] means a backslash or the letter s)
s/>[[:space:]]</></g
#
# get rid of a classes in this blacklist
s/ class="[Aa]uthor[A-Za-z0-9\ \-]?"//g
s/ class="[Aa]uthor(-[Nn]ame)? para-style-override-[0-9]+"//g
s/ class="[Bb]ody-([A-Za-z0-9\-]*)?( para-style-override-[0-9]+)?"//g
s/ class="[Cc]aption"//g
s/ class="[Cc]itation-style"//g
s/ class="[Cc]aption para-style-override-[0-9]+"//g
s/ class="[Cc]har-style-override-[0-9]+"//g
s/ class="[Hh]eadline"//g
s/ class="[Ss]idebar-body-text"//g
s/ class="[Ss]tory"//g
# also get rid of all p class=""
s/p class="[^"]+"/p/g
#
# get rid of alignments
s/ align="[A-Za-z]+"//g
#
# get rid of the spans
s/<span class="[a-zA-Z0-9\ \-]+">//g
s/<span class="Apple-style-span">//g
s/<span style="[^"]+"[ ]?>//g
s/<span[ ]?>//g
s/<\/span>//g
#
# no more styles
s/ style="[^"]+"//g
#
# remove some more nbsp chars
s/&nbsp;&nbsp;/\&nbsp;/g
s/&nbsp;<\/p>/<\/p>/g
s/&nbsp;<br \/>/<br \/>/g
s/(&nbsp;( )?){3,}( )?//g
s/<p>(&nbsp;)+/<p>/g
s/<p><em>(&nbsp;)+<\/em><\/p>//g
s/<p><([a-zA-Z][a-zA-Z0-9]*)>(&nbsp;)+<\/\1>/<p>/g
s/<div( class="[a-z]+")?>(&nbsp;)+<\/div>//g
s/<br \/>(&nbsp;)?<br \/><\/li>/<\/li>/g
s/<br \/>(&nbsp;)?( )?<br \/>/<\/p><p>/g
#
# Remove some superfluous elements
s/<br data-mce-bogus="1" \/>//g
s/<p><\/p>//g
s/<p> <\/p>//g
s/<p> <\/p>//g
s/<p><br \/>/<p>/g
s/®//g
s/<em><br \/><\/em>/<br \/>/g
s/<em><\/em>//g
s/<\/em><em>//g
s/<em>[ ]?<\/em>//g
s/<strong><\/strong>//g
s/<\/strong><strong>//g
s/<em><strong><br[ ]?\/><\/strong><\/em>//g
s/<strong><br \/><\/strong>//g
# drop divs (other than class="m...", i.e. the mugs) that hold nothing but an
# optional space and an optional &nbsp; entity; the entity is a group, not a
# character class, so divs holding a single letter are left alone
s/<div class="[^m][^>]+>[ ]?(&nbsp;)?<\/div>//g
s/<div><\/div>//g
s/<div>&nbsp;<\/div>//g
s/<p><strong>&nbsp;<\/strong><\/p>//g
s/<sup><\/sup>//g
s/<p><strong><strong>(<img[^>]+>)<\/strong>([A-Za-z0-9])/<p>\1<\/p><p><strong>\2/g
#
# Remove mug tables? Does not work.
# http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/
# s/<table[^>]+><tbody><tr><td[^>]+>(<img [^>]+>)<\/td><td>&nbsp;<\/td><\/tr><tr><td>(<p[a-zA-Z0-9_\/<>"\:\@\.\;\&\ ]+)<\/td><\/tr><\/tbody><\/table>/<p>\1<\/p>\2/g
# \1 image
# \2 paragraph
#
# Shuffle elements around
s/<br[ ]?\/><\/a>/<\/a><br \/>/g
s/<br[ ]?\/><\/strong>/<\/strong><br \/>/g
s/<\/strong><br[ ]?\/>/<\/strong><\/p><p>/g
s/<strong><\/p><p>/<\/p><p><strong>/g
#s/\"\ \/>([A-Za-z0-9\u2013\u2014\u2018\u2019'"'"'"’]+)/" \/><\/p><p>\1/g
s/" \/>[ ]?([^ -~]?[A-Za-z0-9]+)/" \/><\/p><p>\1/
# p strong img /p p -> p img /p p strong, see "an alternative to laziness" http://www.regular-expressions.info/repeat.html
s/(<p>)<strong>(<img [^>]+><\/p><p>)/\1\2<strong>/g
s/<hr id="system-readmore" \/><\/p>/<\/p><hr id="system-readmore" \/>/g
s/<a ([^>]+)><br[ ]?\/>/<br \/><a \1>/g
s/(<p><a [^>]+><img [^>]+><\/a>)([A-Za-z0-9])/\1<\/p><p>\2/g
#
# all the mouseover images
s/ onmouseover="this.src='"'"'([a-zA-Z0-9\-\._/]+\.[efgijnp]+)'"'"';" onmouseout="([a-zA-Z0-9\-\._/"='"'"';]+)" alt="([0-9a-zA-Z\-\ _\.]+)" src="([a-zA-Z0-9\-\._/]+\.[efgijnp]+)"/ src="\1" alt="\3" width="575" height="auto"/g
s/ src="([a-zA-Z0-9\-\._/]+\.[efgijnp]+)" alt="([0-9a-zA-Z\-\ _\.]+)" onmouseout="([a-zA-Z0-9\-\._/"='"'"';]+)" onmouseover="this.src='"'"'([a-zA-Z0-9\-\._/]+\.[efgijnp]+)'"'"';"/ src="\4" alt="\2" width="575" height="auto"/g
s/ onmouseover="this.src='"'"'([a-zA-Z0-9\-\._/]+\.[efgijnp]+)'"'"';" onmouseout="([a-zA-Z0-9\-\._/"='"'"';]+)" src="([a-zA-Z0-9\-\._/]+\.[efgijnp]+)" alt="([0-9a-zA-Z\-\ _\.]+)"/ src="\1" alt="\4" width="575" height="auto"/g
s/ onmouseover="this.src='"'"'([a-zA-Z0-9\-\._/]+\.[efgijnp]+)'"'"';" onmouseout="([a-zA-Z0-9\-\._/"='"'"';]+)" alt="([0-9a-zA-Z\-\ _\.]+)" height=" " src="([a-zA-Z0-9\-\._/]+\.[efgijnp]+)"/ src="\1" alt="\3" width="575" height="auto"/g
s/ onmouseover="this.src='"'"'([a-zA-Z0-9\-\._/]+\.[efgijnp]+)'"'"';" onmouseout="([a-zA-Z0-9\-\._/"='"'"';]+)" alt="([0-9a-zA-Z\-\ _\.]+)" height="[a-zA-Z0-9\ ]+" width="[a-zA-Z0-9\ ]+" src="([a-zA-Z0-9\-\._/]+\.[efgijnp]+)"/ src="\1" alt="\3" width="575" height="auto"/g
s/<img style="[a-zA-Z0-9:;#'"'"'\.\ \-]+" /<img /g
#
# fix image and iframe widths
s/width="5.." height="..."/width="100%" height="auto"/g
s/height="..." width="5.."/width="100%" height="auto"/g
s/width="385" height="([0-9auto]+)"/width="100%" height="auto"/g
s/height="([0-9auto]+)" width="385"/width="100%" height="auto"/g
s/<iframe width="100%" height="auto"/<iframe width="100%" height="323"/
#
# fix images that are inside <p>
s/<p><img ([^>]+)>([^<]+)/<p><img \1><\/p><p>\2/g
# and again, with strong and em!
s/<p><strong><img ([^>]+)>([^<]+)/<p><img \1><\/p><p><strong>\2/g
s/<p><em><img ([^>]+)>([^<]+)/<p><img \1><\/p><p><em>\2/g
# and now no more strong images! (when they are in their own p)
s/<p><strong><img( [^>]+)><\/strong><\/p>/<p><img\1><\/p>/g
# and now no more emphatic images! (when they are in their own p)
s/<p><em><img( [^>]+)><\/em><\/p>/<p><img\1><\/p>/g
#
# change alt text when it fits a certain pattern
# to give the name of the person
s/alt="00 ([a-z]+) ([a-z]+)( [a-z]+)?"/alt="\2 \1"/g
s/alt="00_([a-z]+)_([a-z]+)(_[a-z]+)?"/alt="\2 \1"/g
# To give table or figure titles
s/alt="([a-zA-Z0-9_\ ]+)(tb[_\ ]?)([0-9])"/alt="Table \3: \1\2\3"/g
s/alt="([a-zA-Z0-9_\ ]+)(tbl[_\ ]?)([0-9])"/alt="Table \3: \1\2\3"/g
s/alt="([a-zA-Z0-9_\ ]+)(fg[_\ ]?)([0-9])"/alt="Figure \3: \1\2\3"/g
#
# Re-add mugshot image style
s/ src="images\/stories\/mugs\// style="margin-right: 10px; float: left;" src="images\/stories\/mugs\//g
#
# Fix a certain broken end_mark.jpg link
s/%7Eprobeef\/images\/stories\/badges\/end_mark.jpg/images\/stories\/badges\/end_mark.jpg/g
# And the other one
s/file:\/\/\/Users\/judyhall\/Desktop\/[^/]+\/Beef_head_opt.jpeg/images\/stories\/badges\/end_mark.jpg/
s/alt="Beef_head.ai"/alt="end mark"/
#
# Make H2 tags exist (partially undone in case of photos)
# Currently does not work with ’ char inserted, but should be taken care of with unicode
s/<p><strong>([^<]+)<\/strong>[:]?<br \/>/<h2>\1<\/h2><p>/g
s/<p><strong>([^<]+)<br \/><\/strong>/<h2>\1<\/h2><p>/g
s/<p><strong>([^<]+)<\/strong><br \/>/<h2>\1<\/h2><p>/g
s/<p><strong>([^<]+)<\/strong><\/p>/<h2>\1<\/h2>/g
#
# Fix h2 in mugshots
s/(<p><img[^>]+><\/p>)<h2>([^<]+)<\/h2><p>([^<]+<br \/>)/\1<p><strong>\2<\/strong><br \/>\3/g
#
# grammar teachers hate this one simple trick
# (any run of two or more spaces, after a full stop or anywhere else, becomes one space)
s/ {2,}/ /g
s/p><em>-- /p><em>\&mdash;/g
#
# Standards:
# callto: is deprecated. See https://tools.ietf.org/html/rfc3966#section-7.3
s/href="callto:/href="tel:/g
s/href="call to[ ]?/href="tel:/g
#
# cleanup of photo headlines
s/([A-Z\ ]+) PHOTO(&nbsp;)?/\1/g
s/([A-Z\ ]+) RIGHT(&nbsp;)?:/\1:/g
s/([A-Za-z\ ]+) Right(&nbsp;)?:/\1:/g
s/([A-Za-z\ ]+) right(&nbsp;)?:/\1:/g
s/([A-Z]+) RIGHT ([A-Z]+)(&nbsp;)?:/\1 \2:/g
s/([A-Za-z]+) Right ([A-Za-z]+)(&nbsp;)?:/\1 \2:/g
s/([A-Za-z]+) right ([A-Za-z]+)(&nbsp;)?:/\1 \2:/g
s/([A-Z\ ]+) LEFT(&nbsp;)?:/\1:/g
s/([A-Za-z\ ]+) Left(&nbsp;)?:/\1:/g
s/([A-Z]+) MIDDLE(&nbsp;)?\:/MIDDLE \1\:/g
s/PHTOT /PHOTO /g
s/<h2>PHOTO([S]?)(&nbsp;)?[:]?[ ]?(&nbsp;)?<\/h2>/<p><strong>PHOTO\1<\/strong><\/p>/g
s/<h2>Photo(&nbsp;)?[:]?[ ]?<\/h2>/<p><strong>PHOTO<\/strong><\/p>/g
s/<h2>Photos(&nbsp;)?[:]?[ ]?<\/h2>/<p><strong>PHOTOS<\/strong><\/p>/g
#
# Re-adding whitespace and newlines to the story
# (a backslash at the end of a line puts a newline into the replacement, so the
# continuation lines below must stay unindented)
s/<br \/>/<br \/>\
/g
s/<\/([a-zA-Z][A-Za-z0-9]?)><([a-zA-Z][A-Za-z0-9]?)/<\/\1>\
<\2/g
s/ \/><([a-zA-Z][A-Za-z0-9]?)/ \/>\
<\1/g
s/<([ou]l)><li>/<\1>\
<li>/g
s/<\/li><\/([ou]l)>/<\/li>\
<\/\1>/g
s/><p/>\
<p/g
s/><\/div>/>\
<\/div>/g
s/><div/>\
<div/g
'
}
clean_html < .tmp.txt > out.txt
# The following section helps in knowing what still needs to be changed.
# check_structure reads out.txt from the current directory and prints one WARN
# line on stdout for each kind of leftover structure it finds: divs other than
# div.mug, lists, hand-typed bullets and hand-typed numbered lists.
check_structure() {
  local divs mugs items rmugs

  # Occurrences are counted with grep -o | wc -l rather than grep -c, because
  # grep -c counts matching lines and out.txt can hold several tags per line.
  divs=$(grep -o "<div" out.txt | wc -l)
  mugs=$(grep -o '<div class="mug"' out.txt | wc -l)
  # div.mug shall stay. Many divs can be easily made into div.mugs, but there are the weird divs out there.
  if (( divs != mugs )); then
    printf '\e[1;33mWARN\e[0m: This file has divs that are not div class="mug".\n'
  fi

  # This works, even though it sucks? The "<li" count is compared with the
  # number of div.rmug elements, exactly as before.
  # NOTE(review): why those two counts should match is not visible here - confirm.
  items=$(grep -o "<li" out.txt | wc -l)
  rmugs=$(grep -o '<div class="rmug"' out.txt | wc -l)
  if (( items != rmugs )); then
    printf '\e[1;33mWARN\e[0m: This file has lists.\n'
  fi

  # -F matches the bullet characters and the asterisk as plain text. grep has
  # no \uXXXX escape, so the bullets are given as literal characters only.
  if grep -qF -e '•' -e '■' -e '*' out.txt; then
    printf '\e[1;33mWARN\e[0m: This file has manual bullets.\n'
  fi

  # -F again: as a regular expression the dot would match any character, and
  # text such as ">15 years" would be reported as a numbered list.
  if grep -qF '>1.' out.txt; then
    printf '\e[1;33mWARN\e[0m: This file has manual ordered lists.\n'
  fi
}
check_structure
# <table>s shall not pass
if grep -q "<table" out.txt; then
  printf '\e[0;31mERR\e[0m: This file has tables.\n'
fi

# Neither shall links that point at files on somebody's local disk.
if grep -q "file:///" out.txt; then
  printf '\e[0;31mERR\e[0m: This file links to files on a desktop computer.\n'
  printf ' images/stories/badges/end_mark.jpg if you need it.\n'
fi

# this section is kinda messy.
# Images still wrapped in <em>: the doubly wrapped case is reported first,
# the plain one only when the doubly wrapped one was not found.
if grep -q "<strong><em><img" out.txt; then
  printf '\e[1;33mWARN\e[0m: This file has strong, emphatic images.\n'
elif grep -q "<em><img" out.txt; then
  printf '\e[1;33mWARN\e[0m: This file has emphatic images.\n'
fi

# Same idea for images still wrapped in <strong>.
if grep -q "<em><strong><img" out.txt; then
  printf '\e[1;33mWARN\e[0m: This file has emphatic, strong images.\n'
elif grep -q "<strong><img" out.txt; then
  printf '\e[1;33mWARN\e[0m: This file has strong images.\n'
fi

# I don't know why this happened, but it happens.
if grep -q "<strong><strong>" out.txt; then
  printf '\e[1;33mWARN\e[0m: This file has nested <strong> tags.\n'
fi

# sometimes a mug is not in the correct folder, but this susses out the ones that are, and guesses on images with common mugshot narrow widths.
# earlier mug images have been filed in the past with the story photos instead of in the mugs folder
# The three probes run in the same order as before and stop at the first hit,
# so the warning is printed at most once.
if grep -q '/mugs/' out.txt ||
  grep -q 'width="84"' out.txt ||
  grep -q 'width="80"' out.txt; then
  printf '\e[1;33mWARN\e[0m: This file may have mugshots. Check for missing style="" statements.\n'
fi
# Switched to a more conservative style detection and removal rule, to prevent it from eating lots of stuff.
# The detection here is problematic, but fails safe and sends the warning anyways.
# The error is raised only when the number of style=" occurrences left in
# out.txt differs both from the number of width="8 occurrences in out.txt and
# from the number of /mugs/ occurrences in in.txt.
styles_left=$(grep -o 'style="' out.txt | wc -l)
if (( styles_left != $(grep -o 'width="8' out.txt | wc -l) )) &&
  (( styles_left != $(grep -o '/mugs/' in.txt | wc -l) )); then
  printf '\e[0;31mERR\e[0m: This file'"'"'s style="" directives were not removed.\n'
fi

# The clean-up must not lose or duplicate images: compare the <img counts of
# the output and the input.
images_out=$(grep -o "<img" out.txt | wc -l)
images_in=$(grep -o '<img' in.txt | wc -l)
if (( images_out != images_in )); then
  printf '\e[0;31mERR\e[0m: Number of images going in and out does not match.\n'
fi

# No read-more marker in the output: ask whether there was one to begin with.
grep -q "system-readmore" out.txt ||
  printf '\e[1;33mWARN\e[0m: Did this file have a <hr id="system-readmore" /> going in?.\n'

# Because sometimes they aren't removed, if the tag is structured differently than my assumption.
if grep -q 'onmouse' out.txt; then
  printf '\e[0;31mERR\e[0m: This file'"'"'s onmouseover images were not converted.\n'
fi

# Spans that still carry a class attribute slipped past the span rules.
if grep -q "<span cl" out.txt; then
  printf '\e[1;33mWARN\e[0m: Some <span> tags were not removed.\n'
fi
@benlk
Copy link
Author

benlk commented Feb 11, 2015

@sflynn850
Copy link

It's too true for comfort D:

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment