-
-
Save benlk/0fe0c4c1d35db356b3d9 to your computer and use it in GitHub Desktop.
Script used to clean Joomla posts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Usage:
#   cleanup.sh
#
# This shell script uses sed and regular expressions to clean up a lot of
# HTML in posts. The HTML to clean is read from in.txt.
# This script creates a .tmp.txt that will be often overwritten.
#
# Revision 27

# Stop right away when in.txt, the necessary input file, does not exist.
if [[ ! -f "in.txt" ]]; then
  printf '\e[0;31mERR\e[0m: "in.txt" does not exist \n'
  printf '\e[0;31mERR\e[0m: Please save your HTML in in.txt \n'
  exit 1
fi
# Remove all the newlines (and the other control characters listed below),
# to make cleanup easier.
#
# Reads stdin, writes stdout. tr takes a plain list of characters, not a
# regex bracket expression: wrapping the list in [ ] would add "[" and "]"
# to the set and silently delete real brackets from the HTML.
strip_control_chars() {
  tr -d '\b\f\n\r\t\v\a'
}
strip_control_chars < in.txt > .tmp.txt
# Apply every cleanup substitution to the flattened HTML.
# Reads stdin, writes stdout; the call below feeds it .tmp.txt and stores the
# result in out.txt.
#
# Comments for this section are inline, inside the sed program.
# Newer versions of sed should be okay with the commented lines.
# In the event your version of sed does not like this, remove all lines
# inside the program that start with a #.
#
# The sed program is deliberately not indented: several replacements end in
# a backslash-newline, and any indentation on the following line would end up
# in the output.
#
# Inside bracket expressions the / delimiter is written as \/ because not
# every sed treats an unescaped delimiter inside [ ] as a literal character.
#
# NOTE(review): several rules that the inline comments describe as handling
# non-breaking spaces contain only plain spaces here (the first rule replaces
# a space with a space, and two of the "<p> </p>" rules are identical).
# Presumably entity text such as a non-breaking-space entity was lost at some
# point; confirm against a known-good copy before relying on those rules.
#
# NOTE(review): in the mouseover rules the sequence \-\ inside brackets looks
# like it parses as a one-character range rather than a literal hyphen, so
# hyphenated file names or alt texts may not match; confirm before changing.
clean_html() {
  sed -E -e '
# replace non-breaking spaces with spaces
s/ / /g
#
# Get rid of other whitespace
# ([[:space:]] rather than \s: inside brackets \s also matches a literal s)
s/>[[:space:]]</></g
#
# get rid of a classes in this blacklist
s/ class="[Aa]uthor[A-Za-z0-9\ \-]?"//g
s/ class="[Aa]uthor(-[Nn]ame)? para-style-override-[0-9]+"//g
s/ class="[Bb]ody-([A-Za-z0-9\-]*)?( para-style-override-[0-9]+)?"//g
s/ class="[Cc]aption"//g
s/ class="[Cc]itation-style"//g
s/ class="[Cc]aption para-style-override-[0-9]+"//g
s/ class="[Cc]har-style-override-[0-9]+"//g
s/ class="[Hh]eadline"//g
s/ class="[Ss]idebar-body-text"//g
s/ class="[Ss]tory"//g
# also get rid of all p class=""
s/p class="[^"]+"/p/g
#
# get rid of alignments
s/ align="[A-Za-z]+"//g
#
# get rid of the spans
s/<span class="[a-zA-Z0-9\ \-]+">//g
s/<span class="Apple-style-span">//g
s/<span style="[^"]+"[ ]?>//g
s/<span[ ]?>//g
s/<\/span>//g
#
# no more styles
s/ style="[^"]+"//g
#
# remove some more nbsp chars
s/ /\ /g
s/ <\/p>/<\/p>/g
s/ <br \/>/<br \/>/g
s/( ( )?){3,}( )?//g
s/<p>( )+/<p>/g
s/<p><em>( )+<\/em><\/p>//g
s/<p><([a-zA-Z][a-zA-Z0-9]*)>( )+<\/\1>/<p>/g
s/<div( class="[a-z]+")?>( )+<\/div>//g
s/<br \/>( )?<br \/><\/li>/<\/li>/g
s/<br \/>( )?( )?<br \/>/<\/p><p>/g
#
# Remove some superfluous elements
s/<br data-mce-bogus="1" \/>//g
s/<p><\/p>//g
s/<p> <\/p>//g
s/<p> <\/p>//g
s/<p><br \/>/<p>/g
s/®//g
s/<em><br \/><\/em>/<br \/>/g
s/<em><\/em>//g
s/<\/em><em>//g
s/<em>[ ]?<\/em>//g
s/<strong><\/strong>//g
s/<\/strong><strong>//g
s/<em><strong><br[ ]?\/><\/strong><\/em>//g
s/<strong><br \/><\/strong>//g
s/<div class="[^m][^>]+>[ ]?[ ]?<\/div>//g
s/<div><\/div>//g
s/<div> <\/div>//g
s/<p><strong> <\/strong><\/p>//g
s/<sup><\/sup>//g
s/<p><strong><strong>(<img[^>]+>)<\/strong>([A-Za-z0-9])/<p>\1<\/p><p><strong>\2/g
#
# Remove mug tables? Does not work.
# http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/
# s/<table[^>]+><tbody><tr><td[^>]+>(<img [^>]+>)<\/td><td> <\/td><\/tr><tr><td>(<p[a-zA-Z0-9_\/<>"\:\@\.\;\&\ ]+)<\/td><\/tr><\/tbody><\/table>/<p>\1<\/p>\2/g
# \1 image
# \2 paragraph
#
# Shuffle elements around
s/<br[ ]?\/><\/a>/<\/a><br \/>/g
s/<br[ ]?\/><\/strong>/<\/strong><br \/>/g
s/<\/strong><br[ ]?\/>/<\/strong><\/p><p>/g
s/<strong><\/p><p>/<\/p><p><strong>/g
#s/\"\ \/>([A-Za-z0-9\u2013\u2014\u2018\u2019'"'"'"’]+)/" \/><\/p><p>\1/g
s/" \/>[ ]?([^ -~]?[A-Za-z0-9]+)/" \/><\/p><p>\1/
# p strong img /p p -> p img /p p strong, see "an alternative to laziness" http://www.regular-expressions.info/repeat.html
s/(<p>)<strong>(<img [^>]+><\/p><p>)/\1\2<strong>/g
s/<hr id="system-readmore" \/><\/p>/<\/p><hr id="system-readmore" \/>/g
s/<a ([^>]+)><br[ ]?\/>/<br \/><a \1>/g
s/(<p><a [^>]+><img [^>]+><\/a>)([A-Za-z0-9])/\1<\/p><p>\2/g
#
# all the mouseover images
s/ onmouseover="this.src='"'"'([a-zA-Z0-9\-\._\/]+\.[efgijnp]+)'"'"';" onmouseout="([a-zA-Z0-9\-\._\/"='"'"';]+)" alt="([0-9a-zA-Z\-\ _\.]+)" src="([a-zA-Z0-9\-\._\/]+\.[efgijnp]+)"/ src="\1" alt="\3" width="575" height="auto"/g
s/ src="([a-zA-Z0-9\-\._\/]+\.[efgijnp]+)" alt="([0-9a-zA-Z\-\ _\.]+)" onmouseout="([a-zA-Z0-9\-\._\/"='"'"';]+)" onmouseover="this.src='"'"'([a-zA-Z0-9\-\._\/]+\.[efgijnp]+)'"'"';"/ src="\4" alt="\2" width="575" height="auto"/g
s/ onmouseover="this.src='"'"'([a-zA-Z0-9\-\._\/]+\.[efgijnp]+)'"'"';" onmouseout="([a-zA-Z0-9\-\._\/"='"'"';]+)" src="([a-zA-Z0-9\-\._\/]+\.[efgijnp]+)" alt="([0-9a-zA-Z\-\ _\.]+)"/ src="\1" alt="\4" width="575" height="auto"/g
s/ onmouseover="this.src='"'"'([a-zA-Z0-9\-\._\/]+\.[efgijnp]+)'"'"';" onmouseout="([a-zA-Z0-9\-\._\/"='"'"';]+)" alt="([0-9a-zA-Z\-\ _\.]+)" height=" " src="([a-zA-Z0-9\-\._\/]+\.[efgijnp]+)"/ src="\1" alt="\3" width="575" height="auto"/g
s/ onmouseover="this.src='"'"'([a-zA-Z0-9\-\._\/]+\.[efgijnp]+)'"'"';" onmouseout="([a-zA-Z0-9\-\._\/"='"'"';]+)" alt="([0-9a-zA-Z\-\ _\.]+)" height="[a-zA-Z0-9\ ]+" width="[a-zA-Z0-9\ ]+" src="([a-zA-Z0-9\-\._\/]+\.[efgijnp]+)"/ src="\1" alt="\3" width="575" height="auto"/g
s/<img style="[a-zA-Z0-9:;#'"'"'\.\ \-]+" /<img /g
#
# fix image and iframe widths
s/width="5.." height="..."/width="100%" height="auto"/g
s/height="..." width="5.."/width="100%" height="auto"/g
s/width="385" height="([0-9auto]+)"/width="100%" height="auto"/g
s/height="([0-9auto]+)" width="385"/width="100%" height="auto"/g
s/<iframe width="100%" height="auto"/<iframe width="100%" height="323"/
#
# fix images that are inside <p>
s/<p><img ([^>]+)>([^<]+)/<p><img \1><\/p><p>\2/g
# and again, with strong and em!
s/<p><strong><img ([^>]+)>([^<]+)/<p><img \1><\/p><p><strong>\2/g
s/<p><em><img ([^>]+)>([^<]+)/<p><img \1><\/p><p><em>\2/g
# and now no more strong images! (when they are in their own p)
s/<p><strong><img( [^>]+)><\/strong><\/p>/<p><img\1><\/p>/g
# and now no more emphatic images! (when they are in their own p)
s/<p><em><img( [^>]+)><\/em><\/p>/<p><img\1><\/p>/g
#
# change alt text when it fits a certain pattern
# to give the name of the person
s/alt="00 ([a-z]+) ([a-z]+)( [a-z]+)?"/alt="\2 \1"/g
s/alt="00_([a-z]+)_([a-z]+)(_[a-z]+)?"/alt="\2 \1"/g
# To give table or figure titles
s/alt="([a-zA-Z0-9_\ ]+)(tb[_\ ]?)([0-9])"/alt="Table \3: \1\2\3"/g
s/alt="([a-zA-Z0-9_\ ]+)(tbl[_\ ]?)([0-9])"/alt="Table \3: \1\2\3"/g
s/alt="([a-zA-Z0-9_\ ]+)(fg[_\ ]?)([0-9])"/alt="Figure \3: \1\2\3"/g
#
# Re-add mugshot image style
s/ src="images\/stories\/mugs\// style="margin-right: 10px; float: left;" src="images\/stories\/mugs\//g
#
# Fix a certain broken end_mark.jpg link
s/%7Eprobeef\/images\/stories\/badges\/end_mark.jpg/images\/stories\/badges\/end_mark.jpg/g
# And the other one
s/file:\/\/\/Users\/judyhall\/Desktop\/[^\/]+\/Beef_head_opt.jpeg/images\/stories\/badges\/end_mark.jpg/
s/alt="Beef_head.ai"/alt="end mark"/
#
# Make H2 tags exist (partially undone in case of photos)
# Currently does not work with ’ char inserted, but should be taken care of with unicode
s/<p><strong>([^<]+)<\/strong>[:]?<br \/>/<h2>\1<\/h2><p>/g
s/<p><strong>([^<]+)<br \/><\/strong>/<h2>\1<\/h2><p>/g
s/<p><strong>([^<]+)<\/strong><br \/>/<h2>\1<\/h2><p>/g
s/<p><strong>([^<]+)<\/strong><\/p>/<h2>\1<\/h2>/g
#
# Fix h2 in mugshots
s/(<p><img[^>]+><\/p>)<h2>([^<]+)<\/h2><p>([^<]+<br \/>)/\1<p><strong>\2<\/strong><br \/>\3/g
#
# grammar teachers hate this one simple trick
s/\. /\. /g
s/ / /g
s/p><em>-- /p><em>\—/g
#
# Standards:
# callto: is deprecated. See https://tools.ietf.org/html/rfc3966#section-7.3
s/href="callto:/href="tel:/g
s/href="call to[ ]?/href="tel:/g
#
# cleanup of photo headlines
s/([A-Z\ ]+) PHOTO( )?/\1/g
s/([A-Z\ ]+) RIGHT( )?:/\1:/g
s/([A-Za-z\ ]+) Right( )?:/\1:/g
s/([A-Za-z\ ]+) right( )?:/\1:/g
s/([A-Z]+) RIGHT ([A-Z]+)( )?:/\1 \2:/g
s/([A-Za-z]+) Right ([A-Za-z]+)( )?:/\1 \2:/g
s/([A-Za-z]+) right ([A-Za-z]+)( )?:/\1 \2:/g
s/([A-Z\ ]+) LEFT( )?:/\1:/g
s/([A-Za-z\ ]+) Left( )?:/\1:/g
s/([A-Z]+) MIDDLE( )?\:/MIDDLE \1\:/g
s/PHTOT /PHOTO /g
s/<h2>PHOTO([S]?)( )?[:]?[ ]?( )?<\/h2>/<p><strong>PHOTO\1<\/strong><\/p>/g
s/<h2>Photo( )?[:]?[ ]?<\/h2>/<p><strong>PHOTO<\/strong><\/p>/g
s/<h2>Photos( )?[:]?[ ]?<\/h2>/<p><strong>PHOTOS<\/strong><\/p>/g
#
# Re-adding whitespace and newlines to the story
s/<br \/>/<br \/>\
/g
s/<\/([a-zA-Z][A-Za-z0-9]?)><([a-zA-Z][A-Za-z0-9]?)/<\/\1>\
<\2/g
s/ \/><([a-zA-Z][A-Za-z0-9]?)/ \/>\
<\1/g
s/<([ou]l)><li>/<\1>\
<li>/g
s/<\/li><\/([ou]l)>/<\/li>\
<\/\1>/g
s/><p/>\
<p/g
s/><\/div>/>\
<\/div>/g
s/><div/>\
<div/g
'
}
clean_html < .tmp.txt > out.txt
# The following section helps in knowing what still needs to be changed.
#
# Warn about leftover <div> and <li> markup.
# Arguments: $1 - cleaned HTML file to inspect (defaults to out.txt)
# Outputs:   warnings on stdout
#
# Occurrences are counted with "grep -o | wc -l" on purpose: "grep -c" counts
# matching lines, and the cleaned file can hold many tags on a single line.
check_div_and_list_counts() {
  local html=${1:-out.txt}
  local divs mug_divs list_items rmug_divs

  # div.mug shall stay. Many divs can be easily made into div.mugs, but there
  # are the weird divs out there.
  divs=$(grep -o -F -- '<div' "$html" | wc -l)
  mug_divs=$(grep -o -F -- '<div class="mug"' "$html" | wc -l)
  if ((divs != mug_divs)); then
    printf '\e[1;33mWARN\e[0m: This file has divs that are not div class="mug".\n'
  fi

  # This works, even though it sucks?
  # The list warning fires whenever the number of "<li" occurrences differs
  # from the number of div.rmug occurrences.
  list_items=$(grep -o -F -- '<li' "$html" | wc -l)
  rmug_divs=$(grep -o -F -- '<div class="rmug"' "$html" | wc -l)
  if ((list_items != rmug_divs)); then
    printf '\e[1;33mWARN\e[0m: This file has lists.\n'
  fi
}
check_div_and_list_counts out.txt
# Warn about lists that were typed by hand instead of using list markup.
# Arguments: $1 - cleaned HTML file to inspect (defaults to out.txt)
# Outputs:   warnings on stdout
#
# All patterns are matched as fixed strings (-F):
#  - as a regex, ">1." also matched ">10", ">1<" and so on;
#  - as a regex, "\u2022" is an undefined escape rather than the bullet
#    character, so here it only matches the literal escape text.
check_manual_lists() {
  local html=${1:-out.txt}

  if grep -q -F -e '•' -e '■' -e '*' -e '\u2022' -e '\u25A0' -- "$html"; then
    printf '\e[1;33mWARN\e[0m: This file has manual bullets.\n'
  fi

  if grep -q -F -- '>1.' "$html"; then
    printf '\e[1;33mWARN\e[0m: This file has manual ordered lists.\n'
  fi
}
check_manual_lists out.txt
# Report markup that must not survive the cleanup.
# Arguments: $1 - cleaned HTML file to inspect (defaults to out.txt)
# Outputs:   error messages on stdout
check_tables_and_local_links() {
  local html=${1:-out.txt}

  # <table>s shall not pass
  if grep -q -F -- '<table' "$html"; then
    printf '\e[0;31mERR\e[0m: This file has tables.\n'
  fi

  # Neither shall file:/// URLs, which point at files on a local computer.
  if grep -q -F -- 'file:///' "$html"; then
    printf '\e[0;31mERR\e[0m: This file links to files on a desktop computer.\n'
    printf ' images/stories/badges/end_mark.jpg if you need it.\n'
  fi
}
check_tables_and_local_links out.txt
# Warn about images that are still wrapped in <strong> and/or <em>, and about
# nested <strong> tags.
# Arguments: $1 - cleaned HTML file to inspect (defaults to out.txt)
# Outputs:   warnings on stdout
#
# this section is kinda messy: each if/elif pair reports the doubly wrapped
# form when present and otherwise falls back to the singly wrapped form.
check_emphasised_images() {
  local html=${1:-out.txt}

  if grep -q -F -- '<strong><em><img' "$html"; then
    printf '\e[1;33mWARN\e[0m: This file has strong, emphatic images.\n'
  elif grep -q -F -- '<em><img' "$html"; then
    printf '\e[1;33mWARN\e[0m: This file has emphatic images.\n'
  fi

  if grep -q -F -- '<em><strong><img' "$html"; then
    printf '\e[1;33mWARN\e[0m: This file has emphatic, strong images.\n'
  elif grep -q -F -- '<strong><img' "$html"; then
    printf '\e[1;33mWARN\e[0m: This file has strong images.\n'
  fi

  # I don't know why this happened, but it happens.
  if grep -q -F -- '<strong><strong>' "$html"; then
    printf '\e[1;33mWARN\e[0m: This file has nested <strong> tags.\n'
  fi
}
check_emphasised_images out.txt
# Warn when the file may contain mugshots.
# Arguments: $1 - cleaned HTML file to inspect (defaults to out.txt)
# Outputs:   at most one warning on stdout
#
# sometimes a mug is not in the correct folder, but this susses out the ones
# that are, and guesses on images with common mugshot narrow widths.
# earlier mug images have been filed in the past with the story photos
# instead of in the mugs folder
check_possible_mugshots() {
  local html=${1:-out.txt}

  # Any one of the three hints is enough, and the message is the same for all
  # of them, so a single grep with several patterns covers every case.
  if grep -q -F -e '/mugs/' -e 'width="84"' -e 'width="80"' -- "$html"; then
    printf '\e[1;33mWARN\e[0m: This file may have mugshots. Check for missing style="" statements.\n'
  fi
}
check_possible_mugshots out.txt
# Compare counts between the cleaned output and the original input.
# Arguments: $1 - cleaned HTML file (defaults to out.txt)
#            $2 - original HTML file (defaults to in.txt)
# Outputs:   error messages on stdout
#
# Occurrences are counted with "grep -o | wc -l" on purpose: "grep -c" counts
# matching lines, and either file can hold many tags on a single line.
check_style_and_image_counts() {
  local cleaned=${1:-out.txt}
  local original=${2:-in.txt}
  local styles narrow_widths mug_paths images_out images_in

  # Switched to a more conservative style detection and removal rule, to
  # prevent it from eating lots of stuff.
  # The detection here is problematic, but fails safe and sends the warning
  # anyways: the error is skipped only when the number of style="" attributes
  # left in the output equals the number of width="8..." attributes in the
  # output or the number of /mugs/ paths in the input.
  styles=$(grep -o -F -- 'style="' "$cleaned" | wc -l)
  narrow_widths=$(grep -o -F -- 'width="8' "$cleaned" | wc -l)
  mug_paths=$(grep -o -F -- '/mugs/' "$original" | wc -l)
  if ((styles != narrow_widths && styles != mug_paths)); then
    printf '\e[0;31mERR\e[0m: This file'"'"'s style="" directives were not removed.\n'
  fi

  # Every image that went in has to come out again.
  images_out=$(grep -o -F -- '<img' "$cleaned" | wc -l)
  images_in=$(grep -o -F -- '<img' "$original" | wc -l)
  if ((images_out != images_in)); then
    printf '\e[0;31mERR\e[0m: Number of images going in and out does not match.\n'
  fi
}
check_style_and_image_counts out.txt in.txt
# Final checks for markup the cleanup is expected to keep or remove.
# Arguments: $1 - cleaned HTML file to inspect (defaults to out.txt)
# Outputs:   warnings and error messages on stdout
check_leftover_markup() {
  local html=${1:-out.txt}

  # The read-more marker is expected in the output; ask about it when absent.
  if ! grep -q -F -- 'system-readmore' "$html"; then
    printf '\e[1;33mWARN\e[0m: Did this file have a <hr id="system-readmore" /> going in?.\n'
  fi

  # Because sometimes they aren't removed, if the tag is structured
  # differently than my assumption.
  if grep -q -F -- 'onmouse' "$html"; then
    printf '\e[0;31mERR\e[0m: This file'"'"'s onmouseover images were not converted.\n'
  fi

  if grep -q -F -- '<span cl' "$html"; then
    printf '\e[1;33mWARN\e[0m: Some <span> tags were not removed.\n'
  fi
}
check_leftover_markup out.txt
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
http://threepanelsoul.com/2014/06/02/code-maintenance/
Edit, updated permalink: http://www.threepanelsoul.com/comic/code-maintenance