benlk/cleanup.sh Secret

## cleanup.sh
#!/bin/bash
#
# Usage:
# 	cleanup.sh
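# This shell script uses sed and regular expressions to clean up a lot of HTML in posts.
# It reads in.txt from the current directory and writes the cleaned HTML to out.txt.
# It also creates a scratch file, .tmp.txt, which is overwritten on every run.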
#
# Revision 27

# Bail out early (exit status 1) unless the required input file, in.txt,
# exists as a regular file in the current directory.
[ -f "in.txt" ] || {
  printf '\e[0;31mERR\e[0m:  "in.txt" does not exist \n'
  printf '\e[0;31mERR\e[0m:  Please save your HTML in in.txt \n'
  exit 1
}

# Remove all the newlines, to make cleanup easier: the sed rules that follow
# work on a single line and match across what used to be line breaks.
#
# strip_control_chars: filter (stdin -> stdout) that deletes backspace, form
# feed, newline, carriage return, tab, vertical tab and bell.
# The set is deliberately NOT wrapped in [ ]: tr treats brackets in a set as
# ordinary characters, so the previous '[...]' spelling also deleted every
# literal "[" and "]" from the HTML.
strip_control_chars() {
  tr -d '\b\f\n\r\t\v\a'
}
strip_control_chars < in.txt > .tmp.txt

# Run every cleanup rule in one sed pass over the flattened HTML in .tmp.txt
# and write the result to out.txt. Rules are applied top to bottom and later
# rules depend on what earlier ones produced, so keep the order when editing.
#
# Comments for this section are inline in the sed program.
# Newer versions of sed should be okay with the commented lines.
# In the event your version of sed does not like this, remove all lines
# inside the program that start with a #.
# Do not put an apostrophe in a sed comment: it would end the shell quoting.
#
# cleanup_html: filter; reads flattened HTML on stdin, writes cleaned HTML
# to stdout.
cleanup_html() {
  sed -Ee '
# replace non-breaking spaces with spaces
# NOTE(review): the first blank in the pattern below is meant to be a literal
# U+00A0 no-break space - confirm it survived copy and paste
s/ / /g
#
# Get rid of other whitespace
# ([[:space:]] instead of [ \s]: inside brackets \s means backslash or the
# letter s, which deleted a lone "s" sitting between two tags)
s/>[[:space:]]</></g
#
# get rid of classes in this blacklist
s/ class="[Aa]uthor[A-Za-z0-9\ \-]?"//g
s/ class="[Aa]uthor(-[Nn]ame)? para-style-override-[0-9]+"//g
s/ class="[Bb]ody-([A-Za-z0-9\-]*)?( para-style-override-[0-9]+)?"//g
s/ class="[Cc]aption"//g
s/ class="[Cc]itation-style"//g
s/ class="[Cc]aption para-style-override-[0-9]+"//g
s/ class="[Cc]har-style-override-[0-9]+"//g
s/ class="[Hh]eadline"//g
s/ class="[Ss]idebar-body-text"//g
s/ class="[Ss]tory"//g
# also get rid of all p class=""
s/p class="[^"]+"/p/g
#
# get rid of alignments
s/ align="[A-Za-z]+"//g
#
# get rid of the spans
s/<span class="[a-zA-Z0-9\ \-]+">//g
s/<span class="Apple-style-span">//g
s/<span style="[^"]+"[ ]?>//g
s/<span[ ]?>//g
s/<\/span>//g
#
# no more styles
s/ style="[^"]+"//g
#
# remove some more nbsp chars
s/&nbsp;&nbsp;/\&nbsp;/g
s/&nbsp;<\/p>/<\/p>/g
s/&nbsp;<br \/>/<br \/>/g
s/(&nbsp;( )?){3,}( )?//g
s/<p>(&nbsp;)+/<p>/g
s/<p><em>(&nbsp;)+<\/em><\/p>//g
s/<p><([a-zA-Z][a-zA-Z0-9]*)>(&nbsp;)+<\/\1>/<p>/g
s/<div( class="[a-z]+")?>(&nbsp;)+<\/div>//g
s/<br \/>(&nbsp;)?<br \/><\/li>/<\/li>/g
s/<br \/>(&nbsp;)?( )?<br \/>/<\/p><p>/g
#
# Remove some superfluous elements
s/<br data-mce-bogus="1" \/>//g
s/<p><\/p>//g
s/<p> <\/p>//g
s/<p> <\/p>//g
s/<p><br \/>/<p>/g
s/®//g
s/<em><br \/><\/em>/<br \/>/g
s/<em><\/em>//g
s/<\/em><em>//g
s/<em>[ ]?<\/em>//g
s/<strong><\/strong>//g
s/<\/strong><strong>//g
s/<em><strong><br[ ]?\/><\/strong><\/em>//g
s/<strong><br \/><\/strong>//g
# (&nbsp;)? matches the whole entity; the old [&nbsp;]? was a bracket set
# that matched only one of its characters, so these divs were never removed
s/<div class="[^m][^>]+>[ ]?(&nbsp;)?<\/div>//g
s/<div><\/div>//g
s/<div>&nbsp;<\/div>//g
s/<p><strong>&nbsp;<\/strong><\/p>//g
s/<sup><\/sup>//g
s/<p><strong><strong>(<img[^>]+>)<\/strong>([A-Za-z0-9])/<p>\1<\/p><p><strong>\2/g
#
# Remove mug tables? Does not work.
# http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/
# s/<table[^>]+><tbody><tr><td[^>]+>(<img [^>]+>)<\/td><td>&nbsp;<\/td><\/tr><tr><td>(<p[a-zA-Z0-9_\/<>"\:\@\.\;\&\ ]+)<\/td><\/tr><\/tbody><\/table>/<p>\1<\/p>\2/g
# \1 image
# \2 paragraph
#
# Shuffle elements around
s/<br[ ]?\/><\/a>/<\/a><br \/>/g
s/<br[ ]?\/><\/strong>/<\/strong><br \/>/g
s/<\/strong><br[ ]?\/>/<\/strong><\/p><p>/g
s/<strong><\/p><p>/<\/p><p><strong>/g
#s/\"\ \/>([A-Za-z0-9\u2013\u2014\u2018\u2019'"'"'"’]+)/" \/><\/p><p>\1/g
s/" \/>[ ]?([^ -~]?[A-Za-z0-9]+)/" \/><\/p><p>\1/
# p strong img /p p -> p img /p p strong, see "an alternative to laziness" http://www.regular-expressions.info/repeat.html
s/(<p>)<strong>(<img [^>]+><\/p><p>)/\1\2<strong>/g
s/<hr id="system-readmore" \/><\/p>/<\/p><hr id="system-readmore" \/>/g
s/<a ([^>]+)><br[ ]?\/>/<br \/><a \1>/g
s/(<p><a [^>]+><img [^>]+><\/a>)([A-Za-z0-9])/\1<\/p><p>\2/g
#
# all the mouseover images
# Each rule keeps the onmouseover image as src, keeps the alt text and forces
# width="575" height="auto"; the five rules differ only in attribute order.
# In the bracket sets the slash is written \/ so the program also parses on
# seds that end the pattern at the first unescaped slash, even inside brackets.
# The hyphen sits last in each set so it is matched literally; the old \-\.
# spelling was a backslash-to-backslash range that left hyphens out.
s/ onmouseover="this.src='"'"'([a-zA-Z0-9._\/-]+\.[efgijnp]+)'"'"';" onmouseout="([a-zA-Z0-9._\/"='"'"';-]+)" alt="([0-9a-zA-Z _.-]+)" src="([a-zA-Z0-9._\/-]+\.[efgijnp]+)"/ src="\1" alt="\3" width="575" height="auto"/g
s/ src="([a-zA-Z0-9._\/-]+\.[efgijnp]+)" alt="([0-9a-zA-Z _.-]+)" onmouseout="([a-zA-Z0-9._\/"='"'"';-]+)" onmouseover="this.src='"'"'([a-zA-Z0-9._\/-]+\.[efgijnp]+)'"'"';"/ src="\4" alt="\2" width="575" height="auto"/g
s/ onmouseover="this.src='"'"'([a-zA-Z0-9._\/-]+\.[efgijnp]+)'"'"';" onmouseout="([a-zA-Z0-9._\/"='"'"';-]+)" src="([a-zA-Z0-9._\/-]+\.[efgijnp]+)" alt="([0-9a-zA-Z _.-]+)"/ src="\1" alt="\4" width="575" height="auto"/g
s/ onmouseover="this.src='"'"'([a-zA-Z0-9._\/-]+\.[efgijnp]+)'"'"';" onmouseout="([a-zA-Z0-9._\/"='"'"';-]+)" alt="([0-9a-zA-Z _.-]+)" height=" " src="([a-zA-Z0-9._\/-]+\.[efgijnp]+)"/ src="\1" alt="\3" width="575" height="auto"/g
s/ onmouseover="this.src='"'"'([a-zA-Z0-9._\/-]+\.[efgijnp]+)'"'"';" onmouseout="([a-zA-Z0-9._\/"='"'"';-]+)" alt="([0-9a-zA-Z _.-]+)" height="[a-zA-Z0-9\ ]+" width="[a-zA-Z0-9\ ]+" src="([a-zA-Z0-9._\/-]+\.[efgijnp]+)"/ src="\1" alt="\3" width="575" height="auto"/g
s/<img style="[a-zA-Z0-9:;#'"'"'\.\ \-]+" /<img /g
#
# fix image and iframe widths
s/width="5.." height="..."/width="100%" height="auto"/g
s/height="..." width="5.."/width="100%" height="auto"/g
s/width="385" height="([0-9auto]+)"/width="100%" height="auto"/g
s/height="([0-9auto]+)" width="385"/width="100%" height="auto"/g
s/<iframe width="100%" height="auto"/<iframe width="100%" height="323"/
#
# fix images that are inside <p>
s/<p><img ([^>]+)>([^<]+)/<p><img \1><\/p><p>\2/g
# and again, with strong and em!
s/<p><strong><img ([^>]+)>([^<]+)/<p><img \1><\/p><p><strong>\2/g
s/<p><em><img ([^>]+)>([^<]+)/<p><img \1><\/p><p><em>\2/g
# and now no more strong images! (when they are in their own p)
s/<p><strong><img( [^>]+)><\/strong><\/p>/<p><img\1><\/p>/g
# and now no more emphatic images! (when they are in their own p)
s/<p><em><img( [^>]+)><\/em><\/p>/<p><img\1><\/p>/g
#
# change alt text when it fits a certain pattern
# to give the name of the person
s/alt="00 ([a-z]+) ([a-z]+)( [a-z]+)?"/alt="\2 \1"/g
s/alt="00_([a-z]+)_([a-z]+)(_[a-z]+)?"/alt="\2 \1"/g
# To give table or figure titles
s/alt="([a-zA-Z0-9_\ ]+)(tb[_\ ]?)([0-9])"/alt="Table \3: \1\2\3"/g
s/alt="([a-zA-Z0-9_\ ]+)(tbl[_\ ]?)([0-9])"/alt="Table \3: \1\2\3"/g
s/alt="([a-zA-Z0-9_\ ]+)(fg[_\ ]?)([0-9])"/alt="Figure \3: \1\2\3"/g
#
# Re-add mugshot image style
s/ src="images\/stories\/mugs\// style="margin-right: 10px; float: left;" src="images\/stories\/mugs\//g
#
# Fix a certain broken end_mark.jpg link
s/%7Eprobeef\/images\/stories\/badges\/end_mark.jpg/images\/stories\/badges\/end_mark.jpg/g
# And the other one
s/file:\/\/\/Users\/judyhall\/Desktop\/[^\/]+\/Beef_head_opt.jpeg/images\/stories\/badges\/end_mark.jpg/
s/alt="Beef_head.ai"/alt="end mark"/
#
# Make H2 tags exist (partially undone in case of photos)
# Currently does not work with ’ char inserted, but should be taken care of with unicode
s/<p><strong>([^<]+)<\/strong>[:]?<br \/>/<h2>\1<\/h2><p>/g
s/<p><strong>([^<]+)<br \/><\/strong>/<h2>\1<\/h2><p>/g
s/<p><strong>([^<]+)<\/strong><br \/>/<h2>\1<\/h2><p>/g
s/<p><strong>([^<]+)<\/strong><\/p>/<h2>\1<\/h2>/g
#
# Fix h2 in mugshots
s/(<p><img[^>]+><\/p>)<h2>([^<]+)<\/h2><p>([^<]+<br \/>)/\1<p><strong>\2<\/strong><br \/>\3/g
#
# grammar teachers hate this one simple trick
s/\.  /\. /g
s/  / /g
s/p><em>-- /p><em>\&mdash;/g
#
# Standards:
# callto: is deprecated. See https://tools.ietf.org/html/rfc3966#section-7.3
s/href="callto:/href="tel:/g
s/href="call to[ ]?/href="tel:/g
#
# cleanup of photo headlines
s/([A-Z\ ]+) PHOTO(&nbsp;)?/\1/g
s/([A-Z\ ]+) RIGHT(&nbsp;)?:/\1:/g
s/([A-Za-z\ ]+) Right(&nbsp;)?:/\1:/g
s/([A-Za-z\ ]+) right(&nbsp;)?:/\1:/g
s/([A-Z]+) RIGHT ([A-Z]+)(&nbsp;)?:/\1 \2:/g
s/([A-Za-z]+) Right ([A-Za-z]+)(&nbsp;)?:/\1 \2:/g
s/([A-Za-z]+) right ([A-Za-z]+)(&nbsp;)?:/\1 \2:/g
s/([A-Z\ ]+) LEFT(&nbsp;)?:/\1:/g
s/([A-Za-z\ ]+) Left(&nbsp;)?:/\1:/g
s/([A-Z]+) MIDDLE(&nbsp;)?\:/MIDDLE \1\:/g
s/PHTOT /PHOTO /g
s/<h2>PHOTO([S]?)(&nbsp;)?[:]?[ ]?(&nbsp;)?<\/h2>/<p><strong>PHOTO\1<\/strong><\/p>/g
s/<h2>Photo(&nbsp;)?[:]?[ ]?<\/h2>/<p><strong>PHOTO<\/strong><\/p>/g
s/<h2>Photos(&nbsp;)?[:]?[ ]?<\/h2>/<p><strong>PHOTOS<\/strong><\/p>/g
#
# Re-adding whitespace and newlines to the story
# (the line after each trailing backslash must start in column 0: anything
# before it would be copied into the output)
s/<br \/>/<br \/>\
/g
s/<\/([a-zA-Z][A-Za-z0-9]?)><([a-zA-Z][A-Za-z0-9]?)/<\/\1>\
<\2/g
s/ \/><([a-zA-Z][A-Za-z0-9]?)/ \/>\
<\1/g
s/<([ou]l)><li>/<\1>\
<li>/g
s/<\/li><\/([ou]l)>/<\/li>\
<\/\1>/g
s/><p/>\
<p/g
s/><\/div>/>\
<\/div>/g
s/><div/>\
<div/g
'
}
cleanup_html < .tmp.txt > out.txt

# Post-run report: everything from here on inspects out.txt and prints a
# WARN or ERR line for markup that still needs attention by hand.

# div.mug shall stay. Count every "<div" and every '<div class="mug"' in
# out.txt; if the two totals differ, some other kind of div is still present.
all_divs=$(grep -o "<div" out.txt | wc -l)
mug_divs=$(grep -o '<div class="mug"' out.txt | wc -l)
if [ "$all_divs" -ne "$mug_divs" ]; then
  printf '\e[1;33mWARN\e[0m: This file has divs that are not div class="mug".\n'
fi

# warn_about_lists: report list markup, real or typed by hand, that is still
# present in out.txt. Prints zero or more WARN lines to stdout.
warn_about_lists() {
  local li_count rmug_count

  # This works, even though it sucks?
  # Warn when the number of "<li" occurrences differs from the number of
  # '<div class="rmug"' occurrences.
  li_count=$(grep -o "<li" out.txt | wc -l)
  rmug_count=$(grep -o '<div class="rmug"' out.txt | wc -l)
  if [ "$li_count" -ne "$rmug_count" ]; then
    printf '\e[1;33mWARN\e[0m: This file has lists.\n'
  fi

  # Hand-typed bullets. -F treats each pattern as a literal string, so '*' is
  # a plain asterisk. The former '\u2022' / '\u25A0' patterns are gone: grep
  # does not expand \u escapes, and the literal bullet and square characters
  # already cover those code points.
  if grep -q -F -e '•' -e '■' -e '*' out.txt; then
    printf '\e[1;33mWARN\e[0m: This file has manual bullets.\n'
  fi

  # Hand-typed numbering. -F keeps the dot literal; as a regex ">1." also
  # matched text such as ">10" and raised false warnings.
  if grep -q -F '>1.' out.txt; then
    printf '\e[1;33mWARN\e[0m: This file has manual ordered lists.\n'
  fi
}
warn_about_lists

# <table>s shall not pass: any "<table" left in out.txt is an error.
if grep -q '<table' out.txt; then
  printf '\e[0;31mERR\e[0m:  This file has tables.\n'
fi

# file:/// URLs shall not pass either. The second line of the message prints
# the path images/stories/badges/end_mark.jpg as a hint.
if grep -q 'file:///' out.txt; then
  printf '\e[0;31mERR\e[0m:  This file links to files on a desktop computer.\n'
  printf '      images/stories/badges/end_mark.jpg if you need it.\n'
fi

# Images still wrapped in <em>/<strong>. This section is kinda messy: each
# if/elif pair reports the doubly-wrapped form when it is present and only
# otherwise the singly-wrapped one.
if grep -q '<strong><em><img' out.txt; then
  printf '\e[1;33mWARN\e[0m: This file has strong, emphatic images.\n'
elif grep -q '<em><img' out.txt; then
  printf '\e[1;33mWARN\e[0m: This file has emphatic images.\n'
fi
if grep -q '<em><strong><img' out.txt; then
  printf '\e[1;33mWARN\e[0m: This file has emphatic, strong images.\n'
elif grep -q '<strong><img' out.txt; then
  printf '\e[1;33mWARN\e[0m: This file has strong images.\n'
fi

# Directly nested <strong> tags. I don't know why this happened, but it happens.
if grep -q '<strong><strong>' out.txt; then
  printf '\e[1;33mWARN\e[0m: This file has nested <strong> tags.\n'
fi

# Mugshot heuristic. A mug is not always filed in the mugs folder (earlier
# ones were stored with the story photos), so besides the /mugs/ path this
# also guesses from the common narrow mugshot widths 84 and 80.
# One warning is printed when any of the three patterns occurs in out.txt.
if grep -q -e '/mugs/' -e 'width="84"' -e 'width="80"' out.txt; then
  printf '\e[1;33mWARN\e[0m: This file may have mugshots. Check for missing style="" statements.\n'
fi

# Leftover style="" attributes.
# Switched to a more conservative style detection and removal rule, to prevent
# it from eating lots of stuff. The detection is problematic, but fails safe
# and sends the warning anyways: the error is raised when the number of
# style=" occurrences in out.txt equals neither the number of width="8
# occurrences in out.txt nor the number of /mugs/ occurrences in in.txt.
style_total=$(grep -o 'style="' out.txt | wc -l)
if [ "$style_total" -ne "$(grep -o 'width="8' out.txt | wc -l)" ] &&
  [ "$style_total" -ne "$(grep -o '/mugs/' in.txt | wc -l)" ]; then
  printf '\e[0;31mERR\e[0m:  This file'"'"'s style="" directives were not removed.\n'
fi

# The cleanup must not gain or lose images: compare "<img" totals of
# out.txt and in.txt.
imgs_out=$(grep -o "<img" out.txt | wc -l)
imgs_in=$(grep -o '<img' in.txt | wc -l)
if [ "$imgs_out" -ne "$imgs_in" ]; then
  printf '\e[0;31mERR\e[0m:  Number of images going in and out does not match.\n'
fi

# Warn when out.txt contains no "system-readmore" marker at all.
if ! grep -q 'system-readmore' out.txt; then
  printf '\e[1;33mWARN\e[0m: Did this file have a <hr id="system-readmore" /> going in?.\n'
fi

# Mouseover handlers are sometimes not removed, if the tag is structured
# differently than the sed rules assume.
if grep -q 'onmouse' out.txt; then
  printf '\e[0;31mERR\e[0m:  This file'"'"'s onmouseover images were not converted.\n'
fi

# Any "<span cl" still present means a classed span survived.
if grep -q '<span cl' out.txt; then
  printf '\e[1;33mWARN\e[0m: Some <span> tags were not removed.\n'
fi
	#!/bin/bash
	#
	# Usage:
	# cleanup.sh
	# This shell script uses sed and regular expressions to clean up a lot of HTML in posts
	# This script creates a .tmp.txt that will be often overwritten.
	#
	# Revision 27

	# Bail out early (exit status 1) unless the required input file, in.txt,
	# exists as a regular file in the current directory.
	[ -f "in.txt" ] || {
		printf '\e[0;31mERR\e[0m: "in.txt" does not exist \n'
		printf '\e[0;31mERR\e[0m: Please save your HTML in in.txt \n'
		exit 1
	}

	# Remove all the newlines, to make cleanup easier: the sed rules that
	# follow work on a single line.
	#
	# strip_control_chars: filter (stdin -> stdout) that deletes backspace,
	# form feed, newline, carriage return, tab, vertical tab and bell.
	# The set is deliberately NOT wrapped in [ ]: tr treats brackets in a set
	# as ordinary characters, so the previous '[...]' spelling also deleted
	# every literal "[" and "]" from the HTML.
	strip_control_chars() {
		tr -d '\b\f\n\r\t\v\a'
	}
	strip_control_chars < in.txt > .tmp.txt

	# Run every cleanup rule in one sed pass over the flattened HTML in
	# .tmp.txt and write the result to out.txt. Rules are applied top to
	# bottom and later rules depend on what earlier ones produced.
	#
	# This copy of the program had been damaged by HTML rendering: the
	# &nbsp; and &mdash; entities inside the rules were decoded, runs of two
	# spaces were collapsed to one, and a tab was placed in front of the
	# continuation lines of the newline rules (sed copied that tab into the
	# output). The rules below are restored to match the intact copy of the
	# script earlier in this file.
	#
	# Comments for this section are inline in the sed program.
	# Newer versions of sed should be okay with the commented lines.
	# In the event your version of sed does not like this, remove all lines
	# inside the program that start with a #.
	# Do not put an apostrophe in a sed comment: it would end the shell quoting.
	#
	# cleanup_html: filter; reads flattened HTML on stdin, writes cleaned
	# HTML to stdout.
	cleanup_html() {
		sed -Ee '
	# replace non-breaking spaces with spaces
	# NOTE(review): the first blank in the pattern below is meant to be a literal
	# U+00A0 no-break space - confirm it survived copy and paste
	s/ / /g
	#
	# Get rid of other whitespace
	# ([[:space:]] instead of [ \s]: inside brackets \s means backslash or the
	# letter s, which deleted a lone "s" sitting between two tags)
	s/>[[:space:]]</></g
	#
	# get rid of classes in this blacklist
	s/ class="[Aa]uthor[A-Za-z0-9\ \-]?"//g
	s/ class="[Aa]uthor(-[Nn]ame)? para-style-override-[0-9]+"//g
	s/ class="[Bb]ody-([A-Za-z0-9\-]*)?( para-style-override-[0-9]+)?"//g
	s/ class="[Cc]aption"//g
	s/ class="[Cc]itation-style"//g
	s/ class="[Cc]aption para-style-override-[0-9]+"//g
	s/ class="[Cc]har-style-override-[0-9]+"//g
	s/ class="[Hh]eadline"//g
	s/ class="[Ss]idebar-body-text"//g
	s/ class="[Ss]tory"//g
	# also get rid of all p class=""
	s/p class="[^"]+"/p/g
	#
	# get rid of alignments
	s/ align="[A-Za-z]+"//g
	#
	# get rid of the spans
	s/<span class="[a-zA-Z0-9\ \-]+">//g
	s/<span class="Apple-style-span">//g
	s/<span style="[^"]+"[ ]?>//g
	s/<span[ ]?>//g
	s/<\/span>//g
	#
	# no more styles
	s/ style="[^"]+"//g
	#
	# remove some more nbsp chars
	s/&nbsp;&nbsp;/\&nbsp;/g
	s/&nbsp;<\/p>/<\/p>/g
	s/&nbsp;<br \/>/<br \/>/g
	s/(&nbsp;( )?){3,}( )?//g
	s/<p>(&nbsp;)+/<p>/g
	s/<p><em>(&nbsp;)+<\/em><\/p>//g
	s/<p><([a-zA-Z][a-zA-Z0-9]*)>(&nbsp;)+<\/\1>/<p>/g
	s/<div( class="[a-z]+")?>(&nbsp;)+<\/div>//g
	s/<br \/>(&nbsp;)?<br \/><\/li>/<\/li>/g
	s/<br \/>(&nbsp;)?( )?<br \/>/<\/p><p>/g
	#
	# Remove some superfluous elements
	s/<br data-mce-bogus="1" \/>//g
	s/<p><\/p>//g
	s/<p> <\/p>//g
	s/<p> <\/p>//g
	s/<p><br \/>/<p>/g
	s/®//g
	s/<em><br \/><\/em>/<br \/>/g
	s/<em><\/em>//g
	s/<\/em><em>//g
	s/<em>[ ]?<\/em>//g
	s/<strong><\/strong>//g
	s/<\/strong><strong>//g
	s/<em><strong><br[ ]?\/><\/strong><\/em>//g
	s/<strong><br \/><\/strong>//g
	# (&nbsp;)? matches the whole entity, not just one character of it
	s/<div class="[^m][^>]+>[ ]?(&nbsp;)?<\/div>//g
	s/<div><\/div>//g
	s/<div>&nbsp;<\/div>//g
	s/<p><strong>&nbsp;<\/strong><\/p>//g
	s/<sup><\/sup>//g
	s/<p><strong><strong>(<img[^>]+>)<\/strong>([A-Za-z0-9])/<p>\1<\/p><p><strong>\2/g
	#
	# Remove mug tables? Does not work.
	# http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/
	# s/<table[^>]+><tbody><tr><td[^>]+>(<img [^>]+>)<\/td><td>&nbsp;<\/td><\/tr><tr><td>(<p[a-zA-Z0-9_\/<>"\:\@\.\;\&\ ]+)<\/td><\/tr><\/tbody><\/table>/<p>\1<\/p>\2/g
	# \1 image
	# \2 paragraph
	#
	# Shuffle elements around
	s/<br[ ]?\/><\/a>/<\/a><br \/>/g
	s/<br[ ]?\/><\/strong>/<\/strong><br \/>/g
	s/<\/strong><br[ ]?\/>/<\/strong><\/p><p>/g
	s/<strong><\/p><p>/<\/p><p><strong>/g
	#s/\"\ \/>([A-Za-z0-9\u2013\u2014\u2018\u2019'"'"'"’]+)/" \/><\/p><p>\1/g
	s/" \/>[ ]?([^ -~]?[A-Za-z0-9]+)/" \/><\/p><p>\1/
	# p strong img /p p -> p img /p p strong, see "an alternative to laziness" http://www.regular-expressions.info/repeat.html
	s/(<p>)<strong>(<img [^>]+><\/p><p>)/\1\2<strong>/g
	s/<hr id="system-readmore" \/><\/p>/<\/p><hr id="system-readmore" \/>/g
	s/<a ([^>]+)><br[ ]?\/>/<br \/><a \1>/g
	s/(<p><a [^>]+><img [^>]+><\/a>)([A-Za-z0-9])/\1<\/p><p>\2/g
	#
	# all the mouseover images
	# Each rule keeps the onmouseover image as src, keeps the alt text and forces
	# width="575" height="auto"; the five rules differ only in attribute order.
	# In the bracket sets the slash is written \/ so the program also parses on
	# seds that end the pattern at the first unescaped slash, even inside brackets.
	# The hyphen sits last in each set so it is matched literally; the old \-\.
	# spelling was a backslash-to-backslash range that left hyphens out.
	s/ onmouseover="this.src='"'"'([a-zA-Z0-9._\/-]+\.[efgijnp]+)'"'"';" onmouseout="([a-zA-Z0-9._\/"='"'"';-]+)" alt="([0-9a-zA-Z _.-]+)" src="([a-zA-Z0-9._\/-]+\.[efgijnp]+)"/ src="\1" alt="\3" width="575" height="auto"/g
	s/ src="([a-zA-Z0-9._\/-]+\.[efgijnp]+)" alt="([0-9a-zA-Z _.-]+)" onmouseout="([a-zA-Z0-9._\/"='"'"';-]+)" onmouseover="this.src='"'"'([a-zA-Z0-9._\/-]+\.[efgijnp]+)'"'"';"/ src="\4" alt="\2" width="575" height="auto"/g
	s/ onmouseover="this.src='"'"'([a-zA-Z0-9._\/-]+\.[efgijnp]+)'"'"';" onmouseout="([a-zA-Z0-9._\/"='"'"';-]+)" src="([a-zA-Z0-9._\/-]+\.[efgijnp]+)" alt="([0-9a-zA-Z _.-]+)"/ src="\1" alt="\4" width="575" height="auto"/g
	s/ onmouseover="this.src='"'"'([a-zA-Z0-9._\/-]+\.[efgijnp]+)'"'"';" onmouseout="([a-zA-Z0-9._\/"='"'"';-]+)" alt="([0-9a-zA-Z _.-]+)" height=" " src="([a-zA-Z0-9._\/-]+\.[efgijnp]+)"/ src="\1" alt="\3" width="575" height="auto"/g
	s/ onmouseover="this.src='"'"'([a-zA-Z0-9._\/-]+\.[efgijnp]+)'"'"';" onmouseout="([a-zA-Z0-9._\/"='"'"';-]+)" alt="([0-9a-zA-Z _.-]+)" height="[a-zA-Z0-9\ ]+" width="[a-zA-Z0-9\ ]+" src="([a-zA-Z0-9._\/-]+\.[efgijnp]+)"/ src="\1" alt="\3" width="575" height="auto"/g
	s/<img style="[a-zA-Z0-9:;#'"'"'\.\ \-]+" /<img /g
	#
	# fix image and iframe widths
	s/width="5.." height="..."/width="100%" height="auto"/g
	s/height="..." width="5.."/width="100%" height="auto"/g
	s/width="385" height="([0-9auto]+)"/width="100%" height="auto"/g
	s/height="([0-9auto]+)" width="385"/width="100%" height="auto"/g
	s/<iframe width="100%" height="auto"/<iframe width="100%" height="323"/
	#
	# fix images that are inside <p>
	s/<p><img ([^>]+)>([^<]+)/<p><img \1><\/p><p>\2/g
	# and again, with strong and em!
	s/<p><strong><img ([^>]+)>([^<]+)/<p><img \1><\/p><p><strong>\2/g
	s/<p><em><img ([^>]+)>([^<]+)/<p><img \1><\/p><p><em>\2/g
	# and now no more strong images! (when they are in their own p)
	s/<p><strong><img( [^>]+)><\/strong><\/p>/<p><img\1><\/p>/g
	# and now no more emphatic images! (when they are in their own p)
	s/<p><em><img( [^>]+)><\/em><\/p>/<p><img\1><\/p>/g
	#
	# change alt text when it fits a certain pattern
	# to give the name of the person
	s/alt="00 ([a-z]+) ([a-z]+)( [a-z]+)?"/alt="\2 \1"/g
	s/alt="00_([a-z]+)_([a-z]+)(_[a-z]+)?"/alt="\2 \1"/g
	# To give table or figure titles
	s/alt="([a-zA-Z0-9_\ ]+)(tb[_\ ]?)([0-9])"/alt="Table \3: \1\2\3"/g
	s/alt="([a-zA-Z0-9_\ ]+)(tbl[_\ ]?)([0-9])"/alt="Table \3: \1\2\3"/g
	s/alt="([a-zA-Z0-9_\ ]+)(fg[_\ ]?)([0-9])"/alt="Figure \3: \1\2\3"/g
	#
	# Re-add mugshot image style
	s/ src="images\/stories\/mugs\// style="margin-right: 10px; float: left;" src="images\/stories\/mugs\//g
	#
	# Fix a certain broken end_mark.jpg link
	s/%7Eprobeef\/images\/stories\/badges\/end_mark.jpg/images\/stories\/badges\/end_mark.jpg/g
	# And the other one
	s/file:\/\/\/Users\/judyhall\/Desktop\/[^\/]+\/Beef_head_opt.jpeg/images\/stories\/badges\/end_mark.jpg/
	s/alt="Beef_head.ai"/alt="end mark"/
	#
	# Make H2 tags exist (partially undone in case of photos)
	# Currently does not work with ’ char inserted, but should be taken care of with unicode
	s/<p><strong>([^<]+)<\/strong>[:]?<br \/>/<h2>\1<\/h2><p>/g
	s/<p><strong>([^<]+)<br \/><\/strong>/<h2>\1<\/h2><p>/g
	s/<p><strong>([^<]+)<\/strong><br \/>/<h2>\1<\/h2><p>/g
	s/<p><strong>([^<]+)<\/strong><\/p>/<h2>\1<\/h2>/g
	#
	# Fix h2 in mugshots
	s/(<p><img[^>]+><\/p>)<h2>([^<]+)<\/h2><p>([^<]+<br \/>)/\1<p><strong>\2<\/strong><br \/>\3/g
	#
	# grammar teachers hate this one simple trick
	s/\.  /\. /g
	s/  / /g
	s/p><em>-- /p><em>\&mdash;/g
	#
	# Standards:
	# callto: is deprecated. See https://tools.ietf.org/html/rfc3966#section-7.3
	s/href="callto:/href="tel:/g
	s/href="call to[ ]?/href="tel:/g
	#
	# cleanup of photo headlines
	s/([A-Z\ ]+) PHOTO(&nbsp;)?/\1/g
	s/([A-Z\ ]+) RIGHT(&nbsp;)?:/\1:/g
	s/([A-Za-z\ ]+) Right(&nbsp;)?:/\1:/g
	s/([A-Za-z\ ]+) right(&nbsp;)?:/\1:/g
	s/([A-Z]+) RIGHT ([A-Z]+)(&nbsp;)?:/\1 \2:/g
	s/([A-Za-z]+) Right ([A-Za-z]+)(&nbsp;)?:/\1 \2:/g
	s/([A-Za-z]+) right ([A-Za-z]+)(&nbsp;)?:/\1 \2:/g
	s/([A-Z\ ]+) LEFT(&nbsp;)?:/\1:/g
	s/([A-Za-z\ ]+) Left(&nbsp;)?:/\1:/g
	s/([A-Z]+) MIDDLE(&nbsp;)?\:/MIDDLE \1\:/g
	s/PHTOT /PHOTO /g
	s/<h2>PHOTO([S]?)(&nbsp;)?[:]?[ ]?(&nbsp;)?<\/h2>/<p><strong>PHOTO\1<\/strong><\/p>/g
	s/<h2>Photo(&nbsp;)?[:]?[ ]?<\/h2>/<p><strong>PHOTO<\/strong><\/p>/g
	s/<h2>Photos(&nbsp;)?[:]?[ ]?<\/h2>/<p><strong>PHOTOS<\/strong><\/p>/g
	#
	# Re-adding whitespace and newlines to the story
	# (the line after each trailing backslash must start in column 0: anything
	# before it would be copied into the output)
	s/<br \/>/<br \/>\
/g
	s/<\/([a-zA-Z][A-Za-z0-9]?)><([a-zA-Z][A-Za-z0-9]?)/<\/\1>\
<\2/g
	s/ \/><([a-zA-Z][A-Za-z0-9]?)/ \/>\
<\1/g
	s/<([ou]l)><li>/<\1>\
<li>/g
	s/<\/li><\/([ou]l)>/<\/li>\
<\/\1>/g
	s/><p/>\
<p/g
	s/><\/div>/>\
<\/div>/g
	s/><div/>\
<div/g
	'
	}
	cleanup_html < .tmp.txt > out.txt

	# The following section helps in knowing what still needs to be changed.
	#
	# report_divs_and_lists: inspect out.txt for unexpected divs and for list
	# markup, real or typed by hand. Prints zero or more WARN lines to stdout.
	# The pipes into wc are real pipes here; written as \| the bar, "wc" and
	# "-l" were handed to grep as arguments and no count was ever produced.
	report_divs_and_lists() {
		local all_divs mug_divs li_count rmug_count

		# div.mug shall stay. Many divs can be easily made into div.mugs, but
		# there are the weird divs out there: warn when the number of "<div"
		# occurrences differs from the number of '<div class="mug"' occurrences.
		all_divs=$(grep -o "<div" out.txt | wc -l)
		mug_divs=$(grep -o '<div class="mug"' out.txt | wc -l)
		if [ "$all_divs" -ne "$mug_divs" ]; then
			printf '\e[1;33mWARN\e[0m: This file has divs that are not div class="mug".\n'
		fi

		# This works, even though it sucks?
		# Warn when the number of "<li" occurrences differs from the number of
		# '<div class="rmug"' occurrences.
		li_count=$(grep -o "<li" out.txt | wc -l)
		rmug_count=$(grep -o '<div class="rmug"' out.txt | wc -l)
		if [ "$li_count" -ne "$rmug_count" ]; then
			printf '\e[1;33mWARN\e[0m: This file has lists.\n'
		fi

		# Hand-typed bullets. -F treats each pattern as a literal string, so
		# '*' is a plain asterisk. The former '\u2022' / '\u25A0' patterns are
		# gone: grep does not expand \u escapes, and the literal characters
		# already cover those code points.
		if grep -q -F -e '•' -e '■' -e '*' out.txt; then
			printf '\e[1;33mWARN\e[0m: This file has manual bullets.\n'
		fi

		# Hand-typed numbering. -F keeps the dot literal; as a regex ">1."
		# also matched text such as ">10".
		if grep -q -F '>1.' out.txt; then
			printf '\e[1;33mWARN\e[0m: This file has manual ordered lists.\n'
		fi
	}
	report_divs_and_lists

	# <table>s shall not pass: any "<table" left in out.txt is an error.
	if grep -q '<table' out.txt; then
		printf '\e[0;31mERR\e[0m: This file has tables.\n'
	fi

	# file:/// URLs shall not pass either. The second line of the message
	# prints the path images/stories/badges/end_mark.jpg as a hint.
	if grep -q 'file:///' out.txt; then
		printf '\e[0;31mERR\e[0m: This file links to files on a desktop computer.\n'
		printf ' images/stories/badges/end_mark.jpg if you need it.\n'
	fi

	# Images still wrapped in <em>/<strong>. This section is kinda messy: each
	# if/elif pair reports the doubly-wrapped form when it is present and only
	# otherwise the singly-wrapped one.
	if grep -q '<strong><em><img' out.txt; then
		printf '\e[1;33mWARN\e[0m: This file has strong, emphatic images.\n'
	elif grep -q '<em><img' out.txt; then
		printf '\e[1;33mWARN\e[0m: This file has emphatic images.\n'
	fi
	if grep -q '<em><strong><img' out.txt; then
		printf '\e[1;33mWARN\e[0m: This file has emphatic, strong images.\n'
	elif grep -q '<strong><img' out.txt; then
		printf '\e[1;33mWARN\e[0m: This file has strong images.\n'
	fi

	# Directly nested <strong> tags. I don't know why this happened, but it happens.
	if grep -q '<strong><strong>' out.txt; then
		printf '\e[1;33mWARN\e[0m: This file has nested <strong> tags.\n'
	fi

	# Mugshot heuristic. A mug is not always filed in the mugs folder (earlier
	# ones were stored with the story photos), so besides the /mugs/ path this
	# also guesses from the common narrow mugshot widths 84 and 80.
	# One warning is printed when any of the three patterns occurs in out.txt.
	if grep -q -e '/mugs/' -e 'width="84"' -e 'width="80"' out.txt; then
		printf '\e[1;33mWARN\e[0m: This file may have mugshots. Check for missing style="" statements.\n'
	fi

	# report_conversion_problems: compare out.txt with in.txt and flag markup
	# the sed pass should have handled. Prints zero or more ERR/WARN lines to
	# stdout.
	# The pipes into wc are real pipes here; written as \| the bar, "wc" and
	# "-l" were handed to grep as arguments, so none of the count comparisons
	# could work.
	report_conversion_problems() {
		local style_total narrow_total mug_total imgs_out imgs_in

		# Switched to a more conservative style detection and removal rule, to
		# prevent it from eating lots of stuff. The detection here is
		# problematic, but fails safe and sends the warning anyways: the error
		# is raised when the number of style=" occurrences in out.txt equals
		# neither the number of width="8 occurrences in out.txt nor the number
		# of /mugs/ occurrences in in.txt.
		style_total=$(grep -o 'style="' out.txt | wc -l)
		narrow_total=$(grep -o 'width="8' out.txt | wc -l)
		mug_total=$(grep -o '/mugs/' in.txt | wc -l)
		if [ "$style_total" -ne "$narrow_total" ] &&
			[ "$style_total" -ne "$mug_total" ]; then
			printf '\e[0;31mERR\e[0m: This file'"'"'s style="" directives were not removed.\n'
		fi

		# The cleanup must not gain or lose images: compare "<img" totals of
		# out.txt and in.txt.
		imgs_out=$(grep -o "<img" out.txt | wc -l)
		imgs_in=$(grep -o '<img' in.txt | wc -l)
		if [ "$imgs_out" -ne "$imgs_in" ]; then
			printf '\e[0;31mERR\e[0m: Number of images going in and out does not match.\n'
		fi

		# Warn when out.txt contains no "system-readmore" marker at all.
		if ! grep -q 'system-readmore' out.txt; then
			printf '\e[1;33mWARN\e[0m: Did this file have a <hr id="system-readmore" /> going in?.\n'
		fi

		# Because sometimes they aren't removed, if the tag is structured
		# differently than my assumption.
		if grep -q 'onmouse' out.txt; then
			printf '\e[0;31mERR\e[0m: This file'"'"'s onmouseover images were not converted.\n'
		fi

		# Any "<span cl" still present means a classed span survived.
		if grep -q '<span cl' out.txt; then
			printf '\e[1;33mWARN\e[0m: Some <span> tags were not removed.\n'
		fi
	}
	report_conversion_problems