Skip to content

Instantly share code, notes, and snippets.

@plioi
Created July 11, 2013 00:32
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save plioi/5971498 to your computer and use it in GitHub Desktop.
Save plioi/5971498 to your computer and use it in GitHub Desktop.
WordPress's automatic <p> tag insertion meat grinder, ported to C#.
/// <summary>
/// Port of WordPress's automatic paragraph insertion: wraps blank-line-separated
/// chunks of <paramref name="body"/> in &lt;p&gt;..&lt;/p&gt;, turns remaining single
/// newlines into &lt;br /&gt; tags, and then removes the paragraph/br markup that
/// ended up adjacent to block-level tags.
/// </summary>
/// <param name="body">Raw markup/text. Null or all-whitespace input yields "".</param>
/// <returns>The input with paragraph and line-break tags inserted.</returns>
static string linebreaks_wp(string body)
{
    // Null is treated like blank input instead of throwing on body.Trim().
    if (string.IsNullOrWhiteSpace(body))
    {
        return "";
    }

    // Ensure all newlines are simply \n and that we end with a \n.
    body = body.Replace("\r\n", "\n")
               .Replace("\r", "\n");
    body = body + "\n";

    // Convert br-pairs into \n\n.
    body = Regex.Replace(body, @"<br />\s*<br />", "\n\n");

    const string allblocks = @"(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|map|area|blockquote|address|math|style|input|p|h[1-6]|hr|fieldset|legend|section|article|aside|hgroup|header|footer|nav|figure|figcaption|details|menu|summary)";

    // Ensure that all 'block' open tags appear at the start of a line.
    body = Regex.Replace(body, @"(<" + allblocks + "[^>]*>)", "\n$1");

    // Ensure that all '/block' close tags are followed by a blank line.
    body = Regex.Replace(body, @"(</" + allblocks + ">)", "$1\n\n");

    if (body.Contains("<object"))
    {
        // No paragraph breaks inside object/embed: glue <param> and </embed> to their neighbours.
        body = Regex.Replace(body, @"\s*<param([^>]*)>\s*", "<param$1>");
        body = Regex.Replace(body, @"\s*</embed>\s*", "</embed>");
    }

    // Shrink long \n\n\n\n\n chains down to a single blank line, \n\n.
    body = Regex.Replace(body, @"\n\n+", "\n\n");

    // Split the whole body by blank lines. Regex.Split keeps empty strings when a
    // separator sits at the very start or end of the input; drop them (the
    // equivalent of PHP's PREG_SPLIT_NO_EMPTY) so they do not turn into stray
    // newlines in the output.
    var chunks = Regex.Split(body, @"\n\s*\n")
                      .Where(chunk => chunk.Length > 0);

    // Optimistically surround each chunk in a <p>..</p>.
    body = string.Join("", chunks.Select(chunk => "<p>" + chunk.Trim('\n') + "</p>\n"));

    // Clean away all-whitespace paragraphs.
    body = Regex.Replace(body, @"<p>\s*</p>", "");

    // For a <p> that precedes [no tags] followed by a closing div/address/form tag,
    // close the paragraph before closing the div/address/form.
    body = Regex.Replace(body, @"<p>([^<]+)</(div|address|form)>", "<p>$1</p></$2>");

    // Remove the optimistic <p>..</p> around block tags, like <p><h2>text</h2></p> => <h2>text</h2>.
    body = Regex.Replace(body, @"<p>\s*(</?" + allblocks + @"[^>]*>)\s*</p>", "$1");

    // Workaround for nested lists: unwrap a paragraph that starts with an <li.
    body = Regex.Replace(body, @"<p>(<li.+?)</p>", "$1");

    // Optimistic <p>..</p> around a <blockquote /> should become <blockquote><p>..</p></blockquote>.
    body = Regex.Replace(body, @"<p><blockquote([^>]*)>", "<blockquote$1><p>", RegexOptions.IgnoreCase);
    body = body.Replace("</blockquote></p>", "</p></blockquote>");

    // Strip the optimistic <p> before, or </p> after, a bare block tag
    // (<p><block> or </block></p>).
    body = Regex.Replace(body, @"<p>\s*(</?" + allblocks + @"[^>]*>)", "$1");
    body = Regex.Replace(body, @"(</?" + allblocks + @"[^>]*>)\s*</p>", "$1");

    // Preserve \n found inside script and style elements by swapping each one for a
    // placeholder tag, so the br-insertion step below does not touch them.
    body = Regex.Replace(
        body,
        @"<(script|style).*?</\1>",
        m => m.Groups[0].Value.Replace("\n", "<WPPreserveNewline />"),
        RegexOptions.Singleline);

    // Convert [not-br-tag] [whitespace] [\n] into [br-tag] [\n];
    // in other words, introduce a br tag for any \n that doesn't already have one.
    body = Regex.Replace(body, @"(?<!<br />)\s*\n", "<br />\n");

    // Restore the newlines that were protected by the placeholder above.
    body = body.Replace("<WPPreserveNewline />", "\n");

    // Strip any whitespace-and-br-tag that followed a <block> opener or </block> closer.
    body = Regex.Replace(body, @"(</?" + allblocks + @"[^>]*>)\s*<br />", "$1");

    // Strip any br tag that precedes one of these special tags, since they already
    // have their own linebreak behavior.
    body = Regex.Replace(body, @"<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)", "$1");

    if (body.Contains("<pre"))
    {
        // (?is) is the .NET inline-option form of IgnoreCase + Singleline, so the
        // lazy (.*?) can span the newlines inside a <pre> element.
        body = Regex.Replace(body, "(?is)(<pre[^>]*>)(.*?)</pre>", clean_pre);
    }

    // Pull a final </p> up onto the preceding line.
    body = Regex.Replace(body, @"\n</p>$", "</p>");

    return body;
}
/// <summary>
/// Match evaluator that removes paragraph and line-break markup from matched text.
/// When capture groups 1 and 2 both succeeded, group 2 is cleaned, HTML-encoded and
/// re-wrapped between group 1 and a closing &lt;/pre&gt;; otherwise the whole match is
/// cleaned and returned without encoding.
/// </summary>
/// <param name="m">The regex match to rewrite.</param>
/// <returns>The replacement text for the match.</returns>
private static string clean_pre(Match m)
{
    var hasOpenerAndContent = m.Groups[1].Success && m.Groups[2].Success;
    var raw = hasOpenerAndContent ? m.Groups[2].Value : m.Groups[0].Value;

    // Drop <br /> and </p>; a <p> opener becomes a plain newline.
    var stripped = raw.Replace("<br />", "")
                      .Replace("<p>", "\n")
                      .Replace("</p>", "");

    if (!hasOpenerAndContent)
    {
        return stripped;
    }

    return m.Groups[1].Value + HttpUtility.HtmlEncode(stripped) + "</pre>";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment