Skip to content

Instantly share code, notes, and snippets.

@steida
Created September 23, 2010 15:39
Show Gist options
  • Save steida/593816 to your computer and use it in GitHub Desktop.
Save steida/593816 to your computer and use it in GitHub Desktop.
using System.Collections.Generic;
using System.Linq;
using HtmlAgilityPack;
namespace TeamVision.Helpers
{
/*
* remove blacklisted
* sanitize whitelisted attributes
* ignore text nodes
* unwrap others
* map tags
* normalize nested hierarchies
* normalize headings
* wrap all blockless textnodes to paragraphs
*
* todo: test
*
* <h1>Ahoj</h1>
* <h5>do h2</h5>
* <script>alert('xxs')</script>
* <p><div>ho<b>v</b>no</div></p>
* <strong>na b</strong><em>na i</em>
* <cite>cituju</cite>
* <a style='display: none' href=google.com>foo</a>
* <scrip>alert('xxs')</scrip>
* <!--fuck-->
*
* <h1>Ahoj</h1>
* <h2>do h2</h2>
* <p>ho<b>v</b>no</p>
* <b>na b</b><i>na i</i>
* cituju
* <a href="google.com">foo</a>
* alert('xxs')
*
*/
/// <summary>
/// Whitelist-based HTML sanitizer built on HtmlAgilityPack.
/// </summary>
/// <remarks>
/// Processing steps, in order:
/// 1. blacklisted elements are removed together with their content;
/// 2. whitelisted elements keep only their allowed attributes (URL attributes must
///    additionally use a safe scheme) and are renamed according to the tag map;
/// 3. every other element is unwrapped (replaced by its children), non-text,
///    non-element nodes such as comments are removed;
/// 4. an element nested directly inside an element of the same name is unwrapped;
/// 5. non-blank text nodes without a block-level ancestor are wrapped in a paragraph.
/// </remarks>
public static class HtmlSanitizer
{
    // Allowed element names mapped to the attribute names each may keep.
    // A null value means the element keeps no attributes at all.
    static readonly IDictionary<string, string[]> whitelist = new Dictionary<string, string[]>
    {
        { "h1", null },
        { "h2", null },
        { "h3", null },
        { "h4", null },
        { "h5", null },
        { "h6", null },
        { "a", new[] { "href" } },
        { "img", new[] { "src", "title" } },
        { "strong", null },
        { "b", null },
        { "em", null },
        { "i", null },
        { "p", null },
        { "blockquote", null },
        { "ul", null },
        { "ol", null },
        { "li", null },
        { "div", null },
        { "sub", null },
        { "sup", null },
        { "table", null },
        { "tr", null },
        { "td", null },
        { "th", null }
    };

    // Elements that are dropped together with everything inside them.
    static readonly string[] blacklist = new string[] {
        "script"
    };

    // Whitelisted element name -> element name it is rewritten to.
    static readonly IDictionary<string, string> map = new Dictionary<string, string>
    {
        { "strong", "b"},
        { "em", "i"},
        { "div", "p"},
        { "h4", "h3"},
        { "h5", "h3"},
        { "h6", "h3"}
    };

    // Elements that count as a block container for text (see HasBlockParent).
    static readonly string[] blocks = new string[] {
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "p",
        "blockquote",
        "li",
        "td",
        "th"
    };

    // Attributes whose value is a URL and therefore needs a scheme check.
    static readonly string[] urlAttributes = new string[] { "href", "src" };

    // URL schemes accepted in URL attributes; scheme-less (relative) URLs are accepted too.
    static readonly string[] safeSchemes = new string[] { "http", "https", "mailto", "ftp", "tel" };

    /// <summary>
    /// Parses <paramref name="str"/> as HTML, sanitizes it and returns the resulting markup.
    /// </summary>
    /// <param name="str">HTML fragment to sanitize; passed unchanged to HtmlDocument.LoadHtml.</param>
    /// <returns>The sanitized markup with leading and trailing whitespace trimmed.</returns>
    public static string Sanitize(this string str)
    {
        var doc = new HtmlDocument();
        doc.OptionFixNestedTags = true;
        doc.LoadHtml(str);
        Sanitize(doc);
        return doc.DocumentNode.WriteTo().Trim();
    }

    // Runs the three sanitizing passes over the document in their required order.
    static void Sanitize(HtmlDocument doc)
    {
        ProcessListed(doc);
        UnwrapDoubleNested(doc);
        WrapBlocklessTextNodes(doc);
    }

    // Applies the whitelist / blacklist / map rules to every node of the document.
    //
    // The tree is walked through the live ChildNodes collections instead of a
    // snapshot of all nodes: Unwrap re-inserts *clones* of a node's children, so a
    // snapshot would keep pointing at the detached originals and the clones that
    // actually stay in the document (e.g. a script inside an unknown element) would
    // never be sanitized.
    static void ProcessListed(HtmlDocument doc)
    {
        // Explicit work stack instead of recursion so deeply nested input
        // cannot exhaust the call stack in this method.
        var pending = new Stack<HtmlNode>();
        pending.Push(doc.DocumentNode);
        while (pending.Count > 0)
            ProcessChildren(pending.Pop(), pending);
    }

    // Sanitizes the direct children of parent; kept elements are pushed onto
    // pending so that their own children are processed later.
    static void ProcessChildren(HtmlNode parent, Stack<HtmlNode> pending)
    {
        var index = 0;
        while (index < parent.ChildNodes.Count)
        {
            var node = parent.ChildNodes[index];

            if (node.NodeType == HtmlNodeType.Text)
            {
                index++;
                continue;
            }

            if (node.NodeType != HtmlNodeType.Element)
            {
                // Comments and any other node kinds are dropped. The index is not
                // advanced because the next sibling now occupies this position.
                parent.RemoveChild(node);
                continue;
            }

            if (whitelist.ContainsKey(node.Name))
            {
                node.SanitizeAttributes();

                string mappedName;
                if (map.TryGetValue(node.Name, out mappedName))
                {
                    node.ChangeTag(mappedName);
                    // ChangeTag put a new element at the same position.
                    node = parent.ChildNodes[index];
                }

                pending.Push(node);
                index++;
            }
            else if (blacklist.Contains(node.Name))
            {
                parent.RemoveChild(node);
            }
            else
            {
                // The unwrapped children now start at this index and are
                // examined by the following iterations.
                node.Unwrap();
            }
        }
    }

    // Unwraps elements whose direct parent has the same name (e.g. p inside p),
    // repeating until no such element is left. The node list is rebuilt after each
    // change because Unwrap replaces the affected subtree with clones.
    static void UnwrapDoubleNested(HtmlDocument doc)
    {
        while (true)
        {
            var nested = GetAll(doc).FirstOrDefault(n =>
                n.ParentNode != null &&
                n.ParentNode.Name == n.Name
            );
            if (nested == null)
                break;
            nested.Unwrap();
        }
    }

    // Wraps every non-blank text node that has no block-level ancestor in a new
    // p element. Whitespace-only text nodes are left alone so that line breaks
    // between elements (or inside ul / table) do not turn into empty paragraphs.
    static void WrapBlocklessTextNodes(HtmlDocument doc)
    {
        var orphans = GetAll(doc).Where(n =>
            n.NodeType == HtmlNodeType.Text &&
            n.InnerText.Trim().Length > 0 &&
            !n.HasBlockParent()
        );
        foreach (var textNode in orphans)
        {
            var paragraph = textNode.OwnerDocument.CreateElement("p");
            textNode.ParentNode.ReplaceChild(paragraph, textNode);
            paragraph.AppendChild(textNode);
        }
    }

    // Returns a snapshot (array) of all nodes below the document node, so callers
    // may modify the tree while iterating the result.
    static IEnumerable<HtmlNode> GetAll(HtmlDocument doc)
    {
        return doc.DocumentNode.ChildNodes.Flatten(ch => ch.ChildNodes).ToArray();
    }

    // Decides whether a URL attribute value may be kept: relative URLs and URLs
    // whose scheme is listed in safeSchemes pass, everything else
    // (javascript:, vbscript:, data:, ...) is rejected.
    static bool IsSafeUrl(string value)
    {
        if (string.IsNullOrEmpty(value))
            return true;

        // Browsers decode character references and ignore whitespace / control
        // characters when resolving the scheme, so "jav&#x61;script:" or
        // "java\tscript:" must be normalized before the check.
        var decoded = HtmlEntity.DeEntitize(value);
        var compact = new string(decoded.Where(c => c > ' ').ToArray());

        var colon = compact.IndexOf(':');
        if (colon < 0)
            return true; // no scheme at all: relative URL

        // A ':' that appears after '/', '?' or '#' belongs to the path, query or
        // fragment and does not introduce a scheme.
        var delimiter = compact.IndexOfAny(new[] { '/', '?', '#' });
        if (delimiter >= 0 && delimiter < colon)
            return true;

        var scheme = compact.Substring(0, colon);
        return safeSchemes.Contains(scheme, System.StringComparer.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Removes every attribute of <paramref name="node"/> that the whitelist does not
    /// allow for its element name. URL attributes (href, src) are also removed when
    /// their value uses a scheme outside the safe list.
    /// </summary>
    /// <param name="node">Element to clean. Elements that are not whitelisted lose all attributes.</param>
    public static void SanitizeAttributes(this HtmlNode node)
    {
        if (!node.HasAttributes)
            return;

        // TryGetValue leaves "allowed" null for unknown elements, which is treated
        // the same as a whitelist entry without attributes.
        string[] allowed;
        whitelist.TryGetValue(node.Name, out allowed);

        // Iterate backwards because attributes are removed while walking the collection.
        for (var i = node.Attributes.Count - 1; i >= 0; i--)
        {
            var attribute = node.Attributes[i];
            var keep = allowed != null
                && allowed.Contains(attribute.Name)
                && (!urlAttributes.Contains(attribute.Name) || IsSafeUrl(attribute.Value));
            if (!keep)
                node.Attributes.Remove(attribute);
        }
    }

    /// <summary>
    /// Replaces <paramref name="node"/> in its parent with a new element named
    /// <paramref name="tagName"/> that takes over the node's children.
    /// Attributes of the original node are not carried over.
    /// </summary>
    /// <param name="node">Element to rename; must have a parent.</param>
    /// <param name="tagName">Name of the replacement element.</param>
    /// <exception cref="System.InvalidOperationException">The node has no parent.</exception>
    public static void ChangeTag(this HtmlNode node, string tagName)
    {
        // Check before touching the children so a failure leaves the node intact.
        if (node.ParentNode == null)
            throw new System.InvalidOperationException("Cannot change the tag of a node without a parent.");

        var replacement = node.OwnerDocument.CreateElement(tagName);
        foreach (var child in node.ChildNodes)
            replacement.AppendChild(child);
        node.ParentNode.ReplaceChild(replacement, node);
    }

    /// <summary>
    /// Replaces <paramref name="node"/> with its children: copies of the children are
    /// inserted at the node's position and the node itself is removed.
    /// </summary>
    /// <param name="node">Node to unwrap; must have a parent.</param>
    /// <exception cref="System.InvalidOperationException">The node has no parent.</exception>
    public static void Unwrap(this HtmlNode node)
    {
        if (node.ParentNode == null)
            throw new System.InvalidOperationException("Cannot unwrap a node without a parent.");

        // Children are taken from a clone because moving the live children, i.e.
        // node.ParentNode.InsertBefore(node.FirstChild, node), was observed to fail
        // with System.InvalidProgramException: Unexpected error.
        var copy = node.Clone();
        foreach (var child in copy.ChildNodes)
            node.ParentNode.InsertBefore(child, node);
        node.Remove();
    }

    /// <summary>
    /// Tells whether any ancestor of <paramref name="node"/> is one of the block elements
    /// (headings, p, blockquote, li, td, th).
    /// </summary>
    /// <param name="node">Node whose ancestors are inspected.</param>
    /// <returns>true when a block ancestor exists; otherwise false.</returns>
    public static bool HasBlockParent(this HtmlNode node)
    {
        for (var ancestor = node.ParentNode; ancestor != null; ancestor = ancestor.ParentNode)
        {
            if (blocks.Contains(ancestor.Name))
                return true;
        }
        return false;
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment