Skip to content

Instantly share code, notes, and snippets.

@rarous
Forked from steida/gist:593816
Created September 23, 2010 15:40
Show Gist options
  • Save rarous/593822 to your computer and use it in GitHub Desktop.
Save rarous/593822 to your computer and use it in GitHub Desktop.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using Matters.Helpers;
using HtmlAgilityPack;
namespace Matters.Helpers
{
/*
* remove blacklisted
* sanitize whitelisted attributes
* ignore text nodes
* unwrap others
* map tags
* normalize nested hierarchies
* normalize headings
*/
/// <summary>
/// Whitelist-based HTML sanitizer built on HtmlAgilityPack.
/// Whitelisted elements are kept (with only their allowed attributes), blacklisted
/// elements are removed with their content, every other element is replaced by its
/// children, selected tags are renamed, and an element nested directly inside an
/// element of the same name is flattened.
/// </summary>
public static class HtmlSanitizer
{
    // Elements that may stay in the output, each with the attribute names it may keep
    // (null means "no attributes at all").
    static readonly IDictionary<string, string[]> whitelist = new Dictionary<string, string[]>
    {
        { "h1", null },
        { "h2", null },
        { "h3", null },
        { "h4", null },
        { "h5", null },
        { "h6", null },
        { "a", new[] { "href" } },
        { "img", new[] { "src", "title" } },
        { "strong", null },
        { "b", null },
        { "em", null },
        { "i", null },
        { "p", null },
        { "blockquote", null },
        { "ul", null },
        { "ol", null },
        { "li", null },
        { "div", null },
        { "sub", null },
        { "sup", null },
        { "table", null },
        { "tr", null },
        { "td", null },
        { "th", null }
    };

    // Elements that are removed together with everything inside them.
    static readonly string[] blacklist = new string[] {
        "script"
    };

    // Whitelisted tags that are renamed to another tag after their attributes were sanitized.
    static readonly IDictionary<string, string> map = new Dictionary<string, string>
    {
        { "strong", "b"},
        { "em", "i"},
        { "div", "p"},
        { "h3", "h2"},
        { "h4", "h2"},
        { "h5", "h2"},
        { "h6", "h2"}
    };

    // Attributes whose value is a URL and therefore needs a scheme check.
    static readonly string[] urlAttributes = new[] { "href", "src" };

    // URL schemes accepted in URL attributes; values without a scheme (relative URLs) are accepted too.
    static readonly string[] allowedSchemes = new[] { "http", "https", "ftp", "mailto", "tel" };

    /// <summary>
    /// Parses <paramref name="str"/> as HTML, sanitizes it and returns the resulting markup.
    /// </summary>
    /// <param name="str">HTML fragment to sanitize; may be null or empty.</param>
    /// <returns>The sanitized markup, trimmed; an empty string for null or empty input.</returns>
    public static string Sanitize(this string str)
    {
        // Nothing to parse: avoid handing null to the parser.
        if (string.IsNullOrEmpty(str))
            return string.Empty;

        var doc = new HtmlDocument();
        doc.OptionFixNestedTags = true;
        doc.LoadHtml(str);
        Sanitize(doc);
        return doc.DocumentNode.WriteTo().Trim();
    }

    // Runs both sanitizing passes over the parsed document, in place.
    static void Sanitize(HtmlDocument doc)
    {
        ProcessListed(doc);
        UnwrapDoubleNested(doc);
    }

    // Applies the whitelist / blacklist / tag map to every node of the document.
    static void ProcessListed(HtmlDocument doc)
    {
        // GetAll returns a snapshot, so the tree can be modified while iterating.
        foreach (var node in GetAll(doc))
        {
            switch (node.NodeType)
            {
                case HtmlNodeType.Element:
                    if (whitelist.ContainsKey(node.Name))
                    {
                        node.SanitizeAttributes();
                        string replacement;
                        if (map.TryGetValue(node.Name, out replacement))
                            node.ChangeTag(replacement);
                    }
                    else if (blacklist.Contains(node.Name))
                        node.Remove();
                    else
                        node.Unwrap();
                    break;
                case HtmlNodeType.Text:
                    // Text is kept as it is.
                    break;
                default:
                    // Comments and any other node type are dropped.
                    node.Remove();
                    break;
            }
        }
    }

    // Repeatedly unwraps the first node whose parent has the same name (e.g. <b><b>x</b></b>)
    // until no such node is left. The node list is rebuilt on every pass because
    // unwrapping changes the tree.
    static void UnwrapDoubleNested(HtmlDocument doc)
    {
        while (true)
        {
            var nested = GetAll(doc).FirstOrDefault(n =>
                n.ParentNode != null &&
                n.ParentNode.Name == n.Name
            );
            if (nested == null)
                break;
            nested.Unwrap();
        }
    }

    // Returns a snapshot (array) of all nodes below the document node, in the order
    // produced by the Flatten helper (declared outside this class).
    static IEnumerable<HtmlNode> GetAll(HtmlDocument doc)
    {
        return doc.DocumentNode.ChildNodes.Flatten(ch => ch.ChildNodes).ToArray();
    }

    /// <summary>
    /// Removes every attribute of <paramref name="node"/> that is not allowed for its tag.
    /// An attribute is kept only if the whitelist lists it for the tag and, for URL
    /// attributes (href, src), its value passes the URL scheme check.
    /// Elements that are not whitelisted lose all their attributes.
    /// </summary>
    /// <param name="node">Element whose attributes are filtered in place.</param>
    public static void SanitizeAttributes(this HtmlNode node)
    {
        if (!node.HasAttributes)
            return;

        // Stays null when the tag is not whitelisted or allows no attributes.
        string[] allowed;
        whitelist.TryGetValue(node.Name, out allowed);

        // Walk backwards so removals do not shift the indexes still to be visited.
        for (var i = node.Attributes.Count - 1; i >= 0; i--)
        {
            var attribute = node.Attributes[i];
            var keep = allowed != null
                && allowed.Contains(attribute.Name)
                && (!urlAttributes.Contains(attribute.Name) || IsSafeUrl(attribute.Value));
            if (!keep)
                node.Attributes.Remove(attribute);
        }
    }

    // Decides whether a URL attribute value may stay: it must either have no scheme
    // (relative URL) or use one of the allowed schemes. This is what rejects
    // "javascript:" and similar script-executing URLs.
    static bool IsSafeUrl(string value)
    {
        if (string.IsNullOrEmpty(value))
            return true;

        // Decode character references so that e.g. "javascript&#58;..." is judged by its real characters.
        var decoded = HttpUtility.HtmlDecode(value);
        if (string.IsNullOrEmpty(decoded))
            return true;

        // Whitespace and control characters must not be able to hide a scheme ("java\tscript:").
        var compact = new string(decoded.Where(c => c > ' ').ToArray());

        // A scheme can only appear before the first '/', '?' or '#'.
        var end = compact.IndexOfAny(new[] { '/', '?', '#' });
        var head = end < 0 ? compact : compact.Substring(0, end);

        var colon = head.IndexOf(':');
        if (colon < 0)
        {
            // No scheme. A leftover '&' may be a character reference the decoder did not
            // understand (and that could still turn into ':'), so it is rejected to be safe.
            return head.IndexOf('&') < 0;
        }

        var scheme = head.Substring(0, colon);
        return allowedSchemes.Contains(scheme, StringComparer.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Replaces <paramref name="node"/> in its parent with a new element named
    /// <paramref name="tagName"/> that takes over the node's children.
    /// Attributes of the original element are not copied to the replacement.
    /// Does nothing when the node has no parent.
    /// </summary>
    /// <param name="node">Element to rename.</param>
    /// <param name="tagName">Name of the replacement element.</param>
    public static void ChangeTag(this HtmlNode node, string tagName)
    {
        var parent = node.ParentNode;
        if (parent == null)
            return;

        var replacement = node.OwnerDocument.CreateElement(tagName);
        if (node.HasChildNodes)
        {
            // Iterate over a snapshot because the child collection is modified in the loop.
            foreach (var child in node.ChildNodes.ToArray())
            {
                node.RemoveChild(child);
                replacement.AppendChild(child);
            }
        }
        parent.ReplaceChild(replacement, node);
    }

    /// <summary>
    /// Removes <paramref name="node"/> from its parent and puts the node's children in its place,
    /// in their original order. Does nothing when the node has no parent.
    /// </summary>
    /// <remarks>
    /// The original child nodes are moved, not cloned, so references to them that were
    /// collected earlier (for example the snapshot used while sanitizing) still point to
    /// nodes that are part of the document.
    /// </remarks>
    /// <param name="node">Element to unwrap.</param>
    public static void Unwrap(this HtmlNode node)
    {
        var parent = node.ParentNode;
        if (parent == null)
            return;

        if (node.HasChildNodes)
        {
            // Iterate over a snapshot because the child collection is modified in the loop.
            foreach (var child in node.ChildNodes.ToArray())
            {
                node.RemoveChild(child);
                parent.InsertBefore(child, node);
            }
        }
        node.Remove();
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment