Skip to content

Instantly share code, notes, and snippets.

@odises
Created August 3, 2016 04:09
Show Gist options
  • Save odises/a52faa06ad0e7a978eb39eb577c34142 to your computer and use it in GitHub Desktop.
Save odises/a52faa06ad0e7a978eb39eb577c34142 to your computer and use it in GitHub Desktop.
A method that removes all tags from HTML except those listed in the acceptableTags array; raw text nodes are always kept.
using System.Collections.Generic;
using System.Linq;
using HtmlAgilityPack;
namespace Amberizer
{
/// <summary>
/// Helpers for sanitising HTML fragments with HtmlAgilityPack.
/// </summary>
public class HtmlUtilites
{
    // XPath selecting the direct element and text children of a node.
    // Other node types (e.g. comments) are not matched by this expression.
    private const string ElementAndTextChildren = "./*|./text()";

    // Name HtmlAgilityPack gives to text nodes.
    private const string TextNodeName = "#text";

    /// <summary>
    /// Removes every element whose tag name is not listed in
    /// <paramref name="acceptableTags"/>, at any nesting depth. A removed
    /// element is "unwrapped": its element and text children take its place
    /// in the parent, so the textual content is preserved.
    /// </summary>
    /// <remarks>
    /// Attributes of retained elements are left untouched, so this method on
    /// its own does not strip things like inline event handlers.
    /// Only element and text children are promoted when an element is
    /// removed; any other child node types inside it are discarded with it.
    /// </remarks>
    /// <param name="data">The HTML markup to filter.</param>
    /// <param name="acceptableTags">
    /// Tag names to keep (for example <c>"p"</c>, <c>"strong"</c>). Names are
    /// compared case-insensitively because HTML tag names are not
    /// case-sensitive.
    /// </param>
    /// <returns>The filtered markup; an empty string for empty input.</returns>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="data"/> or <paramref name="acceptableTags"/> is null.
    /// </exception>
    public string RemoveUnwantedTags(string data, string[] acceptableTags)
    {
        if (data == null)
        {
            throw new System.ArgumentNullException("data");
        }

        if (acceptableTags == null)
        {
            throw new System.ArgumentNullException("acceptableTags");
        }

        // Set lookup avoids a linear scan of the whitelist for every node.
        var allowedTags = new HashSet<string>(acceptableTags, System.StringComparer.OrdinalIgnoreCase);

        var document = new HtmlDocument();
        document.LoadHtml(data);

        // Breadth-first work list. SelectNodes yields null (not an empty
        // collection) when nothing matches, so it must not be handed to the
        // Queue constructor directly.
        var pending = new Queue<HtmlNode>();
        EnqueueAll(pending, document.DocumentNode.SelectNodes(ElementAndTextChildren));

        while (pending.Count > 0)
        {
            var node = pending.Dequeue();

            if (node.Name == TextNodeName)
            {
                // Raw text is always kept and has no children to inspect.
                continue;
            }

            var children = node.SelectNodes(ElementAndTextChildren);

            if (allowedTags.Contains(node.Name))
            {
                // Keep the element itself, but still visit its descendants so
                // that unwanted tags nested inside an allowed tag are removed.
                EnqueueAll(pending, children);
                continue;
            }

            // Unwanted element: hoist its children into the parent, in order,
            // immediately before the element, then drop the element itself.
            var parentNode = node.ParentNode;
            if (children != null)
            {
                foreach (var child in children)
                {
                    parentNode.InsertBefore(child, node);
                    pending.Enqueue(child);
                }
            }

            parentNode.RemoveChild(node);
        }

        return document.DocumentNode.InnerHtml;
    }

    // Adds every node of a possibly-null collection to the work queue.
    private static void EnqueueAll(Queue<HtmlNode> queue, IEnumerable<HtmlNode> nodes)
    {
        if (nodes == null)
        {
            return;
        }

        foreach (var node in nodes)
        {
            queue.Enqueue(node);
        }
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment