Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Convert HTML webpage to Plain Text or Extract Text from HTML in C#
// Extracts the plain text of Test2.html in two ways (custom recursive walk and
// the built-in TextContent property) and writes each result to Test.txt.
String text;
String content = File.ReadAllText(dataDir + "Test2.html");
// Create an instance of HTML document from the markup (second argument is the base URI, empty here).
// HTMLDocument is disposable, so scope it with 'using' to release its resources deterministically.
using (var document = new HTMLDocument(content, ""))
{
    // The second way of gathering text elements from the document: the custom GetContent method
    text = GetContent(document.Body);
    File.WriteAllText("Test.txt", text);

    // The third way of gathering text elements from the document: the TextContent property
    // NOTE: this writes to the same "Test.txt" and therefore replaces the output written above.
    text = document.Body.TextContent;
    File.WriteAllText("Test.txt", text);
}
/// <summary>
/// Recursively concatenates the values of all text nodes found beneath <paramref name="node"/>.
/// </summary>
/// <param name="node">Node whose descendants are scanned; its child nodes are visited in order.</param>
/// <returns>The combined text of every descendant text node, in traversal order.</returns>
static string GetContent(Aspose.Html.Dom.Node node)
{
    var buffer = new StringBuilder();

    foreach (var child in node.ChildNodes)
    {
        var kind = child.NodeType;

        if (kind == Aspose.Html.Dom.Node.TEXT_NODE)
        {
            // Leaf text: take its value as-is.
            buffer.Append(child.NodeValue);
            continue;
        }

        if (kind == Aspose.Html.Dom.Node.ELEMENT_NODE)
        {
            // Element: descend and splice in whatever text it contains.
            buffer.Append(GetContent(child));
        }

        // Any other node type (comments, etc.) contributes nothing.
    }

    return buffer.ToString();
}
// Load the HTML markup that will be converted to plain text
String content = File.ReadAllText(dataDir + "Test2.html");
// Parse the markup; the document is disposed when the using block ends
using (var document = new HTMLDocument(content, ""))
{
    // The first way of gathering text elements from the document:
    // iterate over text nodes only, with StyleFilter deciding which of them are accepted
    var textWalker = document.CreateNodeIterator(
        document,
        Aspose.Html.Dom.Traversal.Filters.NodeFilter.SHOW_TEXT,
        new StyleFilter());

    var collected = new StringBuilder();
    // NextNode() returns null once the iterator is exhausted
    for (var current = textWalker.NextNode(); current != null; current = textWalker.NextNode())
    {
        collected.Append(current.NodeValue);
    }

    // Build the result once, then echo it to the console and save it to disk
    string extracted = collected.ToString();
    Console.WriteLine(extracted);
    File.WriteAllText(@"NodeIterator.txt", extracted);
}
/// <summary>
/// Node filter that rejects any node whose parent element is a 'style' or 'script' element,
/// so that stylesheet and script source is not treated as page text.
/// </summary>
class StyleFilter : Aspose.Html.Dom.Traversal.Filters.NodeFilter
{
    /// <summary>
    /// Decides whether <paramref name="n"/> should be returned by the traversal.
    /// </summary>
    /// <param name="n">The candidate node.</param>
    /// <returns>
    /// FILTER_REJECT when the node's parent element is 'style' or 'script';
    /// FILTER_ACCEPT otherwise, including when the node has no parent element.
    /// </returns>
    public override short AcceptNode(Aspose.Html.Dom.Node n)
    {
        var parent = n.ParentElement;

        // A node with no parent element cannot be inside <style>/<script>;
        // accept it instead of failing with a NullReferenceException.
        if (parent == null)
        {
            return FILTER_ACCEPT;
        }

        // To ignore further elements, add their tag names here.
        // Compare case-insensitively: tag-name casing can differ by document type
        // (upper-case for HTML documents, as-authored for XML/XHTML).
        string tag = parent.TagName;
        bool ignored =
            string.Equals(tag, "STYLE", System.StringComparison.OrdinalIgnoreCase) ||
            string.Equals(tag, "SCRIPT", System.StringComparison.OrdinalIgnoreCase);

        return ignored ? FILTER_REJECT : FILTER_ACCEPT;
    }
}
// Downloads a web page, extracts its plain text and saves it to Webpage.txt.
String text;
// Initialize HTMLDocument object with URL.
// HTMLDocument is disposable, so scope it with 'using' to release its resources deterministically.
using (HTMLDocument document = new HTMLDocument("https://products.aspose.com/html/net"))
{
    // Read the text contents of the HTML format
    text = document.Body.TextContent;
}
// Write the TXT file with extracted text
File.WriteAllText("Webpage.txt", text);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment