You can read all details at: Convert HTML to Plain Text or Extract Text from HTML in C#
Created
January 18, 2021 21:25
-
-
Save aspose-com-gists/1608e3c92e0a1838dac8a0c351567969 to your computer and use it in GitHub Desktop.
Convert HTML webpage to Plain Text or Extract Text from HTML in C#
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Load the HTML markup from disk.
String content = File.ReadAllText(dataDir + "Test2.html");
String text;

// Create an instance of HTML document; the using block disposes it once
// both extraction approaches below have run.
using (var document = new HTMLDocument(content, ""))
{
    // Gather the text with the custom recursive GetContent method.
    text = GetContent(document.Body);
    File.WriteAllText("Test.txt", text);

    // Gather the text with the built-in TextContent property.
    // NOTE: this writes to the same "Test.txt", replacing the output written above.
    text = document.Body.TextContent;
    File.WriteAllText("Test.txt", text);
}
/// <summary>
/// Concatenates the values of all text nodes found beneath <paramref name="node"/>,
/// visiting children in the order <c>ChildNodes</c> yields them.
/// </summary>
/// <remarks>
/// Only element and text children are considered; any other node type is skipped.
/// Every element is descended into - no element is excluded by tag name.
/// </remarks>
/// <param name="node">Root of the subtree whose text is collected.</param>
/// <returns>The collected text; an empty string when no text node is found.</returns>
/// <exception cref="ArgumentNullException"><paramref name="node"/> is null.</exception>
static string GetContent(Aspose.Html.Dom.Node node)
{
    if (node == null)
    {
        throw new ArgumentNullException(nameof(node));
    }

    StringBuilder sb = new StringBuilder();
    AppendContent(node, sb);
    return sb.ToString();
}

/// <summary>
/// Recursive worker for <c>GetContent</c>: appends the text beneath <paramref name="node"/>
/// to <paramref name="sb"/>. A single builder is shared across the whole recursion so that
/// no intermediate string is built for each nested element.
/// </summary>
static void AppendContent(Aspose.Html.Dom.Node node, StringBuilder sb)
{
    foreach (var n in node.ChildNodes)
    {
        if (n.NodeType == Aspose.Html.Dom.Node.ELEMENT_NODE)
        {
            // Elements carry no text of their own here; descend into their children.
            AppendContent(n, sb);
        }
        else if (n.NodeType == Aspose.Html.Dom.Node.TEXT_NODE)
        {
            sb.Append(n.NodeValue);
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Read the HTML markup that will be parsed.
String content = File.ReadAllText(dataDir + "Test2.html");

// Parse the markup; the document is disposed when the using block ends.
using (var document = new HTMLDocument(content, ""))
{
    // Walk text nodes only, letting StyleFilter decide which of them are accepted.
    var textNodes = document.CreateNodeIterator(
        document,
        Aspose.Html.Dom.Traversal.Filters.NodeFilter.SHOW_TEXT,
        new StyleFilter());

    var buffer = new StringBuilder();

    // NextNode() returns null once the iterator is exhausted.
    for (Aspose.Html.Dom.Node current = textNodes.NextNode(); current != null; current = textNodes.NextNode())
    {
        buffer.Append(current.NodeValue);
    }

    // Emit the collected text to the console and to a file.
    string extracted = buffer.ToString();
    Console.WriteLine(extracted);
    File.WriteAllText(@"NodeIterator.txt", extracted);
}
/// <summary>
/// Represents a user filter created in order to ignore content of the 'style' and 'script' elements.
/// </summary>
class StyleFilter : Aspose.Html.Dom.Traversal.Filters.NodeFilter
{
    /// <summary>
    /// Rejects a node whose parent element is a 'style' or 'script' element and accepts every other node.
    /// </summary>
    /// <param name="n">The node being tested.</param>
    /// <returns>
    /// <c>FILTER_REJECT</c> when the parent element's tag name is 'style' or 'script'
    /// (compared case-insensitively); otherwise <c>FILTER_ACCEPT</c>.
    /// </returns>
    public override short AcceptNode(Aspose.Html.Dom.Node n)
    {
        var parent = n.ParentElement;
        if (parent == null)
        {
            // Without a parent element the node cannot be inside 'style' or 'script'.
            return FILTER_ACCEPT;
        }

        // To ignore further elements, add their tag names to this check.
        // The comparison ignores case so lower-case tag names are matched as well.
        string tagName = parent.TagName;
        bool ignored = string.Equals(tagName, "STYLE", StringComparison.OrdinalIgnoreCase)
            || string.Equals(tagName, "SCRIPT", StringComparison.OrdinalIgnoreCase);

        return ignored ? FILTER_REJECT : FILTER_ACCEPT;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
String text;

// Initialize HTMLDocument object with URL; the using block disposes the
// document as soon as its text has been read.
using (HTMLDocument document = new HTMLDocument("https://products.aspose.com/html/net"))
{
    // Read the text contents of the HTML format
    text = document.Body.TextContent;
}

// Write the TXT file with extracted text
File.WriteAllText("Webpage.txt", text);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment