Skip to content

Instantly share code, notes, and snippets.

@felipebaltazar
Created September 12, 2018 16:29
Show Gist options
  • Save felipebaltazar/3a84150f5c2a86418222bfcde0a9f860 to your computer and use it in GitHub Desktop.
Save felipebaltazar/3a84150f5c2a86418222bfcde0a9f860 to your computer and use it in GitHub Desktop.
/// <summary>
/// Builds a short plain-text preview from an HTML string: the first two non-blank
/// text nodes found under <c>&lt;body&gt;</c>, one per line, HTML-decoded.
/// </summary>
/// <param name="sTexto">HTML markup to filter.</param>
/// <returns>
/// The decoded preview text (every kept line is terminated by a line break).
/// The input is returned unchanged when it is null or empty, when no text node
/// exists under a <c>&lt;body&gt;</c> element, or when processing throws (the
/// exception is then handed to <c>TratarErro</c>).
/// </returns>
public static string FiltrarConteudoHTML(string sTexto) {
    // Nothing to parse: return the input as-is instead of letting the parser
    // throw and relying on the catch block to produce the same result.
    if (string.IsNullOrEmpty(sTexto)) {
        return sTexto;
    }

    const int maxLines = 2;
    var textOnly = sTexto;

    try {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(sTexto);

        // Materialize the candidates first so nodes can be removed safely while iterating.
        // NOTE(review): HtmlAgilityPack appears to name comment nodes "#comment", so the
        // "comment" filter may only match literal <comment> elements - confirm intent.
        var candidates = htmlDoc.DocumentNode.Descendants()
            .Where(n => n.Name == "script" || n.Name == "comment" || n.Name == "div")
            .ToList();

        foreach (var candidate in candidates) {
            // Divs are dropped only when they are the page header; scripts/comments are
            // dropped unless their text starts with "DOCTYPE".
            var isHeader = candidate.Attributes["id"]?.Value == "header";
            var isDisposableNonDiv = candidate.Name != "div"
                && !candidate.InnerText.StartsWith("DOCTYPE", StringComparison.Ordinal);

            if (isDisposableNonDiv || isHeader) {
                candidate.Remove();
            }
        }

        // SelectNodes yields null (not an empty collection) when nothing matches,
        // e.g. for plain text or markup without a <body>; keep the original text then.
        var textNodes = htmlDoc.DocumentNode.SelectNodes("//body//text()");
        if (textNodes == null) {
            return textOnly;
        }

        var output = new StringBuilder();
        var linesCount = 0;

        foreach (var textNode in textNodes) {
            // Collapse each text node onto a single line.
            var result = Regex.Replace(textNode.InnerText, @"\r\n?|\n", string.Empty);
            if (string.IsNullOrWhiteSpace(result)) {
                continue;
            }

            output.AppendLine(result);
            linesCount++;

            // The preview is capped; no need to scan the rest of the document.
            if (linesCount >= maxLines) {
                break;
            }
        }

        textOnly = HttpUtility.HtmlDecode(output.ToString());
    } catch (Exception ex) {
        // Best effort: report the failure and fall back to the unmodified input.
        TratarErro(ex,$"{nameof(Utils)}.{nameof(FiltrarConteudoHTML)}");
    }

    return textOnly;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment