Skip to content

Instantly share code, notes, and snippets.

@minaairsupport
Created November 30, 2018 20:57
Show Gist options
  • Save minaairsupport/b14b3aeb59e64ebf41dcf5a0af0fe161 to your computer and use it in GitHub Desktop.
Save minaairsupport/b14b3aeb59e64ebf41dcf5a0af0fe161 to your computer and use it in GitHub Desktop.
/// <summary>
/// Converts HTML markup into plain text using a small set of regular expressions.
/// This is a lightweight tag stripper, not a full HTML parser: the contents of
/// elements such as script or style blocks are kept as text.
/// </summary>
public static class HtmlToPlainConvertor
{
    // The expressions are built once and reused; Regex instances are safe to share
    // across threads for matching and replacing.

    // One or more whitespace characters (including line breaks) sitting between the
    // end of one tag and the start of the next. Only whitespace is matched so that
    // punctuation between tags (e.g. "</b>...<b>") is preserved.
    private static readonly Regex TagWhiteSpaceRegex =
        new Regex(@">\s+<", RegexOptions.Compiled | RegexOptions.CultureInvariant);

    // Any <br> element regardless of casing, inner spacing, attributes or self-closing
    // slash: <br>, <BR/>, <Br />, <br   />, <br class="x">. The \b keeps other tags
    // that merely start with "br" from matching.
    private static readonly Regex LineBreakRegex =
        new Regex(@"<br\b[^>]*>", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

    // Everything from '<' up to the closing '>', or up to the end of the input when
    // the closing '>' is missing.
    private static readonly Regex StripFormattingRegex =
        new Regex(@"<[^>]*(>|$)", RegexOptions.Compiled | RegexOptions.CultureInvariant);

    /// <summary>
    /// Strips the tags from <paramref name="html"/> and returns the remaining text.
    /// </summary>
    /// <param name="html">The HTML fragment to convert. May be null or empty.</param>
    /// <returns>
    /// The text content with whitespace-only gaps between adjacent tags removed,
    /// every br element replaced by <see cref="Environment.NewLine"/>, all other tags
    /// removed and HTML entities decoded. Returns an empty string when
    /// <paramref name="html"/> is null or empty.
    /// </returns>
    public static string HtmlToPlainText(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return string.Empty;
        }

        // Remove whitespace/line breaks that only separate tags.
        var text = TagWhiteSpaceRegex.Replace(html, "><");

        // Replace <br> variants with line breaks before the generic tag stripping
        // below would silently remove them.
        text = LineBreakRegex.Replace(text, Environment.NewLine);

        // Strip all remaining tags.
        text = StripFormattingRegex.Replace(text, string.Empty);

        // Decode entities last: escaped characters such as &lt; and &gt; are text,
        // not markup. Decoding them before stripping would make the tag pattern
        // swallow legitimate content (e.g. "1 &lt; 2" would lose everything after "1 ").
        return System.Net.WebUtility.HtmlDecode(text);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment