Skip to content

Instantly share code, notes, and snippets.

@mjs3339
Created October 4, 2018 07:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mjs3339/ddad5ebc2d3b5827ebb7dc030d2d50bc to your computer and use it in GitHub Desktop.
Save mjs3339/ddad5ebc2d3b5827ebb7dc030d2d50bc to your computer and use it in GitHub Desktop.
C# Strip All Embedded HTML Content from a String
/// <summary>
/// Removes HTML markup from <paramref name="source"/> and returns the remaining
/// text flattened onto a single line (no carriage returns, line feeds or tabs).
/// </summary>
/// <remarks>
/// The head, script and style elements are removed together with their content.
/// All other tags are replaced by spaces, a small set of named character entities
/// is decoded, and any other short entity-like sequence is replaced by a space.
/// Spaces introduced by tag and entity replacement are not collapsed afterwards.
/// </remarks>
/// <param name="source">HTML text to clean. May be null or empty.</param>
/// <returns>
/// The stripped text. <paramref name="source"/> is returned unchanged when it is
/// null or empty, or when a regular expression match times out.
/// </returns>
public static string StripHTMLAll(string source)
{
    // Nothing to strip. Null is handed back as-is so callers keep the
    // "never throws, returns the input on failure" behavior.
    if (string.IsNullOrEmpty(source))
    {
        return source;
    }

    try
    {
        const RegexOptions ignoreCase = RegexOptions.IgnoreCase;

        // Flatten to one line first so that '.' in the patterns below can span
        // what used to be separate lines, then collapse runs of spaces.
        string text = source.Replace("\r", " ").Replace("\n", " ").Replace("\t", " ");
        text = Regex.Replace(text, @"( )+", " ");

        // Drop head/script/style including their content. The opening and closing
        // tags are normalized first (attributes and inner spaces removed) so the
        // lazy "<tag>.*?</tag>" match can find them.
        foreach (string tag in new[] { "head", "script", "style" })
        {
            text = Regex.Replace(text, @"<( )*" + tag + @"([^>])*>", "<" + tag + ">", ignoreCase);
            text = Regex.Replace(text, @"(<( )*(/)( )*" + tag + @"( )*>)", "</" + tag + ">", ignoreCase);
            text = Regex.Replace(text, "(<" + tag + ">).*?(</" + tag + ">)", " ", ignoreCase);
        }

        // Structural tags: cell, line-break and list-item tags become one space;
        // block-level openers (div, tr, p) become two spaces.
        text = Regex.Replace(text, @"<( )*td([^>])*>", " ", ignoreCase);
        text = Regex.Replace(text, @"<( )*br( )*(/)?( )*>", " ", ignoreCase);
        text = Regex.Replace(text, @"<( )*li( )*>", " ", ignoreCase);
        text = Regex.Replace(text, @"<( )*div([^>])*>", "  ", ignoreCase);
        text = Regex.Replace(text, @"<( )*tr([^>])*>", "  ", ignoreCase);
        text = Regex.Replace(text, @"<( )*p([^>])*>", "  ", ignoreCase);

        // Every remaining tag becomes a single space.
        text = Regex.Replace(text, @"<[^>]*>", " ", ignoreCase);

        // Named entities, decoded after tag removal so a decoded '<' or '>' is
        // never mistaken for markup. The nbsp rule accepts the entity with or
        // without its trailing ';' (so no stray ';' is left behind) and also the
        // literal U+00A0 character.
        string[,] entities =
        {
            { @"&nbsp;?|\u00A0", " " },
            { @"&bull;", " * " },
            { @"&lsaquo;", "<" },
            { @"&rsaquo;", ">" },
            { @"&trade;", "(tm)" },
            { @"&frasl;", "/" },
            { @"&lt;", "<" },
            { @"&gt;", ">" },
            { @"&copy;", "(c)" },
            { @"&reg;", "(r)" },
        };
        for (int i = 0; i < entities.GetLength(0); i++)
        {
            text = Regex.Replace(text, entities[i, 0], entities[i, 1], ignoreCase);
        }

        // Anything else that looks like "&xx;" .. "&xxxxxx;" is replaced by a space.
        text = Regex.Replace(text, @"&(.{2,6});", " ", ignoreCase);

        // No CR, LF or tab can exist at this point: they were removed up front and
        // none of the replacements above introduces one, so no further whitespace
        // normalization pass is needed.
        return text;
    }
    catch (RegexMatchTimeoutException)
    {
        // Best effort: if a process-wide regex timeout fires, return the input untouched.
        return source;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment