Skip to content

Instantly share code, notes, and snippets.

@soeminnminn
Created May 9, 2017 16:00
Show Gist options
  • Save soeminnminn/eeace31c71d4eb569459b4893160290b to your computer and use it in GitHub Desktop.
Save soeminnminn/eeace31c71d4eb569459b4893160290b to your computer and use it in GitHub Desktop.
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
namespace S16.Text
{
public class UniEncodeDecode
{
/// <summary>
/// Creates a new <see cref="UniEncodeDecode"/> instance. The constructor body is empty;
/// no state is initialized here.
/// </summary>
public UniEncodeDecode()
{
}
/// <summary>
/// Replaces every character at or above U+00FF with a backslash-u escape made of
/// exactly four upper-case hexadecimal digits (for example U+0100 becomes \u0100).
/// Characters below U+00FF are copied through unchanged.
/// </summary>
/// <param name="text">the text to encode; may be null or empty</param>
/// <returns>the escaped text, or an empty string when <paramref name="text"/> is null or empty</returns>
public string Encode(string text)
{
    if (string.IsNullOrEmpty(text)) return string.Empty;

    // A builder avoids re-copying the whole result for every appended character.
    StringBuilder output = new StringBuilder(text.Length);
    foreach (char ch in text)
    {
        if (ch < (char)0xFF)
        {
            output.Append(ch);
        }
        else
        {
            // "X4" zero-pads to four digits: the escape syntax is fixed-width, so an
            // unpadded value such as "\u100" would be malformed or mis-read together
            // with whatever hex-looking characters follow it.
            output.Append("\\u").Append(((int)ch).ToString("X4"));
        }
    }
    return output.ToString();
}
/// <summary>
/// Replaces every character at or above U+00FF with a hexadecimal XML numeric
/// character reference (ampersand, hash, x, upper-case hex digits, semicolon).
/// Characters below U+00FF are copied through unchanged; markup characters are
/// not escaped by this method.
/// </summary>
/// <param name="text">the text to encode; may be null or empty</param>
/// <returns>the encoded text, or an empty string when <paramref name="text"/> is null or empty</returns>
public string EncodeXml(string text)
{
    if (string.IsNullOrEmpty(text)) return string.Empty;

    StringBuilder output = new StringBuilder(text.Length);
    for (int i = 0; i < text.Length; i++)
    {
        char ch = text[i];
        if (ch < (char)0xFF)
        {
            output.Append(ch);
            continue;
        }

        int codePoint = ch;
        // A character outside the BMP is stored as two UTF-16 surrogates. XML does not
        // allow references to surrogate code points, so a valid pair is emitted as ONE
        // reference to the combined code point and the low surrogate is skipped.
        if (char.IsSurrogatePair(text, i))
        {
            codePoint = char.ConvertToUtf32(text, i);
            i++;
        }
        // An unpaired surrogate cannot be represented properly; it is still emitted
        // by its own value, as before.
        output.Append("&#x").Append(codePoint.ToString("X")).Append(';');
    }
    return output.ToString();
}
/// <summary>
/// Replaces every backslash-u escape followed by exactly four hexadecimal digits
/// (upper or lower case) with the UTF-16 character it denotes. All other text,
/// including any other backslash sequence, is returned unchanged.
/// </summary>
/// <param name="text">the text to decode; may be null or empty</param>
/// <returns>the decoded text, or an empty string when <paramref name="text"/> is null or empty</returns>
public string Decode(string text)
{
    if (string.IsNullOrEmpty(text)) return string.Empty;

    // In the verbatim pattern, "\\" matches one literal backslash. (Written as a bare
    // "\u" the regex engine would treat it as its own Unicode escape and reject the
    // pattern.) Group 1 captures the four hex digits.
    return Regex.Replace(
        text,
        @"\\u([0-9A-Fa-f]{4})",
        delegate(Match m)
        {
            return ((char)Convert.ToInt32(m.Groups[1].Value, 16)).ToString();
        });
}
internal static class HtmlUtils
{
#region Fields and Consts
/// <summary>
/// List of html tags that don't have content
/// </summary>
private static readonly List<string> _list = new List<string>(
    new[]
    {
        "area", "base", "basefont", "br", "col",
        "frame", "hr", "img", "input", "isindex",
        "link", "meta", "param"
    }
);
/// <summary>
/// The html encode\decode pairs (key = entity text, value = plain character).
/// Order matters: DecodeHtml applies the pairs first-to-last and EncodeHtml applies
/// them last-to-first, so the ampersand pair must stay last.
/// </summary>
private static readonly KeyValuePair<string, string>[] _encodeDecode = new[]
{
    new KeyValuePair<string, string>("&lt;", "<"),
    new KeyValuePair<string, string>("&gt;", ">"),
    new KeyValuePair<string, string>("&quot;", "\""),
    new KeyValuePair<string, string>("&amp;", "&"),
};
/// <summary>
/// The html decode only pairs (key = entity name without the ampersand and semicolon).
/// Keys are compared ordinally and case-sensitively: entity names such as "Agrave" and
/// "agrave" or "Alpha" and "alpha" denote different characters, and a case-insensitive
/// comparer would make the later registration silently overwrite the earlier one.
/// </summary>
private static readonly Dictionary<string, char> _decodeOnly = new Dictionary<string, char>(StringComparer.Ordinal);
#endregion
/// <summary>
/// Fills the decode-only table with named character entities, grouped below by
/// category. Each key is the entity name and each value the character it decodes to.
/// </summary>
/// <remarks>
/// Several names differ only by letter case (for example "Agrave" / "agrave").
/// Whether they end up as separate entries depends on the comparer the table was
/// created with; the registration order of the original code is kept.
/// </remarks>
static HtmlUtils()
{
    _decodeOnly["nbsp"] = ' ';
    _decodeOnly["apos"] = '\'';
    // ISO 8859-1 Symbols
    _decodeOnly["iexcl"] = Convert.ToChar(161);
    _decodeOnly["cent"] = Convert.ToChar(162);
    _decodeOnly["pound"] = Convert.ToChar(163);
    _decodeOnly["curren"] = Convert.ToChar(164);
    _decodeOnly["yen"] = Convert.ToChar(165);
    _decodeOnly["brvbar"] = Convert.ToChar(166);
    _decodeOnly["sect"] = Convert.ToChar(167);
    _decodeOnly["uml"] = Convert.ToChar(168);
    _decodeOnly["copy"] = Convert.ToChar(169);
    _decodeOnly["ordf"] = Convert.ToChar(170);
    _decodeOnly["laquo"] = Convert.ToChar(171);
    _decodeOnly["not"] = Convert.ToChar(172);
    _decodeOnly["shy"] = Convert.ToChar(173);
    _decodeOnly["reg"] = Convert.ToChar(174);
    _decodeOnly["macr"] = Convert.ToChar(175);
    _decodeOnly["deg"] = Convert.ToChar(176);
    _decodeOnly["plusmn"] = Convert.ToChar(177);
    _decodeOnly["sup2"] = Convert.ToChar(178);
    _decodeOnly["sup3"] = Convert.ToChar(179);
    _decodeOnly["acute"] = Convert.ToChar(180);
    _decodeOnly["micro"] = Convert.ToChar(181);
    _decodeOnly["para"] = Convert.ToChar(182);
    _decodeOnly["middot"] = Convert.ToChar(183);
    _decodeOnly["cedil"] = Convert.ToChar(184);
    _decodeOnly["sup1"] = Convert.ToChar(185);
    _decodeOnly["ordm"] = Convert.ToChar(186);
    _decodeOnly["raquo"] = Convert.ToChar(187);
    _decodeOnly["frac14"] = Convert.ToChar(188);
    _decodeOnly["frac12"] = Convert.ToChar(189);
    _decodeOnly["frac34"] = Convert.ToChar(190);
    _decodeOnly["iquest"] = Convert.ToChar(191);
    _decodeOnly["times"] = Convert.ToChar(215);
    _decodeOnly["divide"] = Convert.ToChar(247);
    // ISO 8859-1 Characters
    _decodeOnly["Agrave"] = Convert.ToChar(192);
    _decodeOnly["Aacute"] = Convert.ToChar(193);
    _decodeOnly["Acirc"] = Convert.ToChar(194);
    _decodeOnly["Atilde"] = Convert.ToChar(195);
    _decodeOnly["Auml"] = Convert.ToChar(196);
    _decodeOnly["Aring"] = Convert.ToChar(197);
    _decodeOnly["AElig"] = Convert.ToChar(198);
    _decodeOnly["Ccedil"] = Convert.ToChar(199);
    _decodeOnly["Egrave"] = Convert.ToChar(200);
    _decodeOnly["Eacute"] = Convert.ToChar(201);
    _decodeOnly["Ecirc"] = Convert.ToChar(202);
    _decodeOnly["Euml"] = Convert.ToChar(203);
    _decodeOnly["Igrave"] = Convert.ToChar(204);
    _decodeOnly["Iacute"] = Convert.ToChar(205);
    _decodeOnly["Icirc"] = Convert.ToChar(206);
    _decodeOnly["Iuml"] = Convert.ToChar(207);
    _decodeOnly["ETH"] = Convert.ToChar(208);
    _decodeOnly["Ntilde"] = Convert.ToChar(209);
    _decodeOnly["Ograve"] = Convert.ToChar(210);
    _decodeOnly["Oacute"] = Convert.ToChar(211);
    _decodeOnly["Ocirc"] = Convert.ToChar(212);
    _decodeOnly["Otilde"] = Convert.ToChar(213);
    _decodeOnly["Ouml"] = Convert.ToChar(214);
    _decodeOnly["Oslash"] = Convert.ToChar(216);
    _decodeOnly["Ugrave"] = Convert.ToChar(217);
    _decodeOnly["Uacute"] = Convert.ToChar(218);
    _decodeOnly["Ucirc"] = Convert.ToChar(219);
    _decodeOnly["Uuml"] = Convert.ToChar(220);
    _decodeOnly["Yacute"] = Convert.ToChar(221);
    _decodeOnly["THORN"] = Convert.ToChar(222);
    _decodeOnly["szlig"] = Convert.ToChar(223);
    _decodeOnly["agrave"] = Convert.ToChar(224);
    _decodeOnly["aacute"] = Convert.ToChar(225);
    _decodeOnly["acirc"] = Convert.ToChar(226);
    _decodeOnly["atilde"] = Convert.ToChar(227);
    _decodeOnly["auml"] = Convert.ToChar(228);
    _decodeOnly["aring"] = Convert.ToChar(229);
    _decodeOnly["aelig"] = Convert.ToChar(230);
    _decodeOnly["ccedil"] = Convert.ToChar(231);
    _decodeOnly["egrave"] = Convert.ToChar(232);
    _decodeOnly["eacute"] = Convert.ToChar(233);
    _decodeOnly["ecirc"] = Convert.ToChar(234);
    _decodeOnly["euml"] = Convert.ToChar(235);
    _decodeOnly["igrave"] = Convert.ToChar(236);
    _decodeOnly["iacute"] = Convert.ToChar(237);
    _decodeOnly["icirc"] = Convert.ToChar(238);
    _decodeOnly["iuml"] = Convert.ToChar(239);
    _decodeOnly["eth"] = Convert.ToChar(240);
    _decodeOnly["ntilde"] = Convert.ToChar(241);
    _decodeOnly["ograve"] = Convert.ToChar(242);
    _decodeOnly["oacute"] = Convert.ToChar(243);
    _decodeOnly["ocirc"] = Convert.ToChar(244);
    _decodeOnly["otilde"] = Convert.ToChar(245);
    _decodeOnly["ouml"] = Convert.ToChar(246);
    _decodeOnly["oslash"] = Convert.ToChar(248);
    _decodeOnly["ugrave"] = Convert.ToChar(249);
    _decodeOnly["uacute"] = Convert.ToChar(250);
    _decodeOnly["ucirc"] = Convert.ToChar(251);
    _decodeOnly["uuml"] = Convert.ToChar(252);
    _decodeOnly["yacute"] = Convert.ToChar(253);
    _decodeOnly["thorn"] = Convert.ToChar(254);
    _decodeOnly["yuml"] = Convert.ToChar(255);
    // Math Symbols Supported by HTML
    _decodeOnly["forall"] = Convert.ToChar(8704);
    _decodeOnly["part"] = Convert.ToChar(8706);
    _decodeOnly["exist"] = Convert.ToChar(8707);
    _decodeOnly["empty"] = Convert.ToChar(8709);
    _decodeOnly["nabla"] = Convert.ToChar(8711);
    _decodeOnly["isin"] = Convert.ToChar(8712);
    _decodeOnly["notin"] = Convert.ToChar(8713);
    _decodeOnly["ni"] = Convert.ToChar(8715);
    _decodeOnly["prod"] = Convert.ToChar(8719);
    _decodeOnly["sum"] = Convert.ToChar(8721);
    _decodeOnly["minus"] = Convert.ToChar(8722);
    _decodeOnly["lowast"] = Convert.ToChar(8727);
    _decodeOnly["radic"] = Convert.ToChar(8730);
    _decodeOnly["prop"] = Convert.ToChar(8733);
    _decodeOnly["infin"] = Convert.ToChar(8734);
    _decodeOnly["ang"] = Convert.ToChar(8736);
    _decodeOnly["and"] = Convert.ToChar(8743);
    _decodeOnly["or"] = Convert.ToChar(8744);
    _decodeOnly["cap"] = Convert.ToChar(8745);
    _decodeOnly["cup"] = Convert.ToChar(8746);
    _decodeOnly["int"] = Convert.ToChar(8747);
    _decodeOnly["there4"] = Convert.ToChar(8756);
    _decodeOnly["sim"] = Convert.ToChar(8764);
    _decodeOnly["cong"] = Convert.ToChar(8773);
    _decodeOnly["asymp"] = Convert.ToChar(8776);
    _decodeOnly["ne"] = Convert.ToChar(8800);
    _decodeOnly["equiv"] = Convert.ToChar(8801);
    _decodeOnly["le"] = Convert.ToChar(8804);
    _decodeOnly["ge"] = Convert.ToChar(8805);
    _decodeOnly["sub"] = Convert.ToChar(8834);
    _decodeOnly["sup"] = Convert.ToChar(8835);
    _decodeOnly["nsub"] = Convert.ToChar(8836);
    _decodeOnly["sube"] = Convert.ToChar(8838);
    _decodeOnly["supe"] = Convert.ToChar(8839);
    _decodeOnly["oplus"] = Convert.ToChar(8853);
    _decodeOnly["otimes"] = Convert.ToChar(8855);
    _decodeOnly["perp"] = Convert.ToChar(8869);
    _decodeOnly["sdot"] = Convert.ToChar(8901);
    // Greek Letters Supported by HTML
    _decodeOnly["Alpha"] = Convert.ToChar(913);
    _decodeOnly["Beta"] = Convert.ToChar(914);
    _decodeOnly["Gamma"] = Convert.ToChar(915);
    _decodeOnly["Delta"] = Convert.ToChar(916);
    _decodeOnly["Epsilon"] = Convert.ToChar(917);
    _decodeOnly["Zeta"] = Convert.ToChar(918);
    _decodeOnly["Eta"] = Convert.ToChar(919);
    _decodeOnly["Theta"] = Convert.ToChar(920);
    _decodeOnly["Iota"] = Convert.ToChar(921);
    _decodeOnly["Kappa"] = Convert.ToChar(922);
    _decodeOnly["Lambda"] = Convert.ToChar(923);
    _decodeOnly["Mu"] = Convert.ToChar(924);
    _decodeOnly["Nu"] = Convert.ToChar(925);
    _decodeOnly["Xi"] = Convert.ToChar(926);
    _decodeOnly["Omicron"] = Convert.ToChar(927);
    _decodeOnly["Pi"] = Convert.ToChar(928);
    _decodeOnly["Rho"] = Convert.ToChar(929);
    _decodeOnly["Sigma"] = Convert.ToChar(931);
    _decodeOnly["Tau"] = Convert.ToChar(932);
    _decodeOnly["Upsilon"] = Convert.ToChar(933);
    _decodeOnly["Phi"] = Convert.ToChar(934);
    _decodeOnly["Chi"] = Convert.ToChar(935);
    _decodeOnly["Psi"] = Convert.ToChar(936);
    _decodeOnly["Omega"] = Convert.ToChar(937);
    _decodeOnly["alpha"] = Convert.ToChar(945);
    _decodeOnly["beta"] = Convert.ToChar(946);
    _decodeOnly["gamma"] = Convert.ToChar(947);
    _decodeOnly["delta"] = Convert.ToChar(948);
    _decodeOnly["epsilon"] = Convert.ToChar(949);
    _decodeOnly["zeta"] = Convert.ToChar(950);
    _decodeOnly["eta"] = Convert.ToChar(951);
    _decodeOnly["theta"] = Convert.ToChar(952);
    _decodeOnly["iota"] = Convert.ToChar(953);
    _decodeOnly["kappa"] = Convert.ToChar(954);
    _decodeOnly["lambda"] = Convert.ToChar(955);
    _decodeOnly["mu"] = Convert.ToChar(956);
    _decodeOnly["nu"] = Convert.ToChar(957);
    _decodeOnly["xi"] = Convert.ToChar(958);
    _decodeOnly["omicron"] = Convert.ToChar(959);
    _decodeOnly["pi"] = Convert.ToChar(960);
    _decodeOnly["rho"] = Convert.ToChar(961);
    _decodeOnly["sigmaf"] = Convert.ToChar(962);
    _decodeOnly["sigma"] = Convert.ToChar(963);
    _decodeOnly["tau"] = Convert.ToChar(964);
    _decodeOnly["upsilon"] = Convert.ToChar(965);
    _decodeOnly["phi"] = Convert.ToChar(966);
    _decodeOnly["chi"] = Convert.ToChar(967);
    _decodeOnly["psi"] = Convert.ToChar(968);
    _decodeOnly["omega"] = Convert.ToChar(969);
    _decodeOnly["thetasym"] = Convert.ToChar(977);
    _decodeOnly["upsih"] = Convert.ToChar(978);
    _decodeOnly["piv"] = Convert.ToChar(982);
    // Other Entities Supported by HTML
    _decodeOnly["OElig"] = Convert.ToChar(338);
    _decodeOnly["oelig"] = Convert.ToChar(339);
    _decodeOnly["Scaron"] = Convert.ToChar(352);
    _decodeOnly["scaron"] = Convert.ToChar(353);
    _decodeOnly["Yuml"] = Convert.ToChar(376);
    _decodeOnly["fnof"] = Convert.ToChar(402);
    _decodeOnly["circ"] = Convert.ToChar(710);
    _decodeOnly["tilde"] = Convert.ToChar(732);
    _decodeOnly["ndash"] = Convert.ToChar(8211);
    _decodeOnly["mdash"] = Convert.ToChar(8212);
    _decodeOnly["lsquo"] = Convert.ToChar(8216);
    _decodeOnly["rsquo"] = Convert.ToChar(8217);
    _decodeOnly["sbquo"] = Convert.ToChar(8218);
    _decodeOnly["ldquo"] = Convert.ToChar(8220);
    _decodeOnly["rdquo"] = Convert.ToChar(8221);
    _decodeOnly["bdquo"] = Convert.ToChar(8222);
    _decodeOnly["dagger"] = Convert.ToChar(8224);
    _decodeOnly["Dagger"] = Convert.ToChar(8225);
    _decodeOnly["bull"] = Convert.ToChar(8226);
    _decodeOnly["hellip"] = Convert.ToChar(8230);
    _decodeOnly["permil"] = Convert.ToChar(8240);
    _decodeOnly["prime"] = Convert.ToChar(8242);
    _decodeOnly["Prime"] = Convert.ToChar(8243);
    _decodeOnly["lsaquo"] = Convert.ToChar(8249);
    _decodeOnly["rsaquo"] = Convert.ToChar(8250);
    _decodeOnly["oline"] = Convert.ToChar(8254);
    _decodeOnly["euro"] = Convert.ToChar(8364);
    // The trade mark sign is U+2122 (8482). 153 is its Windows-1252 byte value, which
    // as a Unicode code point is an invisible C1 control character.
    _decodeOnly["trade"] = Convert.ToChar(8482);
    _decodeOnly["larr"] = Convert.ToChar(8592);
    _decodeOnly["uarr"] = Convert.ToChar(8593);
    _decodeOnly["rarr"] = Convert.ToChar(8594);
    _decodeOnly["darr"] = Convert.ToChar(8595);
    _decodeOnly["harr"] = Convert.ToChar(8596);
    _decodeOnly["crarr"] = Convert.ToChar(8629);
    _decodeOnly["lceil"] = Convert.ToChar(8968);
    _decodeOnly["rceil"] = Convert.ToChar(8969);
    _decodeOnly["lfloor"] = Convert.ToChar(8970);
    _decodeOnly["rfloor"] = Convert.ToChar(8971);
    _decodeOnly["loz"] = Convert.ToChar(9674);
    _decodeOnly["spades"] = Convert.ToChar(9824);
    _decodeOnly["clubs"] = Convert.ToChar(9827);
    _decodeOnly["hearts"] = Convert.ToChar(9829);
    _decodeOnly["diams"] = Convert.ToChar(9830);
}
/// <summary>
/// Tells whether the given tag name appears in the list of html tags that don't
/// have content.
/// </summary>
/// <param name="tagName">the tag name to look up; it is compared as-is, so it must already be lower case</param>
/// <returns>true - the name is in the list, false - otherwise</returns>
public static bool IsSingleTag(string tagName)
{
    int position = _list.IndexOf(tagName);
    return position >= 0;
}
/// <summary>
/// Turns an html encoded string back into a regular string.<br/>
/// Numeric character references are decoded first, then named entities from the
/// decode-only table, and finally the lt, gt, quot and amp entities.
/// </summary>
/// <param name="str">the string to decode</param>
/// <returns>the decoded string; a null or empty input is returned unchanged</returns>
public static string DecodeHtml(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return str;
    }

    string decoded = DecodeHtmlCharByName(DecodeHtmlCharByCode(str));
    for (int i = 0; i < _encodeDecode.Length; i++)
    {
        KeyValuePair<string, string> pair = _encodeDecode[i];
        decoded = decoded.Replace(pair.Key, pair.Value);
    }
    return decoded;
}
/// <summary>
/// Turns a regular string into an html encoded string.<br/>
/// The less-than, greater-than, double-quote and ampersand characters are replaced
/// by their entities.
/// </summary>
/// <param name="str">the string to encode</param>
/// <returns>the encoded string; a null or empty input is returned unchanged</returns>
public static string EncodeHtml(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return str;
    }

    string encoded = str;
    // Walk the pair table from its last entry to its first.
    int i = _encodeDecode.Length;
    while (i-- > 0)
    {
        KeyValuePair<string, string> pair = _encodeDecode[i];
        encoded = encoded.Replace(pair.Value, pair.Key);
    }
    return encoded;
}
#region Private methods
/// <summary>
/// Tests whether a character is an ASCII decimal digit (0-9) or, when hex checking
/// is requested, also one of the letters a-f / A-F.
/// </summary>
/// <param name="ch">the character to test</param>
/// <param name="hex">true to accept hexadecimal letters as well</param>
/// <returns>true - the character is a digit in the requested base, false - it is not</returns>
private static bool IsDigit(char ch, bool hex)
{
    if (ch >= '0' && ch <= '9')
    {
        return true;
    }
    if (!hex)
    {
        return false;
    }
    bool lowerHex = ch >= 'a' && ch <= 'f';
    bool upperHex = ch >= 'A' && ch <= 'F';
    return lowerHex || upperHex;
}
/// <summary>
/// Converts a digit character to its numeric value.
/// </summary>
/// <param name="ch">the character to convert</param>
/// <param name="hex">true to also convert the hexadecimal letters a-f / A-F (to 10-15)</param>
/// <returns>the value of the digit, or 0 when the character is not a digit in the requested base</returns>
private static int ToDigit(char ch, bool hex)
{
    bool isDecimal = ch >= '0' && ch <= '9';
    if (isDecimal)
    {
        return ch - '0';
    }
    if (hex)
    {
        if (ch >= 'a' && ch <= 'f')
        {
            return 10 + (ch - 'a');
        }
        if (ch >= 'A' && ch <= 'F')
        {
            return 10 + (ch - 'A');
        }
    }
    return 0;
}
/// <summary>
/// Decode html special characters encoded using a numeric character reference,
/// decimal (&amp;#8364;) or hexadecimal (&amp;#x20AC;). The trailing semicolon is optional.
/// </summary>
/// <remarks>
/// Code points up to U+FFFF become a single char, code points up to U+10FFFF become a
/// surrogate pair, and anything larger becomes U+FFFD (the replacement character).
/// An ampersand-hash that is not followed by at least one digit is left untouched.
/// </remarks>
/// <param name="str">the string to decode (must not be null)</param>
/// <returns>decoded string</returns>
private static string DecodeHtmlCharByCode(string str)
{
    int idx = str.IndexOf("&#", StringComparison.Ordinal);
    if (idx < 0)
    {
        return str;
    }

    // Build the result in one pass instead of Remove/Insert on the whole string per match.
    StringBuilder sb = new StringBuilder(str.Length);
    int copied = 0; // index of the first source character not yet written to sb
    while (idx > -1)
    {
        // Hex form needs an 'x' plus at least one more character after it.
        bool hex = str.Length > idx + 3 && (str[idx + 2] == 'x' || str[idx + 2] == 'X');
        int digitsStart = idx + 2 + (hex ? 1 : 0);
        int endIdx = digitsStart;
        long num = 0;
        while (endIdx < str.Length && IsDigit(str[endIdx], hex))
        {
            // Stop accumulating once the value is already out of Unicode range so an
            // arbitrarily long digit run cannot overflow; the digits are still consumed.
            if (num <= 0x10FFFF)
            {
                num = num * (hex ? 16 : 10) + ToDigit(str[endIdx], hex);
            }
            endIdx++;
        }

        if (endIdx == digitsStart)
        {
            // No digits at all: this is not a character reference, keep the text as-is.
            idx = str.IndexOf("&#", idx + 2, StringComparison.Ordinal);
            continue;
        }

        if (endIdx < str.Length && str[endIdx] == ';')
        {
            endIdx++;
        }

        sb.Append(str, copied, idx - copied);
        if (num <= char.MaxValue)
        {
            sb.Append((char)num);
        }
        else if (num <= 0x10FFFF)
        {
            // Outside the BMP a single char cannot hold the value; emit a surrogate pair.
            sb.Append(char.ConvertFromUtf32((int)num));
        }
        else
        {
            sb.Append('\uFFFD');
        }
        copied = endIdx;

        // Resume after the reference, so a decoded '&' is never re-scanned as markup.
        idx = str.IndexOf("&#", endIdx, StringComparison.Ordinal);
    }
    sb.Append(str, copied, str.Length - copied);
    return sb.ToString();
}
/// <summary>
/// Decode html special characters encoded using a char entity name (&amp;euro;).
/// Only names present in the decode-only table are replaced; anything else is left
/// untouched.
/// </summary>
/// <param name="str">the string to decode (must not be null)</param>
/// <returns>decoded string</returns>
private static string DecodeHtmlCharByName(string str)
{
    // Longest name registered in the decode-only table ("thetasym"). The previous
    // limit only admitted names of up to six characters, so the seven- and
    // eight-letter names (epsilon, omicron, upsilon, thetasym, ...) never matched.
    const int maxNameLength = 8;

    int idx = str.IndexOf('&');
    if (idx < 0)
    {
        return str;
    }

    // Build the result in one pass instead of Remove/Insert on the whole string per match.
    StringBuilder sb = new StringBuilder(str.Length);
    int copied = 0; // index of the first source character not yet written to sb
    while (idx > -1)
    {
        int next = idx + 1;

        // Look for the terminating ';' only within the span a known name could occupy,
        // rather than scanning the rest of the string for every ampersand.
        int window = Math.Min(maxNameLength + 1, str.Length - next);
        int endIdx = window > 0 ? str.IndexOf(';', next, window) : -1;
        if (endIdx > -1)
        {
            string key = str.Substring(next, endIdx - next);
            char c;
            if (_decodeOnly.TryGetValue(key, out c))
            {
                sb.Append(str, copied, idx - copied);
                sb.Append(c);
                copied = endIdx + 1;
                next = copied; // continue after the entity that was just replaced
            }
        }

        idx = next < str.Length ? str.IndexOf('&', next) : -1;
    }
    sb.Append(str, copied, str.Length - copied);
    return sb.ToString();
}
#endregion
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment