Skip to content

Instantly share code, notes, and snippets.

@jokecamp
Created November 18, 2013 14:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save jokecamp/7529013 to your computer and use it in GitHub Desktop.
Save jokecamp/7529013 to your computer and use it in GitHub Desktop.
Various ways of sanitizing XML input
/// <summary>
/// Removes every character from <paramref name="text"/> that is not allowed in an
/// XML 1.0 document, keeping well-formed surrogate pairs intact.
/// </summary>
/// <remarks>
/// Background:
/// http://seattlesoftware.wordpress.com/2008/09/11/hexadecimal-value-0-is-an-invalid-character/
/// http://stackoverflow.com/questions/157646/best-way-to-encode-text-data-for-xml/732135#732135
/// </remarks>
/// <param name="text">The text to sanitize.</param>
/// <returns>A copy of <paramref name="text"/> without illegal XML characters.</returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
public string Clean(string text)
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }

    var cleaned = new StringBuilder(text.Length);
    for (int i = 0; i < text.Length; i++)
    {
        char current = text[i];

        if (XmlConvert.IsXmlChar(current))
        {
            cleaned.Append(current);
        }
        else if (i + 1 < text.Length && char.IsSurrogatePair(current, text[i + 1]))
        {
            // IsXmlChar rejects individual surrogate code units, but a high+low pair
            // encodes a code point in #x10000-#x10FFFF, which XML allows. Keep both
            // halves and skip past the low surrogate.
            cleaned.Append(current).Append(text[i + 1]);
            i++;
        }
        // Anything else (control characters, U+FFFE/U+FFFF, unpaired surrogates) is dropped.
    }

    return cleaned.ToString();
}
/// <summary>
/// Removes every character from <paramref name="text"/> that is not allowed in an
/// XML 1.0 document, using a regular expression.
/// </summary>
/// <param name="text">The text to sanitize.</param>
/// <returns>A copy of <paramref name="text"/> without illegal XML characters.</returns>
public static string CleanInvalidXmlChars(string text)
{
    // From xml spec valid chars:
    // #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    // any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
    //
    // .NET regexes match UTF-16 code units, and "\x" consumes exactly two hex digits,
    // so the ranges must be written with "\uXXXX". Code points #x10000-#x10FFFF appear
    // in a string as a high surrogate followed by a low surrogate; those pairs are
    // valid, so only unpaired surrogates are matched for removal.
    const string invalidXmlChars =
        @"[^\u0009\u000A\u000D\u0020-\uFFFD]" +        // control characters, U+FFFE, U+FFFF
        @"|[\uD800-\uDBFF](?![\uDC00-\uDFFF])" +       // high surrogate not followed by a low one
        @"|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]";       // low surrogate not preceded by a high one

    return Regex.Replace(text, invalidXmlChars, "");
}
/// <summary>
/// Returns a copy of <paramref name="xml"/> containing only the characters that
/// IsLegalXmlChar accepts.
/// </summary>
/// <param name="xml">The text to sanitize.</param>
/// <returns>The input with every illegal XML character removed.</returns>
/// <exception cref="ArgumentNullException"><paramref name="xml"/> is null.</exception>
private string SanitizeXmlString(string xml)
{
    if (xml == null)
    {
        throw new ArgumentNullException("xml");
    }

    var buffer = new StringBuilder(xml.Length);
    for (int i = 0; i < xml.Length; i++)
    {
        char c = xml[i];

        // A code point above U+FFFF is stored as two UTF-16 code units. Checking each
        // unit on its own would always reject it (surrogates are not legal XML chars),
        // so decode the pair and validate the real code point instead.
        if (i + 1 < xml.Length && char.IsSurrogatePair(c, xml[i + 1]))
        {
            char low = xml[i + 1];
            if (IsLegalXmlChar(char.ConvertToUtf32(c, low)))
            {
                buffer.Append(c).Append(low);
            }

            i++; // the low surrogate has been consumed together with the high one
            continue;
        }

        // Single code unit; an unpaired surrogate falls outside the legal ranges
        // and is dropped here.
        if (IsLegalXmlChar(c))
        {
            buffer.Append(c);
        }
    }

    return buffer.ToString();
}
/// <summary>
/// Tells whether a Unicode code point is permitted by the XML 1.0 Char production:
/// #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF].
/// </summary>
/// <param name="character">The code point to test.</param>
/// <returns>true when the code point is a legal XML character; otherwise false.</returns>
private bool IsLegalXmlChar(int character)
{
    // The only control characters allowed: tab, line feed, carriage return.
    if (character == 0x9 || character == 0xA || character == 0xD)
    {
        return true;
    }

    // Everything else below the space character (including negatives) is illegal.
    if (character < 0x20)
    {
        return false;
    }

    // Basic Multilingual Plane up to the start of the surrogate block.
    if (character <= 0xD7FF)
    {
        return true;
    }

    // Surrogate block 0xD800-0xDFFF.
    if (character < 0xE000)
    {
        return false;
    }

    // Rest of the BMP, minus the non-characters 0xFFFE and 0xFFFF.
    if (character <= 0xFFFD)
    {
        return true;
    }

    if (character < 0x10000)
    {
        return false;
    }

    // Supplementary planes.
    return character <= 0x10FFFF;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment