Last active
August 29, 2015 14:12
-
-
Save justinvp/0c1b5faf72349b56a2ed to your computer and use it in GitHub Desktop.
RegexCharClass Test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Globalization; | |
using System.Text; | |
using Xunit; | |
public class RegexCharClassTests | |
{ | |
/// <summary>
/// Verifies that every constant exposed by <see cref="Old"/> (computed in its static
/// constructor) is equal to the corresponding hard-coded literal exposed by <see cref="New"/>.
/// </summary>
[Fact]
public void TestConstants()
{
    // Category-name lookup table.
    Assert.Equal(Old._definedCategories, New.s_definedCategories);

    // Raw category strings.
    Assert.Equal(Old.s_notSpace, New.s_notSpace);
    Assert.Equal(Old.s_word, New.s_word);
    Assert.Equal(Old.s_notWord, New.s_notWord);

    // Complete character-class strings.
    Assert.Equal(Old.SpaceClass, New.SpaceClass);
    Assert.Equal(Old.NotSpaceClass, New.NotSpaceClass);
    Assert.Equal(Old.WordClass, New.WordClass);
    Assert.Equal(Old.NotWordClass, New.NotWordClass);
    Assert.Equal(Old.DigitClass, New.DigitClass);
    Assert.Equal(Old.NotDigitClass, New.NotDigitClass);
}
/// <summary>
/// Character-class constants written out as string literals.
/// Each single-category entry is the numeric value of a <c>UnicodeCategory</c> member plus one,
/// stored as a <c>char</c>; group entries are the member chars wrapped in a leading and
/// trailing <c>\u0000</c>.
/// </summary>
internal sealed class New
{
    public static readonly String s_notSpace = "\uFF9C";
    public static readonly String s_word = "\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000";
    public static readonly String s_notWord = "\u0000\uFFFE\uFFFC\uFFFB\uFFFD\uFFFF\uFFFA\uFFF7\uFFED\u0000";

    public static readonly String SpaceClass = "\u0000\u0000\u0001\u0064";
    public static readonly String NotSpaceClass = "\u0001\u0000\u0001\u0064";
    // The third char (\u000A == 10) is the length of s_word, which follows it.
    public static readonly String WordClass = "\u0000\u0000\u000A\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000";
    public static readonly String NotWordClass = "\u0001\u0000\u000A\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000";
    public static readonly String DigitClass = "\u0000\u0000\u0001\u0009";
    public static readonly String NotDigitClass = "\u0000\u0000\u0001\uFFF7";

    public static readonly Dictionary<String, String> s_definedCategories = new Dictionary<String, String>
    {
        // Others
        { "Cc", "\u000F" }, // UnicodeCategory.Control + 1
        { "Cf", "\u0010" }, // UnicodeCategory.Format + 1
        { "Cn", "\u001E" }, // UnicodeCategory.OtherNotAssigned + 1
        { "Co", "\u0012" }, // UnicodeCategory.PrivateUse + 1
        { "Cs", "\u0011" }, // UnicodeCategory.Surrogate + 1
        { "C", "\u0000\u000F\u0010\u001E\u0012\u0011\u0000" },

        // Letters
        { "Ll", "\u0002" }, // UnicodeCategory.LowercaseLetter + 1
        { "Lm", "\u0004" }, // UnicodeCategory.ModifierLetter + 1
        { "Lo", "\u0005" }, // UnicodeCategory.OtherLetter + 1
        { "Lt", "\u0003" }, // UnicodeCategory.TitlecaseLetter + 1
        { "Lu", "\u0001" }, // UnicodeCategory.UppercaseLetter + 1
        { "L", "\u0000\u0002\u0004\u0005\u0003\u0001\u0000" },

        // InternalRegexIgnoreCase = {LowercaseLetter} OR {TitlecaseLetter} OR {UppercaseLetter}
        // !!!This category should only ever be used in conjunction with RegexOptions.IgnoreCase code paths!!!
        { "__InternalRegexIgnoreCase__", "\u0000\u0002\u0003\u0001\u0000" },

        // Marks
        { "Mc", "\u0007" }, // UnicodeCategory.SpacingCombiningMark + 1
        { "Me", "\u0008" }, // UnicodeCategory.EnclosingMark + 1
        { "Mn", "\u0006" }, // UnicodeCategory.NonSpacingMark + 1
        { "M", "\u0000\u0007\u0008\u0006\u0000" },

        // Numbers
        { "Nd", "\u0009" }, // UnicodeCategory.DecimalDigitNumber + 1
        { "Nl", "\u000A" }, // UnicodeCategory.LetterNumber + 1
        { "No", "\u000B" }, // UnicodeCategory.OtherNumber + 1
        { "N", "\u0000\u0009\u000A\u000B\u0000" },

        // Punctuation
        { "Pc", "\u0013" }, // UnicodeCategory.ConnectorPunctuation + 1
        { "Pd", "\u0014" }, // UnicodeCategory.DashPunctuation + 1
        { "Pe", "\u0016" }, // UnicodeCategory.ClosePunctuation + 1
        { "Po", "\u0019" }, // UnicodeCategory.OtherPunctuation + 1
        { "Ps", "\u0015" }, // UnicodeCategory.OpenPunctuation + 1
        { "Pf", "\u0018" }, // UnicodeCategory.FinalQuotePunctuation + 1
        { "Pi", "\u0017" }, // UnicodeCategory.InitialQuotePunctuation + 1
        { "P", "\u0000\u0013\u0014\u0016\u0019\u0015\u0018\u0017\u0000" },

        // Symbols
        { "Sc", "\u001B" }, // UnicodeCategory.CurrencySymbol + 1
        { "Sk", "\u001C" }, // UnicodeCategory.ModifierSymbol + 1
        { "Sm", "\u001A" }, // UnicodeCategory.MathSymbol + 1
        { "So", "\u001D" }, // UnicodeCategory.OtherSymbol + 1
        { "S", "\u0000\u001B\u001C\u001A\u001D\u0000" },

        // Separators
        { "Zl", "\u000D" }, // UnicodeCategory.LineSeparator + 1
        { "Zp", "\u000E" }, // UnicodeCategory.ParagraphSeparator + 1
        { "Zs", "\u000C" }, // UnicodeCategory.SpaceSeparator + 1
        { "Z", "\u0000\u000D\u000E\u000C\u0000" },
    };
}
/// <summary>
/// Character-class constants computed at type-initialization time from
/// <see cref="UnicodeCategory"/> values.
/// </summary>
/// <remarks>
/// Each category is stored as a <c>char</c> whose value is the enum value plus one.
/// <see cref="UnicodeCategory"/> is zero based, so the offset keeps every category non-zero
/// and lets a negated value be told apart from a positive one. A group of categories is
/// written as the member chars wrapped in a leading and trailing <c>GroupChar</c>.
/// </remarks>
internal sealed class Old
{
    private const char GroupChar = (char)0;
    private static readonly String s_internalRegexIgnoreCase = "__InternalRegexIgnoreCase__";

    // Fixed-width \u escapes are used instead of the variable-length \x form; the values are unchanged.
    private static readonly String s_space = "\u0064";

    // Field initializers run in textual order, so s_space is already set here.
    public static readonly String s_notSpace = NegateCategory(s_space);

    public static readonly String s_word;
    public static readonly String s_notWord;
    public static readonly String SpaceClass;
    public static readonly String NotSpaceClass;
    public static readonly String WordClass;
    public static readonly String NotWordClass;
    public static readonly String DigitClass;
    public static readonly String NotDigitClass;
    public static Dictionary<String, String> _definedCategories;

    static Old()
    {
        // The table is filled in a local dictionary and published to the static field only
        // once complete, so the field never refers to a half-built table.
        // 38 = 30 single categories + 7 groups + the internal ignore-case entry.
        Dictionary<String, String> categories = new Dictionary<String, String>(38);

        // Others
        AddGroup(categories, "C",
            new[] { "Cc", "Cf", "Cn", "Co", "Cs" },
            new[]
            {
                UnicodeCategory.Control,
                UnicodeCategory.Format,
                UnicodeCategory.OtherNotAssigned,
                UnicodeCategory.PrivateUse,
                UnicodeCategory.Surrogate,
            });

        // Letters
        String letterGroup = AddGroup(categories, "L",
            new[] { "Ll", "Lm", "Lo", "Lt", "Lu" },
            new[]
            {
                UnicodeCategory.LowercaseLetter,
                UnicodeCategory.ModifierLetter,
                UnicodeCategory.OtherLetter,
                UnicodeCategory.TitlecaseLetter,
                UnicodeCategory.UppercaseLetter,
            });

        // InternalRegexIgnoreCase = {LowercaseLetter} OR {TitlecaseLetter} OR {UppercaseLetter}
        // !!!This category should only ever be used in conjunction with RegexOptions.IgnoreCase code paths!!!
        categories[s_internalRegexIgnoreCase] = new String(new[]
        {
            GroupChar,
            ToCategoryChar(UnicodeCategory.LowercaseLetter),
            ToCategoryChar(UnicodeCategory.TitlecaseLetter),
            ToCategoryChar(UnicodeCategory.UppercaseLetter),
            GroupChar,
        });

        // Marks
        AddGroup(categories, "M",
            new[] { "Mc", "Me", "Mn" },
            new[]
            {
                UnicodeCategory.SpacingCombiningMark,
                UnicodeCategory.EnclosingMark,
                UnicodeCategory.NonSpacingMark,
            });

        // Numbers
        AddGroup(categories, "N",
            new[] { "Nd", "Nl", "No" },
            new[]
            {
                UnicodeCategory.DecimalDigitNumber,
                UnicodeCategory.LetterNumber,
                UnicodeCategory.OtherNumber,
            });

        // Punctuation
        AddGroup(categories, "P",
            new[] { "Pc", "Pd", "Pe", "Po", "Ps", "Pf", "Pi" },
            new[]
            {
                UnicodeCategory.ConnectorPunctuation,
                UnicodeCategory.DashPunctuation,
                UnicodeCategory.ClosePunctuation,
                UnicodeCategory.OtherPunctuation,
                UnicodeCategory.OpenPunctuation,
                UnicodeCategory.FinalQuotePunctuation,   // Pf: final quote
                UnicodeCategory.InitialQuotePunctuation, // Pi: initial quote
            });

        // Symbols
        AddGroup(categories, "S",
            new[] { "Sc", "Sk", "Sm", "So" },
            new[]
            {
                UnicodeCategory.CurrencySymbol,
                UnicodeCategory.ModifierSymbol,
                UnicodeCategory.MathSymbol,
                UnicodeCategory.OtherSymbol,
            });

        // Separators
        AddGroup(categories, "Z",
            new[] { "Zl", "Zp", "Zs" },
            new[]
            {
                UnicodeCategory.LineSeparator,
                UnicodeCategory.ParagraphSeparator,
                UnicodeCategory.SpaceSeparator,
            });

        // Word group: every letter category, then non-spacing marks, decimal digits and
        // connector punctuation, wrapped in group markers.
        StringBuilder word = new StringBuilder(10);
        word.Append(GroupChar);
        word.Append(letterGroup, 1, letterGroup.Length - 2); // letter chars without the two markers
        word.Append(ToCategoryChar(UnicodeCategory.NonSpacingMark));
        word.Append(ToCategoryChar(UnicodeCategory.DecimalDigitNumber));
        word.Append(ToCategoryChar(UnicodeCategory.ConnectorPunctuation));
        word.Append(GroupChar);

        s_word = word.ToString();
        s_notWord = NegateCategory(s_word);

        SpaceClass = "\u0000\u0000\u0001" + s_space;
        NotSpaceClass = "\u0001\u0000\u0001" + s_space;
        WordClass = "\u0000\u0000" + (char)s_word.Length + s_word;
        NotWordClass = "\u0001\u0000" + (char)s_word.Length + s_word;
        DigitClass = "\u0000\u0000\u0001" + ToCategoryChar(UnicodeCategory.DecimalDigitNumber);
        NotDigitClass = "\u0000\u0000\u0001" + unchecked((char)(-((int)UnicodeCategory.DecimalDigitNumber + 1)));

        _definedCategories = categories;
    }

    /// <summary>
    /// Encodes a <see cref="UnicodeCategory"/> as a <c>char</c> whose value is the enum value plus one.
    /// </summary>
    private static char ToCategoryChar(UnicodeCategory category)
    {
        return (char)((int)category + 1);
    }

    /// <summary>
    /// Registers each member category under its two-letter name and the whole group under
    /// <paramref name="groupName"/>.
    /// </summary>
    /// <param name="categories">Table that receives the entries.</param>
    /// <param name="groupName">Key for the group entry, e.g. "L".</param>
    /// <param name="names">Keys of the member entries; parallel to <paramref name="members"/>.</param>
    /// <param name="members">Categories belonging to the group, in the order they are emitted.</param>
    /// <returns>The group string: <c>GroupChar</c>, the encoded members, <c>GroupChar</c>.</returns>
    private static String AddGroup(
        Dictionary<String, String> categories,
        String groupName,
        String[] names,
        UnicodeCategory[] members)
    {
        StringBuilder group = new StringBuilder(members.Length + 2);
        group.Append(GroupChar);

        for (int i = 0; i < members.Length; i++)
        {
            char encoded = ToCategoryChar(members[i]);
            categories[names[i]] = encoded.ToString();
            group.Append(encoded);
        }

        // The closing marker is always written explicitly rather than reused from a previous group.
        group.Append(GroupChar);

        String result = group.ToString();
        categories[groupName] = result;
        return result;
    }

    /// <summary>
    /// Returns a string in which every char of <paramref name="category"/> is replaced by its
    /// 16-bit two's-complement negation (so <c>\u0064</c> becomes <c>\uFF9C</c> and
    /// <c>\u0000</c> stays <c>\u0000</c>).
    /// </summary>
    /// <param name="category">The category string to negate; may be null.</param>
    /// <returns>The negated string, or null when <paramref name="category"/> is null.</returns>
    private static string NegateCategory(string category)
    {
        if (category == null)
        {
            return null;
        }

        StringBuilder sb = new StringBuilder(category.Length);
        for (int i = 0; i < category.Length; i++)
        {
            // The narrowing casts wrap around on purpose; 'unchecked' keeps that behaviour
            // even when the assembly is compiled with overflow checking enabled.
            short ch = unchecked((short)category[i]);
            sb.Append(unchecked((char)-ch));
        }

        return sb.ToString();
    }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment