Skip to content

Instantly share code, notes, and snippets.

@FabienDehopre
Last active January 27, 2021 08:23
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save FabienDehopre/5245476 to your computer and use it in GitHub Desktop.
Save FabienDehopre/5245476 to your computer and use it in GitHub Desktop.
Validate C# identifier name
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
/// <summary>
/// Extension methods for checking whether a string is a valid C# identifier.
/// </summary>
public static class IdentifierExtensions
{
    // definition of a valid C# identifier: http://msdn.microsoft.com/en-us/library/aa664670(v=vs.71).aspx
    //
    // The five constants below hold the *contents* of a regex character class (no
    // surrounding brackets), so they can be concatenated into a single class. Using one
    // character class per position, instead of nested alternations with nested
    // quantifiers, keeps matching linear and avoids catastrophic backtracking on long
    // inputs that end in an invalid character.
    private const string FORMATTING_CHARACTER = @"\p{Cf}";
    private const string CONNECTING_CHARACTER = @"\p{Pc}";
    private const string DECIMAL_DIGIT_CHARACTER = @"\p{Nd}";
    private const string COMBINING_CHARACTER = @"\p{Mn}\p{Mc}";
    private const string LETTER_CHARACTER = @"\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}";

    // Any character allowed after the first one.
    private const string IDENTIFIER_PART_CHARACTER = "[" +
                                                     LETTER_CHARACTER +
                                                     DECIMAL_DIGIT_CHARACTER +
                                                     CONNECTING_CHARACTER +
                                                     COMBINING_CHARACTER +
                                                     FORMATTING_CHARACTER + "]";

    // A letter or an underscore.
    private const string IDENTIFIER_START_CHARACTER = "[" + LETTER_CHARACTER + "_]";

    private const string IDENTIFIER_OR_KEYWORD = IDENTIFIER_START_CHARACTER + IDENTIFIER_PART_CHARACTER + "*";

    // C# keywords: http://msdn.microsoft.com/en-us/library/x53a06bb(v=vs.71).aspx
    // Keywords are case-sensitive, hence the explicit ordinal comparer.
    private static readonly HashSet<string> _keywords = new HashSet<string>(StringComparer.Ordinal)
    {
        "__arglist", "__makeref", "__reftype", "__refvalue",
        "abstract", "as", "base", "bool",
        "break", "byte", "case", "catch",
        "char", "checked", "class", "const",
        "continue", "decimal", "default", "delegate",
        "do", "double", "else", "enum",
        "event", "explicit", "extern", "false",
        "finally", "fixed", "float", "for",
        "foreach", "goto", "if", "implicit",
        "in", "int", "interface", "internal",
        "is", "lock", "long", "namespace",
        "new", "null", "object", "operator",
        "out", "override", "params", "private",
        "protected", "public", "readonly", "ref",
        "return", "sbyte", "sealed", "short",
        "sizeof", "stackalloc", "static", "string",
        "struct", "switch", "this", "throw",
        "true", "try", "typeof", "uint",
        "ulong", "unchecked", "unsafe", "ushort",
        "using", "virtual", "volatile", "void",
        "while"
    };

    // \A and \z anchor to the absolute start and end of the input. "$" would also match
    // just before a trailing '\n' and therefore accept "foo\n".
    private static readonly Regex _validIdentifierRegex =
        new Regex(@"\A" + IDENTIFIER_OR_KEYWORD + @"\z", RegexOptions.Compiled);

    /// <summary>
    /// Determines whether <paramref name="identifier"/> is a valid C# identifier.
    /// </summary>
    /// <param name="identifier">The candidate identifier; may be <c>null</c>.</param>
    /// <returns>
    /// <c>true</c> when the normalized text is either a non-keyword identifier, or an
    /// <c>@</c>-prefixed (verbatim) identifier, which may be a keyword;
    /// <c>false</c> otherwise, including for <c>null</c>, empty, whitespace-only or
    /// ill-formed Unicode input.
    /// </returns>
    public static bool IsValidIdentifier(this string identifier)
    {
        if (string.IsNullOrWhiteSpace(identifier))
        {
            return false;
        }

        string normalizedIdentifier;
        try
        {
            normalizedIdentifier = identifier.Normalize();
        }
        catch (ArgumentException)
        {
            // Normalize rejects ill-formed UTF-16 (e.g. an unpaired surrogate);
            // such text can never be an identifier.
            return false;
        }

        // Verbatim identifier: "@" followed by an identifier or a keyword.
        // Ordinal check so culture-specific ignorable characters cannot interfere.
        if (normalizedIdentifier.StartsWith("@", StringComparison.Ordinal))
        {
            return _validIdentifierRegex.IsMatch(normalizedIdentifier.Substring(1));
        }

        // Plain identifier: must match the grammar and must not be a reserved keyword.
        return _validIdentifierRegex.IsMatch(normalizedIdentifier)
               && !_keywords.Contains(normalizedIdentifier);
    }
}
@ilyvion
Copy link

ilyvion commented Mar 28, 2013

For performance purposes, it's probably better to move the keyword array and identifier regex strings as well as the Regex itself out as static fields instead of keeping them in-method.

@FabienDehopre
Copy link
Author

I agree with you. So I updated the gist to move all the constants and the regex out of the method.

@vladipus
Copy link

License?

@nuno-andre
Copy link

It would be worthwhile adding contextual keywords to the list. There have been no new reserved keywords from C# 1.0 to avoid breaking older code, but they are reserved in their context.
https://gist.github.com/nuno-andre/ae449bbfa8d5d4c98746050a5d10793a

@FabienDehopre
Copy link
Author

License?

It's free to use.

@SolidAlloy
Copy link

Please replace "unchekeced" with "unchecked" in the keywords array.

@FabienDehopre
Copy link
Author

@SolidAlloy I fixed the typo you reported. Thanks

@bpierson
Copy link

You could probably make this faster by replacing the string[] _keywords with HashSet<string> _keywords. In line 60 where you are using Contains(...), you are "secretly" using LINQ, which is secretly downcasting your string[] to IEnumerable<string>. A good optimization for things that should run fast is to see if you can get rid of using System.Linq at the top of your code.

@bpierson
Copy link

Here is a test that highlights the difference. It can be quite dramatic with larger inputs:

/// <summary>
/// Micro-benchmark that times keyword membership checks against a plain
/// <c>string[]</c> (via <c>Contains</c>) and against a <c>HashSet&lt;string&gt;</c>.
/// </summary>
public class LINQ_VS_HashSet
{
	// Keyword list held in an array; element order is kept as-is because it
	// influences how long a linear search takes.
	private static readonly string[] _array = new[]
	{
		"abstract",  "event",      "new",        "struct",
		"as",        "explicit",   "null",       "switch",
		"base",      "extern",     "object",     "this",
		"bool",      "false",      "operator",   "throw",
		"break",     "finally",    "out",        "true",
		"byte",      "fixed",      "override",   "try",
		"case",      "float",      "params",     "typeof",
		"catch",     "for",        "private",    "uint",
		"char",      "foreach",    "protected",  "ulong",
		"checked",   "goto",       "public",     "unchecked",
		"class",     "if",         "readonly",   "unsafe",
		"const",     "implicit",   "ref",        "ushort",
		"continue",  "in",         "return",     "using",
		"decimal",   "int",        "sbyte",      "virtual",
		"default",   "interface",  "sealed",     "volatile",
		"delegate",  "internal",   "short",      "void",
		"do",        "is",         "sizeof",     "while",
		"double",    "lock",       "stackalloc",
		"else",      "long",       "static",
		"enum",      "namespace",  "string"
	};

	// The same keywords, held in a hash set.
	private static readonly HashSet<string> _hashset = new HashSet<string>
	{
		"abstract",  "event",      "new",        "struct",
		"as",        "explicit",   "null",       "switch",
		"base",      "extern",     "object",     "this",
		"bool",      "false",      "operator",   "throw",
		"break",     "finally",    "out",        "true",
		"byte",      "fixed",      "override",   "try",
		"case",      "float",      "params",     "typeof",
		"catch",     "for",        "private",    "uint",
		"char",      "foreach",    "protected",  "ulong",
		"checked",   "goto",       "public",     "unchecked",
		"class",     "if",         "readonly",   "unsafe",
		"const",     "implicit",   "ref",        "ushort",
		"continue",  "in",         "return",     "using",
		"decimal",   "int",        "sbyte",      "virtual",
		"default",   "interface",  "sealed",     "volatile",
		"delegate",  "internal",   "short",      "void",
		"do",        "is",         "sizeof",     "while",
		"double",    "lock",       "stackalloc",
		"else",      "long",       "static",
		"enum",      "namespace",  "string"
	};

	/// <summary>
	/// Runs a short warmup, then times 500000 passes over the words of a fixed
	/// sentence for each collection and prints the match count and elapsed
	/// milliseconds of both to the console.
	/// </summary>
	public static void Test()
	{
		const int iterations = 500000;
		const string tester = "This is a short test string that will show the typeof behavior expected";

		string[] words = tester.Split( ' ' );

		// Warmup: exercise both lookups before any timing starts.
		for( int round = 0; round < 100; round++ )
		{
			foreach( string candidate in words )
			{
				_ = _hashset.Contains( candidate );
			}

			foreach( string candidate in words )
			{
				_ = _array.Contains( candidate );
			}
		}

		// Timed run: array.
		int arrayMatches = 0;
		Stopwatch arrayTimer = Stopwatch.StartNew();
		for( int pass = 0; pass < iterations; pass++ )
		{
			foreach( string candidate in words )
			{
				if( _array.Contains( candidate ) )
				{
					arrayMatches++;
				}
			}
		}
		arrayTimer.Stop();

		// Timed run: hash set.
		int hashSetMatches = 0;
		Stopwatch hashsetTimer = Stopwatch.StartNew();
		for( int pass = 0; pass < iterations; pass++ )
		{
			foreach( string candidate in words )
			{
				if( _hashset.Contains( candidate ) )
				{
					hashSetMatches++;
				}
			}
		}
		hashsetTimer.Stop();

		Console.WriteLine( $"Array: {iterations} iterations, {arrayMatches} matches, {arrayTimer.ElapsedMilliseconds} ms" );
		Console.WriteLine( $"HashSet: {iterations} iterations, {hashSetMatches} matches, {hashsetTimer.ElapsedMilliseconds} ms" );
	}
}

This gives the following results on my machine:

Array: 500000 iterations, 2000000 matches, 1528 ms
HashSet: 500000 iterations, 2000000 matches, 104 ms
Press any key to continue . . .

@FabienDehopre
Copy link
Author

Thanks @bpierson. I've updated the gist.
Of course, the list of keywords should also be updated, but I currently don't have the free time to do it.

@piotrstenke
Copy link

Your hashset is missing four keywords: __arglist, __makeref, __reftype and __refvalue. Though they are not mentioned in the documentation, they exist and are not valid identifiers.

@FabienDehopre
Copy link
Author

Thanks @piotrstenke. I've updated the list of identifiers.
I also took the opportunity to rearrange the list of identifiers and to format the code using the dotnet-format tool.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment