Skip to content

Instantly share code, notes, and snippets.

@tannergooding
Last active March 10, 2021 17:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tannergooding/9149d45cdc0e6d46414153908b671f28 to your computer and use it in GitHub Desktop.
Save tannergooding/9149d45cdc0e6d46414153908b671f28 to your computer and use it in GitHub Desktop.
// Copyright © Tanner Gooding and Contributors. Licensed under the MIT License (MIT). See License.md in the repository root for more information.
using System;
using System.Collections;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Diagnostics;
using System.Globalization;
using System.Text;
using TerraFX.CodeAnalysis.Source;
using TerraFX.CodeAnalysis.Tokens;
namespace TerraFX.CodeAnalysis
{
/// <summary>Defines a lexical analyzer which tokenizes a source text.</summary>
/// <remarks>
///     <para>The lexer walks the source text one rune at a time. The first rune of a token selects the token kind and every following rune that satisfies the matching continuation predicate is folded into the same token.</para>
///     <para>Identifiers, integers, newlines, and whitespace form runs; syntax and unknown runes always produce single-rune tokens.</para>
/// </remarks>
public sealed partial class Lexer : IEnumerator<Token>
{
    private Token _currentToken;
    private nuint _sourceTextIndex;

    private Lexer(SourceText sourceText)
    {
        SourceText = sourceText;
        Reset();
    }

    /// <inheritdoc />
    public Token Current => _currentToken;

    /// <summary>Gets the source text which is being tokenized.</summary>
    public SourceText SourceText { get; }

    /// <summary>Tokenizes the source text.</summary>
    /// <param name="text">The source text to tokenize.</param>
    /// <returns>An immutable array of the tokens that comprise <paramref name="text" />.</returns>
    public static ImmutableArray<Token> Tokenize(string text)
    {
        var sourceText = new SourceText(text);
        return Tokenize(sourceText);
    }

    /// <summary>Tokenizes the source text.</summary>
    /// <param name="sourceText">The source text to tokenize.</param>
    /// <returns>An immutable array of the tokens that comprise <paramref name="sourceText" />; the array is empty when <paramref name="sourceText" /> contains no runes.</returns>
    public static ImmutableArray<Token> Tokenize(SourceText sourceText)
    {
        var tokensBuilder = ImmutableArray.CreateBuilder<Token>();
        var lexer = new Lexer(sourceText);

        // Only record tokens that MoveNext actually produced. Unconditionally
        // adding Current after the first MoveNext would emit a default token
        // for an empty source text.
        while (lexer.MoveNext())
        {
            tokensBuilder.Add(lexer.Current);
        }

        return tokensBuilder.ToImmutable();
    }

    /// <summary>Advances the lexer to the next token in the source text.</summary>
    /// <returns><c>true</c> if a token was produced and is available from <see cref="Current" />; otherwise, <c>false</c> because the end of the source text was reached.</returns>
    public unsafe bool MoveNext()
    {
        var sourceTextIndex = _sourceTextIndex;

        if (sourceTextIndex < SourceText.Length)
        {
            TokenKind tokenKind;
            nuint sourceSpanLength = 0;

            var rune = GetRune(sourceTextIndex);
            delegate*<Rune, bool> isContinuationRune;

            // The first rune decides the token kind and which predicate is
            // used to extend the token. The order of the checks matters as
            // the first matching classification wins.
            if (IsIdentifierStartRune(rune))
            {
                tokenKind = TokenKind.Identifier;
                isContinuationRune = &IsIdentifierContinuationRune;
            }
            else if (IsIntegerRune(rune))
            {
                tokenKind = TokenKind.Integer;
                isContinuationRune = &IsIntegerContinuationRune;
            }
            else if (IsNewlineRune(rune))
            {
                tokenKind = TokenKind.Newline;
                isContinuationRune = &IsNewlineContinuationRune;
            }
            else if (IsSyntaxRune(rune))
            {
                tokenKind = TokenKind.Syntax;
                isContinuationRune = &IsSyntaxContinuationRune;
            }
            else if (IsWhitespaceRune(rune))
            {
                tokenKind = TokenKind.Whitespace;
                isContinuationRune = &IsWhitespaceContinuationRune;
            }
            else
            {
                tokenKind = TokenKind.Unknown;
                isContinuationRune = &IsUnknownContinuationRune;
            }

            do
            {
                // loop until we no longer have continuation characters
                //
                // GetRune yields default(Rune) once the index passes the end
                // of the source text; termination relies on no continuation
                // predicate accepting that value.
                sourceSpanLength += 1;
                sourceTextIndex += 1;
                rune = GetRune(sourceTextIndex);
            }
            while (isContinuationRune(rune));

            var sourceSpan = new SourceSpan(SourceText, _sourceTextIndex, sourceSpanLength);
            _currentToken = new Token(tokenKind, sourceSpan);

            _sourceTextIndex += sourceSpanLength;
            return true;
        }
        else
        {
            _currentToken = default;
            return false;
        }
    }

    /// <summary>Resets the lexer to its initial state, positioned before the first token.</summary>
    public void Reset()
    {
        _currentToken = default;
        _sourceTextIndex = 0;
    }

    // Checks whether the rune is one of the ASCII digits '0' through '9'.
    private static bool IsASCII_Digit(Rune rune)
        => IsInRangeInclusive(rune, '0', '9');

    // \p{Pc}
    private static bool IsConnector_Punctuation(UnicodeCategory unicodeCategory)
        => unicodeCategory == UnicodeCategory.ConnectorPunctuation;

    // \p{Nd}
    private static bool IsDecimal_Number(UnicodeCategory unicodeCategory)
        => unicodeCategory == UnicodeCategory.DecimalDigitNumber;

    // A rune continues an identifier when it is in \p{ID_Continue}.
    private static bool IsIdentifierContinuationRune(Rune rune) => IsID_Continue(rune);

    // A rune starts an identifier when it is in \p{ID_Start}.
    private static bool IsIdentifierStartRune(Rune rune) => IsID_Start(rune);

    // \p{ID_Continue}
    private static bool IsID_Continue(Rune rune)
    {
        var unicodeCategory = Rune.GetUnicodeCategory(rune);

        // \p{ID_Start} is manually inlined and the order is modified slightly
        // to allow for a better early exit chance based on how common each
        // unicode category is expected to be.

        var result = IsLetter(unicodeCategory)                  // \p{ID_Start} \p{L}
                  || IsDecimal_Number(unicodeCategory)          // \p{Nd}
                  || IsLetter_Number(unicodeCategory)           // \p{Nl}
                  || IsNonspacing_Mark(unicodeCategory)         // \p{Mn}
                  || IsSpacing_Mark(unicodeCategory)            // \p{Mc}
                  || IsConnector_Punctuation(unicodeCategory)   // \p{Pc}
                  || IsOther_ID_Start(rune)                     // \p{Other_ID_Start}
                  || IsOther_ID_Continue(rune);                 // \p{Other_ID_Continue}

        return result
            && !IsPattern_Syntax(rune)                          // -\p{Pattern_Syntax}
            && !IsPattern_White_Space(rune);                    // -\p{Pattern_White_Space}
    }

    // \p{ID_Start}
    private static bool IsID_Start(Rune rune)
    {
        var unicodeCategory = Rune.GetUnicodeCategory(rune);
        return IsID_Start(rune, unicodeCategory);
    }

    // \p{ID_Start}, for callers that have already looked up the unicode category of the rune.
    private static bool IsID_Start(Rune rune, UnicodeCategory unicodeCategory)
    {
        var result = IsLetter(unicodeCategory)                  // \p{L}
                  || IsLetter_Number(unicodeCategory)           // \p{Nl}
                  || IsOther_ID_Start(rune);                    // \p{Other_ID_Start}

        return result
            && !IsPattern_Syntax(rune)                          // -\p{Pattern_Syntax}
            && !IsPattern_White_Space(rune);                    // -\p{Pattern_White_Space}
    }

    private static bool IsInRangeInclusive(UnicodeCategory value, UnicodeCategory lowerBound, UnicodeCategory upperBound)
        => IsInRangeInclusive((uint)(value), (uint)(lowerBound), (uint)(upperBound));

    private static bool IsInRangeInclusive(Rune value, char lowerBound, char upperBound)
        => IsInRangeInclusive((uint)(value.Value), lowerBound, upperBound);

    // Checks lowerBound <= value <= upperBound using a single comparison.
    private static bool IsInRangeInclusive(uint value, uint lowerBound, uint upperBound)
    {
        Debug.Assert(lowerBound < upperBound);

        // When value is below lowerBound the unsigned subtraction wraps around
        // to a large number, which then fails the comparison as well.
        return unchecked((value - lowerBound) <= (upperBound - lowerBound));
    }

    // Integers start with an ASCII digit.
    private static bool IsIntegerRune(Rune rune) => IsASCII_Digit(rune);

    // Integers are continued by the same runes that can start them.
    private static bool IsIntegerContinuationRune(Rune rune) => IsIntegerRune(rune);

    // \p{L}, which relies on the letter categories being contiguous in UnicodeCategory
    private static bool IsLetter(UnicodeCategory unicodeCategory)
        => IsInRangeInclusive(unicodeCategory, UnicodeCategory.UppercaseLetter, UnicodeCategory.OtherLetter);

    // \p{Nl}
    private static bool IsLetter_Number(UnicodeCategory unicodeCategory)
        => unicodeCategory == UnicodeCategory.LetterNumber;

    // Checks whether the rune is U+000A..U+000D, U+0085, U+2028, or U+2029.
    private static bool IsNewlineRune(Rune rune)
        => IsInRangeInclusive(rune, '\u000A', '\u000D')
        || (rune.Value == '\u0085')
        || IsInRangeInclusive(rune, '\u2028', '\u2029');

    // Consecutive newline runes are folded into a single newline token.
    private static bool IsNewlineContinuationRune(Rune rune) => IsNewlineRune(rune);

    // \p{Mn}
    private static bool IsNonspacing_Mark(UnicodeCategory unicodeCategory)
        => unicodeCategory == UnicodeCategory.NonSpacingMark;

    // \p{Other_ID_Continue}
    private static bool IsOther_ID_Continue(Rune rune)
        => (rune.Value == '\u00B7')
        || (rune.Value == '\u0387')
        || IsInRangeInclusive(rune, '\u1369', '\u1371')
        || (rune.Value == '\u19DA');

    // \p{Other_ID_Start}
    private static bool IsOther_ID_Start(Rune rune)
        => IsInRangeInclusive(rune, '\u1885', '\u1886')
        || (rune.Value == '\u2118')
        || (rune.Value == '\u212E')
        || IsInRangeInclusive(rune, '\u309B', '\u309C');

    // \p{Pattern_Syntax}
    private static bool IsPattern_Syntax(Rune rune)
        => IsInRangeInclusive(rune, '\u0021', '\u002F')
        || IsInRangeInclusive(rune, '\u003A', '\u0040')
        || IsInRangeInclusive(rune, '\u005B', '\u005E')
        || (rune.Value == '\u0060')
        || IsInRangeInclusive(rune, '\u007B', '\u007E')
        || IsInRangeInclusive(rune, '\u00A1', '\u00A7')
        || (rune.Value == '\u00A9')
        || IsInRangeInclusive(rune, '\u00AB', '\u00AC')
        || (rune.Value == '\u00AE')
        || IsInRangeInclusive(rune, '\u00B0', '\u00B1')
        || (rune.Value == '\u00B6')
        || (rune.Value == '\u00BB')
        || (rune.Value == '\u00BF')
        || (rune.Value == '\u00D7')
        || (rune.Value == '\u00F7')
        || IsInRangeInclusive(rune, '\u2010', '\u2027')
        || IsInRangeInclusive(rune, '\u2030', '\u203E')
        || IsInRangeInclusive(rune, '\u2041', '\u2053')
        || IsInRangeInclusive(rune, '\u2055', '\u205E')
        || IsInRangeInclusive(rune, '\u2190', '\u245F')
        || IsInRangeInclusive(rune, '\u2500', '\u2775')
        || IsInRangeInclusive(rune, '\u2794', '\u2BFF')
        || IsInRangeInclusive(rune, '\u2E00', '\u2E7F')
        || IsInRangeInclusive(rune, '\u3001', '\u3003')
        || IsInRangeInclusive(rune, '\u3008', '\u3020')
        || (rune.Value == '\u3030')
        || IsInRangeInclusive(rune, '\uFD3E', '\uFD3F')
        || IsInRangeInclusive(rune, '\uFE45', '\uFE46');

    // \p{Pattern_White_Space}
    private static bool IsPattern_White_Space(Rune rune)
        => IsInRangeInclusive(rune, '\u0009', '\u000D')
        || (rune.Value == '\u0020')
        || (rune.Value == '\u0085')
        || IsInRangeInclusive(rune, '\u200E', '\u200F')
        || IsInRangeInclusive(rune, '\u2028', '\u2029');

    // \p{Zs}
    private static bool IsSpace_Separator(UnicodeCategory unicodeCategory)
        => unicodeCategory == UnicodeCategory.SpaceSeparator;

    // \p{Mc}
    private static bool IsSpacing_Mark(UnicodeCategory unicodeCategory)
        => unicodeCategory == UnicodeCategory.SpacingCombiningMark;

    // Syntax tokens are started by any rune in \p{Pattern_Syntax}.
    private static bool IsSyntaxRune(Rune rune) => IsPattern_Syntax(rune);

    // Syntax tokens are never extended, so each syntax rune is its own token.
    private static bool IsSyntaxContinuationRune(Rune rune) => false;

    private static bool IsWhitespaceRune(Rune rune)
    {
        var unicodeCategory = Rune.GetUnicodeCategory(rune);

        // This would normally be something like:
        //   \p{Zs}
        //   \p{Pattern_White_Space}
        //   \p{White_Space}
        //
        // However, Pattern_White_Space and White_Space have a lot
        // of overlap, additionally they include various newline
        // characters that we don't want included and Zs covers
        // basically everything else, so we simplify the logic here
        // instead.

        return IsSpace_Separator(unicodeCategory)
            || (rune.Value == '\u0009')
            || IsInRangeInclusive(rune, '\u200E', '\u200F');
    }

    // Consecutive whitespace runes are folded into a single whitespace token.
    private static bool IsWhitespaceContinuationRune(Rune rune) => IsWhitespaceRune(rune);

    // Unknown tokens are never extended, so each unknown rune is its own token.
    private static bool IsUnknownContinuationRune(Rune rune) => false;

    // Gets the rune at the given index or default(Rune) when the index is at or past the end of the source text.
    private Rune GetRune(nuint index) => (index < SourceText.Length) ? SourceText[index] : default;

    // The lexer holds nothing that needs to be released.
    void IDisposable.Dispose() { }

    object IEnumerator.Current => Current;
}
}
// Copyright © Tanner Gooding and Contributors. Licensed under the MIT License (MIT). See License.md in the repository root for more information.
using System;
using System.Text;
namespace TerraFX.CodeAnalysis.Source
{
/// <summary>Defines a span within a source.</summary>
public readonly struct SourceSpan
{
    /// <summary>Initializes a new instance of the <see cref="SourceSpan" /> struct.</summary>
    /// <param name="sourceText">The source text for the span.</param>
    /// <param name="start">The start of the span, in runes.</param>
    /// <param name="length">The length of the span, in runes.</param>
    public SourceSpan(SourceText sourceText, nuint start, nuint length)
    {
        SourceText = sourceText;
        Start = start;
        Length = length;
    }

    /// <summary>Gets the length of the span, in runes.</summary>
    public nuint Length { get; }

    /// <summary>Gets the source text for the span.</summary>
    public SourceText SourceText { get; }

    /// <summary>Gets the start of the span, in runes.</summary>
    public nuint Start { get; }

    /// <summary>Gets the rune at the specified index.</summary>
    /// <param name="index">The index of the rune to get, relative to <see cref="Start" />.</param>
    /// <returns>The rune at the specified index.</returns>
    public Rune this[nuint index] => SourceText[Start + index];

    /// <summary>Gets the portion of the source string which is covered by the span.</summary>
    /// <returns>The substring of the source string covered by the span, or an empty string when the span is empty.</returns>
    public override string ToString()
    {
        if (Length == 0)
        {
            return string.Empty;
        }

        var sourceText = SourceText;
        var text = sourceText.Value;

        int charStart;
        int charLength;

        if ((nuint)text.Length == sourceText.Length)
        {
            // Every rune is a single UTF-16 code unit, so the rune offsets
            // can be used as string offsets directly.
            charStart = (int)Start;
            charLength = (int)Length;
        }
        else
        {
            // Start and Length count runes while the string is indexed in
            // UTF-16 code units. A rune that is encoded as a surrogate pair
            // takes two code units, so the rune offsets have to be converted
            // by summing the encoded length of each rune.
            charStart = 0;
            charLength = 0;

            var end = Start + Length;

            for (nuint index = 0; index < end; index++)
            {
                var utf16Length = sourceText[index].Utf16SequenceLength;

                if (index < Start)
                {
                    charStart += utf16Length;
                }
                else
                {
                    charLength += utf16Length;
                }
            }
        }

        return text.Substring(charStart, charLength);
    }
}
}
// Copyright © Tanner Gooding and Contributors. Licensed under the MIT License (MIT). See License.md in the repository root for more information.
using System;
using System.Linq;
using System.Text;
namespace TerraFX.CodeAnalysis.Source
{
/// <summary>Defines the text for a source.</summary>
/// <remarks>A default instance behaves like a source text created from an empty string.</remarks>
public readonly struct SourceText
{
    private readonly Rune[] _runes;
    private readonly string _value;

    /// <summary>Initializes a new instance of the <see cref="SourceText" /> struct.</summary>
    /// <param name="value">The string used to populate the source text; <c>null</c> is treated as an empty string.</param>
    public SourceText(string value)
    {
        value ??= string.Empty;

        _runes = value.EnumerateRunes().ToArray();
        _value = value;
    }

    /// <summary>Gets the length of the source text, in runes.</summary>
    public nuint Length => (nuint)(Runes.Length);

    /// <summary>The string used to populate the source text.</summary>
    internal string Value => _value ?? string.Empty;

    /// <summary>Gets the rune at the specified index.</summary>
    /// <param name="index">The index of the rune to get.</param>
    /// <returns>The rune at the specified index.</returns>
    public Rune this[nuint index] => Runes[index];

    /// <inheritdoc />
    public override string ToString() => Value;

    // The fields are null for default(SourceText) since no constructor ran,
    // so fall back to an empty array rather than failing on a null reference.
    private Rune[] Runes => _runes ?? Array.Empty<Rune>();
}
}
// Copyright © Tanner Gooding and Contributors. Licensed under the MIT License (MIT). See License.md in the repository root for more information.
using System.Text;
using TerraFX.CodeAnalysis.Source;
namespace TerraFX.CodeAnalysis.Tokens
{
/// <summary>Represents a lexical token: a kind paired with the span of source it covers.</summary>
public readonly struct Token
{
    private readonly TokenKind _kind;
    private readonly SourceSpan _sourceSpan;

    /// <summary>Creates a token of the given kind which covers the given span.</summary>
    /// <param name="kind">The classification of the token.</param>
    /// <param name="sourceSpan">The span of source the token covers.</param>
    public Token(TokenKind kind, SourceSpan sourceSpan)
    {
        _kind = kind;
        _sourceSpan = sourceSpan;
    }

    /// <summary>Gets a value indicating whether the token kind is <see cref="TokenKind.Identifier" />.</summary>
    public bool IsIdentifier
    {
        get { return _kind == TokenKind.Identifier; }
    }

    /// <summary>Gets a value indicating whether the token kind is <see cref="TokenKind.Integer" />.</summary>
    public bool IsInteger
    {
        get { return _kind == TokenKind.Integer; }
    }

    /// <summary>Gets a value indicating whether the token kind is <see cref="TokenKind.Newline" />.</summary>
    public bool IsNewline
    {
        get { return _kind == TokenKind.Newline; }
    }

    /// <summary>Gets a value indicating whether the token kind is <see cref="TokenKind.Syntax" />.</summary>
    public bool IsSyntax
    {
        get { return _kind == TokenKind.Syntax; }
    }

    /// <summary>Gets a value indicating whether the token kind is <see cref="TokenKind.Whitespace" />.</summary>
    public bool IsWhitespace
    {
        get { return _kind == TokenKind.Whitespace; }
    }

    /// <summary>Gets the classification of the token.</summary>
    public TokenKind Kind
    {
        get { return _kind; }
    }

    /// <summary>Gets the span of source the token covers.</summary>
    public SourceSpan SourceSpan
    {
        get { return _sourceSpan; }
    }

    /// <summary>Gets a rune from the span covered by the token.</summary>
    /// <param name="index">The index of the rune, as understood by the indexer of <see cref="SourceSpan" />.</param>
    /// <returns>The rune the source span reports for <paramref name="index" />.</returns>
    public Rune this[nuint index]
    {
        get { return _sourceSpan[index]; }
    }

    /// <summary>Formats the token as its kind, followed by a colon and a space, followed by the text of its source span.</summary>
    /// <returns>The formatted token.</returns>
    public override string ToString()
    {
        var kindText = _kind.ToString();
        var spanText = _sourceSpan.ToString();

        return kindText + ": " + spanText;
    }
}
}
// Copyright © Tanner Gooding and Contributors. Licensed under the MIT License (MIT). See License.md in the repository root for more information.
namespace TerraFX.CodeAnalysis.Tokens
{
/// <summary>Enumerates the classifications which can be assigned to a token.</summary>
public enum TokenKind : uint
{
    /// <summary>The token could not be classified; this is also the default value.</summary>
    Unknown = 0,

    /// <summary>The token is an identifier.</summary>
    Identifier = 1,

    /// <summary>The token is an integer.</summary>
    Integer = 2,

    /// <summary>The token is a newline.</summary>
    Newline = 3,

    /// <summary>The token is syntax.</summary>
    Syntax = 4,

    /// <summary>The token is whitespace.</summary>
    Whitespace = 5,
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment