Last active
March 10, 2021 17:03
-
-
Save tannergooding/9149d45cdc0e6d46414153908b671f28 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright © Tanner Gooding and Contributors. Licensed under the MIT License (MIT). See License.md in the repository root for more information.
using System; | |
using System.Collections; | |
using System.Collections.Generic; | |
using System.Collections.Immutable; | |
using System.Diagnostics; | |
using System.Globalization; | |
using System.Text; | |
using TerraFX.CodeAnalysis.Source; | |
using TerraFX.CodeAnalysis.Tokens; | |
namespace TerraFX.CodeAnalysis | |
{ | |
/// <summary>Defines a lexical analyzer which tokenizes a source text.</summary> | |
public sealed partial class Lexer : IEnumerator<Token> | |
{ | |
// The token most recently produced by MoveNext; default until a token has been produced.
private Token _currentToken;

// The index, in runes, at which the next token starts.
private nuint _sourceTextIndex;

/// <summary>Initializes a new instance of the <see cref="Lexer" /> class.</summary>
/// <param name="sourceText">The source text to tokenize.</param>
private Lexer(SourceText sourceText)
{
    SourceText = sourceText;

    // Establish the same state Reset() produces: no current token, positioned at the first rune.
    _currentToken = default;
    _sourceTextIndex = 0;
}

/// <inheritdoc />
public Token Current
{
    get
    {
        return _currentToken;
    }
}

/// <summary>Gets the source text which is being tokenized.</summary>
public SourceText SourceText { get; }
/// <summary>Tokenizes the given text.</summary>
/// <param name="text">The text to wrap in a <see cref="SourceText" /> and tokenize.</param>
/// <returns>An immutable array of the tokens that comprise <paramref name="text" />.</returns>
public static ImmutableArray<Token> Tokenize(string text)
    => Tokenize(new SourceText(text));
/// <summary>Tokenizes the source text.</summary>
/// <param name="sourceText">The source text to tokenize.</param>
/// <returns>
/// An immutable array of the tokens that comprise <paramref name="sourceText" />;
/// the array is empty when <paramref name="sourceText" /> contains no runes.
/// </returns>
public static ImmutableArray<Token> Tokenize(SourceText sourceText)
{
    var tokensBuilder = ImmutableArray.CreateBuilder<Token>();
    using var lexer = new Lexer(sourceText);

    // Only record tokens that MoveNext actually produced. The previous
    // "MoveNext(); do { Add(Current); } while (MoveNext());" shape ignored the
    // first result and therefore added a default (Unknown, empty-span) token
    // whenever the source text was empty.
    while (lexer.MoveNext())
    {
        tokensBuilder.Add(lexer.Current);
    }

    return tokensBuilder.ToImmutable();
}
/// <inheritdoc />
public unsafe bool MoveNext()
{
    var tokenStart = _sourceTextIndex;

    if (tokenStart >= SourceText.Length)
    {
        // Nothing left to tokenize: clear the current token and report the end.
        _currentToken = default;
        return false;
    }

    // The first rune decides both the token kind and the predicate that
    // determines whether subsequent runes still belong to the token.
    var firstRune = GetRune(tokenStart);

    TokenKind kind;
    delegate*<Rune, bool> continuesToken;

    if (IsIdentifierStartRune(firstRune))
    {
        kind = TokenKind.Identifier;
        continuesToken = &IsIdentifierContinuationRune;
    }
    else if (IsIntegerRune(firstRune))
    {
        kind = TokenKind.Integer;
        continuesToken = &IsIntegerContinuationRune;
    }
    else if (IsNewlineRune(firstRune))
    {
        kind = TokenKind.Newline;
        continuesToken = &IsNewlineContinuationRune;
    }
    else if (IsSyntaxRune(firstRune))
    {
        kind = TokenKind.Syntax;
        continuesToken = &IsSyntaxContinuationRune;
    }
    else if (IsWhitespaceRune(firstRune))
    {
        kind = TokenKind.Whitespace;
        continuesToken = &IsWhitespaceContinuationRune;
    }
    else
    {
        kind = TokenKind.Unknown;
        continuesToken = &IsUnknownContinuationRune;
    }

    // The first rune always belongs to the token; keep advancing while the
    // predicate accepts the next rune. GetRune yields default(Rune) once the
    // index is at or past the end of the source text.
    var tokenEnd = tokenStart;

    do
    {
        tokenEnd += 1;
    }
    while (continuesToken(GetRune(tokenEnd)));

    var span = new SourceSpan(SourceText, tokenStart, tokenEnd - tokenStart);
    _currentToken = new Token(kind, span);
    _sourceTextIndex = tokenEnd;

    return true;
}
/// <inheritdoc />
public void Reset()
{
    // Rewind to the first rune and forget any token produced so far.
    _sourceTextIndex = 0;
    _currentToken = default(Token);
}
// True for the ASCII digits '0' through '9'.
private static bool IsASCII_Digit(Rune rune)
{
    return IsInRangeInclusive(rune, '0', '9');
}

// True for the Unicode general category Pc (connector punctuation).
private static bool IsConnector_Punctuation(UnicodeCategory unicodeCategory)
{
    return unicodeCategory is UnicodeCategory.ConnectorPunctuation;
}

// True for the Unicode general category Nd (decimal digit number).
private static bool IsDecimal_Number(UnicodeCategory unicodeCategory)
{
    return unicodeCategory is UnicodeCategory.DecimalDigitNumber;
}

// A rune continues an identifier token when it satisfies the ID_Continue check.
private static bool IsIdentifierContinuationRune(Rune rune)
{
    return IsID_Continue(rune);
}

// A rune starts an identifier token when it satisfies the ID_Start check.
private static bool IsIdentifierStartRune(Rune rune)
{
    return IsID_Start(rune);
}
// Determines whether the rune may continue an identifier: it must have one of
// the accepted general categories (or be listed in Other_ID_Start /
// Other_ID_Continue) and must not be Pattern_Syntax or Pattern_White_Space.
private static bool IsID_Continue(Rune rune)
{
    var category = Rune.GetUnicodeCategory(rune);

    // The ID_Start categories are checked inline here rather than by calling
    // IsID_Start, ordered so the categories expected to be most common come first.
    var hasAcceptedCategory = IsLetter(category)                // \p{L}
                           || IsDecimal_Number(category)        // \p{Nd}
                           || IsLetter_Number(category)         // \p{Nl}
                           || IsNonspacing_Mark(category)       // \p{Mn}
                           || IsSpacing_Mark(category)          // \p{Mc}
                           || IsConnector_Punctuation(category); // \p{Pc}

    if (!hasAcceptedCategory && !IsOther_ID_Start(rune) && !IsOther_ID_Continue(rune))
    {
        return false;
    }

    // -\p{Pattern_Syntax} -\p{Pattern_White_Space}
    return !(IsPattern_Syntax(rune) || IsPattern_White_Space(rune));
}
// Determines whether the rune may start an identifier, looking up its category first.
private static bool IsID_Start(Rune rune)
    => IsID_Start(rune, Rune.GetUnicodeCategory(rune));

// Determines whether the rune may start an identifier given its already-computed
// category: a letter, a letter number, or an Other_ID_Start rune that is neither
// Pattern_Syntax nor Pattern_White_Space.
private static bool IsID_Start(Rune rune, UnicodeCategory unicodeCategory)
{
    var isCandidate = IsLetter(unicodeCategory)         // \p{L}
                   || IsLetter_Number(unicodeCategory)  // \p{Nl}
                   || IsOther_ID_Start(rune);           // \p{Other_ID_Start}

    if (!isCandidate)
    {
        return false;
    }

    // -\p{Pattern_Syntax} -\p{Pattern_White_Space}
    return !(IsPattern_Syntax(rune) || IsPattern_White_Space(rune));
}
// Range check over Unicode categories, performed on their numeric values.
private static bool IsInRangeInclusive(UnicodeCategory value, UnicodeCategory lowerBound, UnicodeCategory upperBound)
{
    var numericValue = (uint)value;
    var numericLower = (uint)lowerBound;
    var numericUpper = (uint)upperBound;

    return IsInRangeInclusive(numericValue, numericLower, numericUpper);
}

// Range check of a rune's scalar value against two UTF-16 character bounds.
private static bool IsInRangeInclusive(Rune value, char lowerBound, char upperBound)
{
    var scalar = (uint)value.Value;
    uint lower = lowerBound;
    uint upper = upperBound;

    return IsInRangeInclusive(scalar, lower, upper);
}

// Returns true when lowerBound <= value <= upperBound, using a single unsigned
// comparison: values below lowerBound wrap around to a large offset and fail the check.
private static bool IsInRangeInclusive(uint value, uint lowerBound, uint upperBound)
{
    Debug.Assert(lowerBound < upperBound);

    var offset = unchecked(value - lowerBound);
    var width = unchecked(upperBound - lowerBound);

    return offset <= width;
}
// An integer token starts with an ASCII digit.
private static bool IsIntegerRune(Rune rune)
{
    return IsASCII_Digit(rune);
}

// An integer token continues with the same runes that may start it.
private static bool IsIntegerContinuationRune(Rune rune)
{
    return IsIntegerRune(rune);
}

// True for the letter categories, which form the contiguous enum range
// UppercaseLetter through OtherLetter.
private static bool IsLetter(UnicodeCategory unicodeCategory)
{
    return IsInRangeInclusive(unicodeCategory, UnicodeCategory.UppercaseLetter, UnicodeCategory.OtherLetter);
}

// True for the Unicode general category Nl (letter number).
private static bool IsLetter_Number(UnicodeCategory unicodeCategory)
{
    return unicodeCategory is UnicodeCategory.LetterNumber;
}

// True for U+000A..U+000D, U+0085, and U+2028..U+2029.
private static bool IsNewlineRune(Rune rune)
{
    if (IsInRangeInclusive(rune, '\u000A', '\u000D'))
    {
        return true;
    }

    if (rune.Value == '\u0085')
    {
        return true;
    }

    return IsInRangeInclusive(rune, '\u2028', '\u2029');
}

// A newline token continues with the same runes that may start it.
private static bool IsNewlineContinuationRune(Rune rune)
{
    return IsNewlineRune(rune);
}

// True for the Unicode general category Mn (nonspacing mark).
private static bool IsNonspacing_Mark(UnicodeCategory unicodeCategory)
{
    return unicodeCategory is UnicodeCategory.NonSpacingMark;
}
// True for the runes this lexer treats as Other_ID_Continue:
// U+00B7, U+0387, U+1369..U+1371, and U+19DA.
private static bool IsOther_ID_Continue(Rune rune)
{
    switch (rune.Value)
    {
        case 0x00B7:
        case 0x0387:
        case 0x19DA:
        {
            return true;
        }

        default:
        {
            return IsInRangeInclusive(rune, '\u1369', '\u1371');
        }
    }
}

// True for the runes this lexer treats as Other_ID_Start:
// U+1885..U+1886, U+2118, U+212E, and U+309B..U+309C.
private static bool IsOther_ID_Start(Rune rune)
{
    switch (rune.Value)
    {
        case 0x2118:
        case 0x212E:
        {
            return true;
        }

        default:
        {
            return IsInRangeInclusive(rune, '\u1885', '\u1886')
                || IsInRangeInclusive(rune, '\u309B', '\u309C');
        }
    }
}
// True for the runes this lexer treats as Pattern_Syntax. Each arm is one of
// the inclusive scalar-value ranges (or single values) of that set.
private static bool IsPattern_Syntax(Rune rune) => rune.Value switch
{
    >= 0x0021 and <= 0x002F => true,
    >= 0x003A and <= 0x0040 => true,
    >= 0x005B and <= 0x005E => true,
    0x0060 => true,
    >= 0x007B and <= 0x007E => true,
    >= 0x00A1 and <= 0x00A7 => true,
    0x00A9 => true,
    >= 0x00AB and <= 0x00AC => true,
    0x00AE => true,
    >= 0x00B0 and <= 0x00B1 => true,
    0x00B6 => true,
    0x00BB => true,
    0x00BF => true,
    0x00D7 => true,
    0x00F7 => true,
    >= 0x2010 and <= 0x2027 => true,
    >= 0x2030 and <= 0x203E => true,
    >= 0x2041 and <= 0x2053 => true,
    >= 0x2055 and <= 0x205E => true,
    >= 0x2190 and <= 0x245F => true,
    >= 0x2500 and <= 0x2775 => true,
    >= 0x2794 and <= 0x2BFF => true,
    >= 0x2E00 and <= 0x2E7F => true,
    >= 0x3001 and <= 0x3003 => true,
    >= 0x3008 and <= 0x3020 => true,
    0x3030 => true,
    >= 0xFD3E and <= 0xFD3F => true,
    >= 0xFE45 and <= 0xFE46 => true,
    _ => false,
};
// True for the runes this lexer treats as Pattern_White_Space:
// U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029.
private static bool IsPattern_White_Space(Rune rune)
{
    var scalar = rune.Value;

    if ((scalar == '\u0020') || (scalar == '\u0085'))
    {
        return true;
    }

    return IsInRangeInclusive(rune, '\u0009', '\u000D')
        || IsInRangeInclusive(rune, '\u200E', '\u200F')
        || IsInRangeInclusive(rune, '\u2028', '\u2029');
}
// True for the Unicode general category Zs (space separator).
private static bool IsSpace_Separator(UnicodeCategory unicodeCategory)
{
    return unicodeCategory is UnicodeCategory.SpaceSeparator;
}

// True for the Unicode general category Mc (spacing combining mark).
private static bool IsSpacing_Mark(UnicodeCategory unicodeCategory)
{
    return unicodeCategory is UnicodeCategory.SpacingCombiningMark;
}

// A syntax token starts with any Pattern_Syntax rune.
private static bool IsSyntaxRune(Rune rune)
{
    return IsPattern_Syntax(rune);
}

// Syntax tokens never continue, so each one is exactly one rune long.
private static bool IsSyntaxContinuationRune(Rune rune)
{
    return false;
}
// Determines whether the rune is (non-newline) whitespace.
//
// A full definition would combine \p{Zs}, \p{Pattern_White_Space} and
// \p{White_Space}. Those sets overlap heavily and the latter two also contain
// newline characters, which must not be classified as whitespace here, so the
// check is reduced to: category Zs, the horizontal tab (U+0009), or
// U+200E..U+200F.
private static bool IsWhitespaceRune(Rune rune)
{
    if ((rune.Value == '\u0009') || IsInRangeInclusive(rune, '\u200E', '\u200F'))
    {
        return true;
    }

    return IsSpace_Separator(Rune.GetUnicodeCategory(rune));
}

// A whitespace token continues with the same runes that may start it.
private static bool IsWhitespaceContinuationRune(Rune rune)
{
    return IsWhitespaceRune(rune);
}

// Unknown tokens never continue, so each one is exactly one rune long.
private static bool IsUnknownContinuationRune(Rune rune)
{
    return false;
}
// Reads the rune at the given index, or default(Rune) when the index is at or
// past the end of the source text.
private Rune GetRune(nuint index)
{
    if (index >= SourceText.Length)
    {
        return default;
    }

    return SourceText[index];
}

// There is nothing to release, so disposal is a no-op.
void IDisposable.Dispose()
{
}

// Non-generic view of Current; the Token struct is boxed.
object IEnumerator.Current
{
    get
    {
        return Current;
    }
}
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright © Tanner Gooding and Contributors. Licensed under the MIT License (MIT). See License.md in the repository root for more information.
using System; | |
using System.Text; | |
namespace TerraFX.CodeAnalysis.Source | |
{ | |
/// <summary>Defines a span within a source.</summary>
public readonly struct SourceSpan
{
    /// <summary>Initializes a new instance of the <see cref="SourceSpan" /> struct.</summary>
    /// <param name="sourceText">The source text for the span.</param>
    /// <param name="start">The start of the span, in runes.</param>
    /// <param name="length">The length of the span, in runes.</param>
    public SourceSpan(SourceText sourceText, nuint start, nuint length)
    {
        SourceText = sourceText;
        Start = start;
        Length = length;
    }

    /// <summary>Gets the length of the span, in runes.</summary>
    public nuint Length { get; }

    /// <summary>Gets the source text for the span.</summary>
    public SourceText SourceText { get; }

    /// <summary>Gets the start of the span, in runes.</summary>
    public nuint Start { get; }

    /// <summary>Gets the rune at the specified index.</summary>
    /// <param name="index">The index of the rune to get, relative to <see cref="Start" />.</param>
    /// <returns>The rune at the specified index.</returns>
    public Rune this[nuint index] => SourceText[Start + index];

    /// <summary>Gets the text covered by the span.</summary>
    /// <returns>The substring of the underlying source string that the span's runes occupy.</returns>
    public override string ToString()
    {
        // A default span has no backing string; treat it as empty text.
        var value = SourceText.Value ?? string.Empty;

        // Start and Length count runes, but string offsets count UTF-16 code
        // units, and a rune outside the Basic Multilingual Plane occupies two
        // of them. Convert both before slicing, otherwise any surrogate pair
        // before or inside the span shifts or truncates the result.
        var utf16Start = GetUtf16Length(0, Start);
        var utf16Length = GetUtf16Length(Start, Length);

        return value.Substring(utf16Start, utf16Length);
    }

    // Sums the number of UTF-16 code units occupied by `count` runes beginning at rune index `start`.
    private int GetUtf16Length(nuint start, nuint count)
    {
        var sourceText = SourceText;
        var utf16Length = 0;

        for (nuint i = 0; i < count; i++)
        {
            utf16Length += sourceText[start + i].Utf16SequenceLength;
        }

        return utf16Length;
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright © Tanner Gooding and Contributors. Licensed under the MIT License (MIT). See License.md in the repository root for more information.
using System; | |
using System.Linq; | |
using System.Text; | |
namespace TerraFX.CodeAnalysis.Source | |
{ | |
/// <summary>Defines the text for a source.</summary>
public readonly struct SourceText
{
    // Both fields are null for default(SourceText), which never runs the
    // constructor; every member therefore reads them through a null-safe accessor.
    private readonly Rune[] _runes;
    private readonly string _value;

    /// <summary>Initializes a new instance of the <see cref="SourceText" /> struct.</summary>
    /// <param name="value">The string used to populate the source text; <c>null</c> is treated as empty.</param>
    public SourceText(string value)
    {
        value ??= string.Empty;

        _runes = value.EnumerateRunes().ToArray();
        _value = value;
    }

    /// <summary>Gets the length of the source text, in runes.</summary>
    public nuint Length => (nuint)(Runes.Length);

    /// <summary>The string used to populate the source text.</summary>
    internal string Value => _value ?? string.Empty;

    /// <summary>Gets the rune at the specified index.</summary>
    /// <param name="index">The index of the rune to get.</param>
    /// <returns>The rune at the specified index.</returns>
    public Rune this[nuint index] => Runes[index];

    /// <inheritdoc />
    public override string ToString() => Value;

    // The decoded runes, or an empty array for a default instance.
    private Rune[] Runes => _runes ?? Array.Empty<Rune>();
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright © Tanner Gooding and Contributors. Licensed under the MIT License (MIT). See License.md in the repository root for more information.
using System.Text; | |
using TerraFX.CodeAnalysis.Source; | |
namespace TerraFX.CodeAnalysis.Tokens | |
{ | |
/// <summary>Defines a token.</summary>
public readonly struct Token
{
    /// <summary>Initializes a new instance of the <see cref="Token" /> struct.</summary>
    /// <param name="kind">The kind of the token.</param>
    /// <param name="sourceSpan">The source span for the token.</param>
    public Token(TokenKind kind, SourceSpan sourceSpan)
    {
        Kind = kind;
        SourceSpan = sourceSpan;
    }

    /// <summary>Gets <c>true</c> if the token is an identifier; otherwise, <c>false</c>.</summary>
    public bool IsIdentifier => Kind == TokenKind.Identifier;

    /// <summary>Gets <c>true</c> if the token is an integer; otherwise, <c>false</c>.</summary>
    public bool IsInteger => Kind == TokenKind.Integer;

    /// <summary>Gets <c>true</c> if the token is a newline; otherwise, <c>false</c>.</summary>
    public bool IsNewline => Kind == TokenKind.Newline;

    /// <summary>Gets <c>true</c> if the token is syntax; otherwise, <c>false</c>.</summary>
    public bool IsSyntax => Kind == TokenKind.Syntax;

    /// <summary>Gets <c>true</c> if the token is unknown; otherwise, <c>false</c>.</summary>
    /// <remarks>This is also <c>true</c> for a default token, since <see cref="TokenKind.Unknown" /> is zero.</remarks>
    public bool IsUnknown => Kind == TokenKind.Unknown;

    /// <summary>Gets <c>true</c> if the token is whitespace; otherwise, <c>false</c>.</summary>
    public bool IsWhitespace => Kind == TokenKind.Whitespace;

    /// <summary>Gets the kind of the token.</summary>
    public TokenKind Kind { get; }

    /// <summary>Gets the source span for the token.</summary>
    public SourceSpan SourceSpan { get; }

    /// <summary>Gets the rune at the specified index.</summary>
    /// <param name="index">The index of the rune to get, relative to the start of the token's span.</param>
    /// <returns>The rune at the specified index.</returns>
    public Rune this[nuint index] => SourceSpan[index];

    /// <summary>Formats the token as its kind followed by the text of its span.</summary>
    /// <returns>A string of the form <c>Kind: text</c>.</returns>
    public override string ToString() => $"{Kind}: {SourceSpan}";
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright © Tanner Gooding and Contributors. Licensed under the MIT License (MIT). See License.md in the repository root for more information.
namespace TerraFX.CodeAnalysis.Tokens | |
{ | |
/// <summary>Identifies the category a token belongs to.</summary>
public enum TokenKind : uint
{
    /// <summary>A token of no recognized kind; also the default value.</summary>
    Unknown = 0,

    /// <summary>A token that is an identifier.</summary>
    Identifier = 1,

    /// <summary>A token that is an integer.</summary>
    Integer = 2,

    /// <summary>A token that is a newline.</summary>
    Newline = 3,

    /// <summary>A token that is syntax.</summary>
    Syntax = 4,

    /// <summary>A token that is whitespace.</summary>
    Whitespace = 5,
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment