Skip to content

Instantly share code, notes, and snippets.

@AdamSpeight2008
Last active February 14, 2018 00:05
Show Gist options
  • Save AdamSpeight2008/c4e64d0b7bacb1428aa5c1026c50a110 to your computer and use it in GitHub Desktop.
Save AdamSpeight2008/c4e64d0b7bacb1428aa5c1026c50a110 to your computer and use it in GitHub Desktop.
QuickTokenAccumulator rewritten using Enum Flag Operators.
' Copyright (c) Microsoft. All Rights Reserved. Licensed under the Apache License, Version 2.0. See License.txt in the project root for license information.
'-----------------------------------------------------------------------------
' Contains quick token accumulator.
'-----------------------------------------------------------------------------
Option Compare Binary
Option Strict On
Imports System.Text
Imports Microsoft.CodeAnalysis.Text
Imports Microsoft.CodeAnalysis.VisualBasic
Imports Microsoft.CodeAnalysis.VisualBasic.Symbols
Imports Microsoft.CodeAnalysis.VisualBasic.Syntax
Namespace Microsoft.CodeAnalysis.VisualBasic.Syntax.InternalSyntax
'' The QuickTokenAccumulator is a small mini-tokenizer that may fail. It consumes characters and
'' eventually either decides that either it found a complete token (including the trivia on either
'' side), or it gives up and says that the full scanner should take an attempt. It also accumulates the
'' token into a character buffer and computes a hash code. The entire tokenization is done by a single
'' routine without any memory allocations to keep it very fast.
''
'' Currently it only handles two cases:
'' optional-whitespace keyword-or-identifier optional-whitespace
'' optional-whitespace single-char-punctuation optional-whitespace
''
'' where the whitespace does not include newlines.
'
'' The VB tokenization rules are complex, and care needs to be taken in constructing the quick
'' tokenization. For example "REM" begins a comment, so it can't be tokenized as a keyword or identifier.
'' Similar problems arise with multi-character punctuation tokens, which can have embedded spaces.
Partial Friend Class Scanner
''' <summary>
''' The possible states that the mini scanning can be in.
''' </summary>
Private Enum AccumulatorState
Initial
InitialAllowLeadingMultilineTrivia
Ident
TypeChar
FollowingWhite
Punctuation
CompoundPunctStart
CR
Done
Bad
End Enum
' Flags used to classify characters.
<Flags()>
Private Enum CharFlags As UShort
White = 1 << 0 ' simple whitespace (space/tab)
Letter = 1 << 1 ' letter, except for "R" (because of REM) and "_"
IdentOnly = 1 << 2 ' allowed only in identifiers (cannot start one) - letter "R" (because of REM), "_"
TypeChar = 1 << 3 ' legal type character (except !, which is contextually dictionary lookup
Punct = 1 << 4 ' some simple punctuation (parens, braces, dot, comma, equals, question)
CompoundPunctStart = 1 << 5 ' may be a part of compound punctuation. will be used only if followed by (not white) && (not punct)
CR = 1 << 6 ' CR
LF = 1 << 7 ' LF
Digit = 1 << 8 ' digit 0-9
Complex = 1 << 9 ' complex - causes scanning to abort
End Enum
'TODO: why : and ; are complex? (8th row, 3 and 4)
' The following table classifies the first &H180 Unicode characters.
' R and r are marked as COMPLEX so that quick-scanning doesn't stop after "REM".
' # is marked complex as it may start directives.
' < = > are complex because they might start a merge conflict marker.
' PERF: Use UShort instead of CharFlags so the compiler can use array literal initialization.
' The most natural type choice, Enum arrays, are not blittable due to a CLR limitation.
Private Shared ReadOnly s_charProperties As UShort() = {
CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex,
CharFlags.Complex, CharFlags.White, CharFlags.LF, CharFlags.Complex, CharFlags.Complex, CharFlags.CR, CharFlags.Complex, CharFlags.Complex,
CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex,
CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, _
_
CharFlags.White, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.TypeChar, CharFlags.TypeChar, CharFlags.TypeChar, CharFlags.Complex,
CharFlags.Punct, CharFlags.Punct, CharFlags.CompoundPunctStart, CharFlags.CompoundPunctStart, CharFlags.Punct, CharFlags.CompoundPunctStart, CharFlags.Punct, CharFlags.CompoundPunctStart,
CharFlags.Digit, CharFlags.Digit, CharFlags.Digit, CharFlags.Digit, CharFlags.Digit, CharFlags.Digit, CharFlags.Digit, CharFlags.Digit,
CharFlags.Digit, CharFlags.Digit, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Punct, _
_
CharFlags.TypeChar, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.IdentOnly, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Complex, CharFlags.CompoundPunctStart, CharFlags.Complex, CharFlags.CompoundPunctStart, CharFlags.IdentOnly, _
_
CharFlags.Complex, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.IdentOnly, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Punct, CharFlags.Complex, CharFlags.Punct, CharFlags.Complex, CharFlags.Complex,
_
CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex,
CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex,
CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex,
CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, _
_
CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex,
CharFlags.Complex, CharFlags.Complex, CharFlags.Letter, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex,
CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Letter, CharFlags.Complex, CharFlags.Complex,
CharFlags.Complex, CharFlags.Complex, CharFlags.Letter, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, CharFlags.Complex, _
_
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Complex,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, _
_
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Complex,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, _
_
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, _
_
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, _
_
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, _
_
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter,
CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter, CharFlags.Letter}
' Size of the above table.
Private Const s_CHARPROP_LENGTH = &H180
' Maximum length of a token to scan
Friend Const MAXTOKENSIZE = 42
Shared Sub New()
Debug.Assert(s_charProperties.Length = s_CHARPROP_LENGTH)
End Sub
Public Structure QuickScanResult
Public Sub New(start As Integer, length As Integer, chars As Char(), hashCode As Integer, terminatorLength As Byte)
Me.Start = start
Me.Length = length
Me.Chars = chars
Me.HashCode = hashCode
Me.TerminatorLength = terminatorLength
End Sub
Public ReadOnly Chars As Char()
Public ReadOnly Start As Integer
Public ReadOnly Length As Integer
Public ReadOnly HashCode As Integer
Public ReadOnly TerminatorLength As Byte
Public ReadOnly Property Succeeded As Boolean
Get
Return Me.Length > 0
End Get
End Property
End Structure
' Attempt to scan a single token.
' If it succeeds, return True, and the characters, length, and hashcode of the token
' can be retrieved by other functions.
' If it fails (the token is too complex), return False.
Public Function QuickScanToken(allowLeadingMultilineTrivia As Boolean) As QuickScanResult
Dim state As AccumulatorState = If(allowLeadingMultilineTrivia, AccumulatorState.InitialAllowLeadingMultilineTrivia, AccumulatorState.Initial)
Dim offset = _lineBufferOffset
Dim page = _curPage
If page Is Nothing OrElse
page._pageStart <> (offset And s_NOT_PAGE_MASK) Then
page = GetPage(offset)
End If
Dim pageArr As Char() = page._arr
Dim qtChars = pageArr
Dim index = _lineBufferOffset And s_PAGE_MASK
Dim qtStart = index
Dim limit = index + Math.Min(MAXTOKENSIZE, _bufferLen - offset)
limit = Math.Min(limit, pageArr.Length)
Dim hashCode As Integer = Hash.FnvOffsetBias
Dim terminatorLength As Byte = 0
Dim unicodeValue As Integer = 0
While index < limit
' Get current character.
Dim c = pageArr(index)
' Get the flags for that character.
unicodeValue = AscW(c)
If unicodeValue >= s_CHARPROP_LENGTH Then
Exit While
End If
Dim flags = s_charProperties(unicodeValue)
' Advance the scanner state.
Select Case state
Case AccumulatorState.InitialAllowLeadingMultilineTrivia
If flags = CharFlags.Letter Then
state = AccumulatorState.Ident
ElseIf flags = CharFlags.Punct Then
state = AccumulatorState.Punctuation
ElseIf flags = CharFlags.CompoundPunctStart Then
state = AccumulatorState.CompoundPunctStart
ElseIf flags!White Or flags!CR Or flags!LF Then
' stay in AccumulatorState.InitialNewStatement
Else
state = AccumulatorState.Bad
Exit While
End If
Case AccumulatorState.Initial
If flags = CharFlags.Letter Then
state = AccumulatorState.Ident
ElseIf flags = CharFlags.Punct Then
state = AccumulatorState.Punctuation
ElseIf flags = CharFlags.CompoundPunctStart Then
state = AccumulatorState.CompoundPunctStart
ElseIf flags = CharFlags.White Then
' stay in AccumulatorState.Initial
Else
state = AccumulatorState.Bad
Exit While
End If
Case AccumulatorState.Ident
If flags!Letter Or flags!IdentOnly Or flags!Digit Then
' stay in Ident
ElseIf flags = CharFlags.White Then
state = AccumulatorState.FollowingWhite
ElseIf flags = CharFlags.CR Then
state = AccumulatorState.CR
ElseIf flags = CharFlags.LF Then
terminatorLength = 1
state = AccumulatorState.Done
Exit While
ElseIf flags = CharFlags.TypeChar Then
state = AccumulatorState.TypeChar
ElseIf flags = CharFlags.Punct Then
state = AccumulatorState.Done
Exit While
Else
state = AccumulatorState.Bad
Exit While
End If
Case AccumulatorState.TypeChar
If flags = CharFlags.White Then
state = AccumulatorState.FollowingWhite
ElseIf flags = CharFlags.CR Then
state = AccumulatorState.CR
ElseIf flags = CharFlags.LF Then
terminatorLength = 1
state = AccumulatorState.Done
Exit While
ElseIf flags!Punct Or flags!Digit Or flags!TypeChar Then
state = AccumulatorState.Done
Exit While
Else
state = AccumulatorState.Bad
Exit While
End If
Case AccumulatorState.FollowingWhite
If flags = CharFlags.White Then
' stay in FollowingWhite
ElseIf flags = CharFlags.CR Then
state = AccumulatorState.CR
ElseIf flags = CharFlags.LF Then
terminatorLength = 1
state = AccumulatorState.Done
Exit While
ElseIf flags!Complex Or flags!IdentOnly Then
state = AccumulatorState.Bad
Exit While
Else
state = AccumulatorState.Done
Exit While
End If
Case AccumulatorState.Punctuation
If flags = CharFlags.White Then
state = AccumulatorState.FollowingWhite
ElseIf flags = CharFlags.CR Then
state = AccumulatorState.CR
ElseIf flags = CharFlags.LF Then
terminatorLength = 1
state = AccumulatorState.Done
Exit While
ElseIf flags!Letter Or flags!Punct Then
state = AccumulatorState.Done
Exit While
Else
state = AccumulatorState.Bad
Exit While
End If
Case AccumulatorState.CompoundPunctStart
If flags = CharFlags.White Then
' stay in CompoundPunctStart
ElseIf flags!Letter Or flags!Digit Then
state = AccumulatorState.Done
Exit While
Else
state = AccumulatorState.Bad
Exit While
End If
Case AccumulatorState.CR
If flags = CharFlags.LF Then
terminatorLength = 2
state = AccumulatorState.Done
Exit While
Else
state = AccumulatorState.Bad
End If
Exit While
Case Else
Debug.Assert(False, "should not get here")
End Select
index += 1
'FNV-like hash should work here
'since these strings are short and mostly ASCII
hashCode = (hashCode Xor unicodeValue) * Hash.FnvPrime
End While
If state = AccumulatorState.Done AndAlso (terminatorLength = 0 OrElse Not Me._IsScanningXmlDoc) Then
If terminatorLength <> 0 Then
index += 1
hashCode = (hashCode Xor unicodeValue) * Hash.FnvPrime
End If
Return New QuickScanResult(qtStart, index - qtStart, qtChars, hashCode, terminatorLength)
Else
Return Nothing
End If
End Function
End Class
End Namespace
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment