Last active
September 27, 2023 16:00
-
-
Save tenderlove/e9bb912648a3d2bce00c4f60bc632a10 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "strscan" | |
class Lexer | |
IDENTIFIER = /[_A-Za-z][_0-9A-Za-z]*\b/ | |
IGNORE = %r{ | |
(?: | |
[, \c\r\n\t]+ | | |
\#.*$ | |
)* | |
}x | |
INT = /[-]?(?:[0]|[1-9][0-9]*)/ | |
FLOAT_DECIMAL = /[.][0-9]+/ | |
FLOAT_EXP = /[eE][+-]?[0-9]+/ | |
FLOAT = /#{INT}#{FLOAT_DECIMAL}#{FLOAT_EXP}|#{FLOAT_DECIMAL}|#{FLOAT_EXP}/ | |
KEYWORDS = [ "on", "fragment", "true", "false", "null", "query", "mutation", | |
"subscription", "schema", "scalar", "type", "extend", "implements", | |
"interface", "union", "enum", "input", "directive", "repeatable" | |
].freeze | |
KW_RE = /#{Regexp.union(KEYWORDS.sort)}\b/ | |
module Literals | |
LCURLY = '{' | |
RCURLY = '}' | |
LPAREN = '(' | |
RPAREN = ')' | |
LBRACKET = '[' | |
RBRACKET = ']' | |
COLON = ':' | |
VAR_SIGN = '$' | |
DIR_SIGN = '@' | |
EQUALS = '=' | |
BANG = '!' | |
PIPE = '|' | |
AMP = '&' | |
end | |
ELLIPSIS = '...' | |
PUNCTUATION_TABLE = Literals.constants.each_with_object([]) { |n, o| | |
o[Literals.const_get(n).ord] = n | |
} | |
def initialize doc | |
@doc = doc | |
@scan = StringScanner.new doc | |
end | |
def next_token | |
@scan.skip(IGNORE) | |
return if @scan.eos? | |
case | |
when tok = PUNCTUATION_TABLE[@doc.getbyte(@scan.pos)] then | |
# If the byte at the current position is inside our lookup table, push | |
# the scanner position forward 1 and return the token | |
@scan.pos += 1 | |
tok | |
when len = @scan.skip(KW_RE) then | |
# Return early if uniquely identifiable via length | |
return :ON if len == 2 | |
return :SUBSCRIPTION if len == 12 | |
# Get the position of the start of the keyword | |
start = @scan.pos - len | |
# Get the 2nd and 3rd byte of the keyword and combine to a 16 bit int | |
key = (@doc.getbyte(start + 2) << 8) | @doc.getbyte(start + 1) | |
KW_TABLE[_hash(key)] | |
when @scan.skip(IDENTIFIER) then :IDENTIFIER | |
when @scan.skip(ELLIPSIS) then :ELLIPSIS | |
when @scan.skip(FLOAT) then :FLOAT | |
when @scan.skip(INT) then :INT | |
else | |
@scan.getch | |
:UNKNOWN_CHAR | |
end | |
end | |
KW_TABLE = [:INTERFACE, :MUTATION, :EXTEND, :FALSE, :ENUM, :TRUE, :NULL, | |
nil, nil, nil, nil, nil, nil, nil, :QUERY, nil, nil, :REPEATABLE, | |
:IMPLEMENTS, :INPUT, :TYPE, :SCHEMA, nil, nil, nil, :DIRECTIVE, | |
:UNION, nil, nil, :SCALAR, nil, :FRAGMENT] | |
def _hash key | |
(key * 18592990) >> 27 & 0x1f | |
end | |
end | |
require "benchmark/ips" | |
def allocations | |
x = GC.stat(:total_allocated_objects) | |
yield | |
GC.stat(:total_allocated_objects) - x | |
end | |
def go doc | |
lexer = Lexer.new doc | |
while tok = lexer.next_token | |
# do nothing | |
end | |
end | |
if $0 == __FILE__ | |
doc = ARGF.read | |
Benchmark.ips { |x| x.report { go doc } } | |
p ALLOCATIONS: allocations { go doc } | |
p ALLOCATIONS: allocations { go doc } | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@marktriggs nice! Thanks for the tip. Do you mind trying it on the upstream lexer? Unfortunately I changed it around a bunch, so I'm not sure if the optimization will apply. The changes I've made upstream are pretty similar to this optimization, but it's great to document them here for future internet travelers.