Last active
September 27, 2023 16:00
-
-
Save tenderlove/e9bb912648a3d2bce00c4f60bc632a10 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "strscan" | |
class Lexer | |
IDENTIFIER = /[_A-Za-z][_0-9A-Za-z]*\b/ | |
IGNORE = %r{ | |
(?: | |
[, \c\r\n\t]+ | | |
\#.*$ | |
)* | |
}x | |
INT = /[-]?(?:[0]|[1-9][0-9]*)/ | |
FLOAT_DECIMAL = /[.][0-9]+/ | |
FLOAT_EXP = /[eE][+-]?[0-9]+/ | |
FLOAT = /#{INT}#{FLOAT_DECIMAL}#{FLOAT_EXP}|#{FLOAT_DECIMAL}|#{FLOAT_EXP}/ | |
KEYWORDS = [ "on", "fragment", "true", "false", "null", "query", "mutation", | |
"subscription", "schema", "scalar", "type", "extend", "implements", | |
"interface", "union", "enum", "input", "directive", "repeatable" | |
].freeze | |
KW_RE = /#{Regexp.union(KEYWORDS.sort)}\b/ | |
module Literals | |
LCURLY = '{' | |
RCURLY = '}' | |
LPAREN = '(' | |
RPAREN = ')' | |
LBRACKET = '[' | |
RBRACKET = ']' | |
COLON = ':' | |
VAR_SIGN = '$' | |
DIR_SIGN = '@' | |
EQUALS = '=' | |
BANG = '!' | |
PIPE = '|' | |
AMP = '&' | |
end | |
ELLIPSIS = '...' | |
PUNCTUATION_TABLE = Literals.constants.each_with_object([]) { |n, o| | |
o[Literals.const_get(n).ord] = n | |
} | |
def initialize doc | |
@doc = doc | |
@scan = StringScanner.new doc | |
end | |
def next_token | |
@scan.skip(IGNORE) | |
return if @scan.eos? | |
case | |
when tok = PUNCTUATION_TABLE[@doc.getbyte(@scan.pos)] then | |
# If the byte at the current position is inside our lookup table, push | |
# the scanner position forward 1 and return the token | |
@scan.pos += 1 | |
tok | |
when len = @scan.skip(KW_RE) then | |
# Return early if uniquely identifiable via length | |
return :ON if len == 2 | |
return :SUBSCRIPTION if len == 12 | |
# Get the position of the start of the keyword | |
start = @scan.pos - len | |
# Get the 2nd and 3rd byte of the keyword and combine to a 16 bit int | |
key = (@doc.getbyte(start + 2) << 8) | @doc.getbyte(start + 1) | |
KW_TABLE[_hash(key)] | |
when @scan.skip(IDENTIFIER) then :IDENTIFIER | |
when @scan.skip(ELLIPSIS) then :ELLIPSIS | |
when @scan.skip(FLOAT) then :FLOAT | |
when @scan.skip(INT) then :INT | |
else | |
@scan.getch | |
:UNKNOWN_CHAR | |
end | |
end | |
KW_TABLE = [:INTERFACE, :MUTATION, :EXTEND, :FALSE, :ENUM, :TRUE, :NULL, | |
nil, nil, nil, nil, nil, nil, nil, :QUERY, nil, nil, :REPEATABLE, | |
:IMPLEMENTS, :INPUT, :TYPE, :SCHEMA, nil, nil, nil, :DIRECTIVE, | |
:UNION, nil, nil, :SCALAR, nil, :FRAGMENT] | |
def _hash key | |
(key * 18592990) >> 27 & 0x1f | |
end | |
end | |
require "benchmark/ips" | |
def allocations | |
x = GC.stat(:total_allocated_objects) | |
yield | |
GC.stat(:total_allocated_objects) - x | |
end | |
def go doc | |
lexer = Lexer.new doc | |
while tok = lexer.next_token | |
# do nothing | |
end | |
end | |
if $0 == __FILE__ | |
doc = ARGF.read | |
Benchmark.ips { |x| x.report { go doc } } | |
p ALLOCATIONS: allocations { go doc } | |
p ALLOCATIONS: allocations { go doc } | |
end |
@marktriggs nice! Thanks for the tip. Do you mind trying it on the upstream lexer? Unfortunately I changed it around a bunch, so I'm not sure if the optimization will apply. The changes I've made upstream are pretty similar to this optimization, but it's great to document them here for future internet travelers.
@tenderlove Yep! I think you've got the spiritual equivalent in the upstream version 👍
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @tenderlove,
Great post, thanks! Messing around with this test, I see a respectable performance boost by helping the KW_RE short-circuit in more cases:
(stealth edited: now using a zero-width pattern to make sure we don't accept weird keywords like "frue" and "talse". The old "I made this faster by introducing bugs" :P)