Skip to content

Instantly share code, notes, and snippets.

@ZacLN
Created February 9, 2017 20:32
Show Gist options
  • Save ZacLN/977ce3e0be7d07f51ac45f538cf6c777 to your computer and use it in GitHub Desktop.
Save ZacLN/977ce3e0be7d07f51ac45f538cf6c777 to your computer and use it in GitHub Desktop.
allocationless lexing
using Tokenize
import Tokenize.Lexers:peekchar, readchar, is_identifier_char, accept_batch, eof
import Tokenize.Tokens: FUNCTION, ABSTRACT, IDENTIFIER, BAREMODULE, BEGIN, BITSTYPE, BREAK, CATCH, CONST, CONTINUE, DO, ELSE, ELSEIF, END, EXPORT, FALSE, FINALLY, FOR, FUNCTION, GLOBAL, LET, LOCAL, IF, IMMUTABLE, IMPORT, IMPORTALL, MACRO, MODULE, QUOTE, RETURN, TRUE, TRY, TYPE, TYPEALIAS, USING, WHILE
"""
    tryread(l, str, k, start_loc)

Try to match the characters of `str`, in order, against the upcoming input of
the lexer `l`.

Returns the tuple `(start_loc, position(l.io), kind)`. `kind` is `k` when every
character of `str` matched and the character following the match is not an
identifier character; otherwise the remaining identifier characters are
consumed and `kind` is `IDENTIFIER`. An empty `str` therefore only checks that
the identifier ends at the current position.

# Arguments
- `l`: lexer whose `io` field is the stream being read.
- `str`: iterable of the `Char`s still expected for the keyword.
- `k`: token kind to report on a complete match.
- `start_loc`: start position of the token, passed through unchanged.
"""
function tryread(l, str, k, start_loc)
    for s in str
        # Peek before consuming: a mismatching character is then never taken
        # from the stream, so nothing has to be un-read. (Reading first and
        # seeking back only when `!eof(l.io)` left the final character of the
        # stream consumed, which put a trailing delimiter inside the token.)
        if peekchar(l) != s
            # Either the identifier continues with other characters (consumed
            # here) or it ends right away (nothing is consumed).
            accept_batch(l, is_identifier_char)
            return start_loc, position(l.io), IDENTIFIER
        end
        readchar(l)
    end
    # All of `str` matched; it is only the keyword if the identifier stops here.
    if is_identifier_char(peekchar(l))
        accept_batch(l, is_identifier_char)
        return start_loc, position(l.io), IDENTIFIER
    end
    return start_loc, position(l.io), k
end
"""
    readrest(l, start_loc)

Consume every upcoming identifier character from the lexer `l` and report the
result as an `IDENTIFIER` token: `(start_loc, position(l.io), IDENTIFIER)`.
"""
function readrest(l, start_loc)
    accept_batch(l, is_identifier_char)
    stop_loc = position(l.io)
    return (start_loc, stop_loc, IDENTIFIER)
end
# Consume the characters of `str` from `l` for as long as they match the
# upcoming input. Returns `true` only when all of them matched; on a mismatch
# the offending character is left unread and `false` is returned.
function _accept_prefix(l, str)
    for s in str
        peekchar(l) == s || return false
        readchar(l)
    end
    return true
end

# Choose between a keyword that has just been matched in full (`short_kind`)
# and a longer keyword that extends it by `suffix` (`long_kind`), e.g.
# `else` / `elseif`. The decision is made by peeking, so the character after
# the short keyword is never consumed.
function _keyword_or_longer(l, suffix, short_kind, long_kind, start_loc)
    if peekchar(l) == first(suffix)
        return tryread(l, suffix, long_kind, start_loc)
    end
    # An empty remainder makes `tryread` check only that the identifier ends here.
    return tryread(l, (), short_kind, start_loc)
end

"""
    lex_identifier(l, c)

Lex an identifier or keyword from the lexer `l`, where `c` is the first
character of the token and has already been read from `l.io` by the caller.

Returns `(start_loc, position(l.io), kind)`, where `kind` is the matching
keyword kind (`ABSTRACT`, `BEGIN`, ..., `WHILE`) or `IDENTIFIER`. The start
position is computed as `position(l.io) - 1`, i.e. it assumes `c` occupied a
single byte.

Keywords are recognised by walking a hand-written prefix tree. Each level
peeks at the next character and only consumes it through `tryread`,
`_accept_prefix` or `readrest`, so the character that terminates the token is
always left in the stream.
"""
function lex_identifier(l, c)
    start_loc = position(l.io) - 1
    if c == 'a'
        return tryread(l, ('b', 's', 't', 'r', 'a', 'c', 't'), ABSTRACT, start_loc)
    elseif c == 'b'
        nxt = peekchar(l)
        if nxt == 'a'
            return tryread(l, ('a', 'r', 'e', 'm', 'o', 'd', 'u', 'l', 'e'), BAREMODULE, start_loc)
        elseif nxt == 'e'
            return tryread(l, ('e', 'g', 'i', 'n'), BEGIN, start_loc)
        elseif nxt == 'i'
            return tryread(l, ('i', 't', 's', 't', 'y', 'p', 'e'), BITSTYPE, start_loc)
        elseif nxt == 'r'
            return tryread(l, ('r', 'e', 'a', 'k'), BREAK, start_loc)
        end
        return readrest(l, start_loc)
    elseif c == 'c'
        nxt = peekchar(l)
        if nxt == 'a'
            return tryread(l, ('a', 't', 'c', 'h'), CATCH, start_loc)
        elseif _accept_prefix(l, ('o', 'n'))
            # "con" so far: either `const` or `continue`.
            nxt = peekchar(l)
            if nxt == 's'
                return tryread(l, ('s', 't'), CONST, start_loc)
            elseif nxt == 't'
                return tryread(l, ('t', 'i', 'n', 'u', 'e'), CONTINUE, start_loc)
            end
        end
        return readrest(l, start_loc)
    elseif c == 'd'
        return tryread(l, ('o',), DO, start_loc)
    elseif c == 'e'
        nxt = peekchar(l)
        if nxt == 'n'
            return tryread(l, ('n', 'd'), END, start_loc)
        elseif nxt == 'x'
            return tryread(l, ('x', 'p', 'o', 'r', 't'), EXPORT, start_loc)
        elseif _accept_prefix(l, ('l', 's', 'e'))
            # "else" matched: plain `else`, or `elseif` when "if" follows.
            return _keyword_or_longer(l, ('i', 'f'), ELSE, ELSEIF, start_loc)
        end
        return readrest(l, start_loc)
    elseif c == 'f'
        nxt = peekchar(l)
        if nxt == 'a'
            return tryread(l, ('a', 'l', 's', 'e'), FALSE, start_loc)
        elseif nxt == 'i'
            return tryread(l, ('i', 'n', 'a', 'l', 'l', 'y'), FINALLY, start_loc)
        elseif nxt == 'o'
            return tryread(l, ('o', 'r'), FOR, start_loc)
        elseif nxt == 'u'
            return tryread(l, ('u', 'n', 'c', 't', 'i', 'o', 'n'), FUNCTION, start_loc)
        end
        return readrest(l, start_loc)
    elseif c == 'g'
        return tryread(l, ('l', 'o', 'b', 'a', 'l'), GLOBAL, start_loc)
    elseif c == 'i'
        nxt = peekchar(l)
        if nxt == 'f'
            return tryread(l, ('f',), IF, start_loc)
        elseif nxt == 'm'
            readchar(l)
            if peekchar(l) == 'm'
                return tryread(l, ('m', 'u', 't', 'a', 'b', 'l', 'e'), IMMUTABLE, start_loc)
            elseif _accept_prefix(l, ('p', 'o', 'r', 't'))
                # "import" matched: plain `import`, or `importall`.
                return _keyword_or_longer(l, ('a', 'l', 'l'), IMPORT, IMPORTALL, start_loc)
            end
        end
        return readrest(l, start_loc)
    elseif c == 'l'
        nxt = peekchar(l)
        if nxt == 'e'
            return tryread(l, ('e', 't'), LET, start_loc)
        elseif nxt == 'o'
            return tryread(l, ('o', 'c', 'a', 'l'), LOCAL, start_loc)
        end
        return readrest(l, start_loc)
    elseif c == 'm'
        nxt = peekchar(l)
        if nxt == 'a'
            return tryread(l, ('a', 'c', 'r', 'o'), MACRO, start_loc)
        elseif nxt == 'o'
            return tryread(l, ('o', 'd', 'u', 'l', 'e'), MODULE, start_loc)
        end
        return readrest(l, start_loc)
    elseif c == 'q'
        return tryread(l, ('u', 'o', 't', 'e'), QUOTE, start_loc)
    elseif c == 'r'
        return tryread(l, ('e', 't', 'u', 'r', 'n'), RETURN, start_loc)
    elseif c == 't'
        nxt = peekchar(l)
        if nxt == 'r'
            readchar(l)
            nxt = peekchar(l)
            if nxt == 'u'
                return tryread(l, ('u', 'e'), TRUE, start_loc)
            elseif nxt == 'y'
                # Goes through `tryread` so that e.g. "tryx" is an identifier.
                return tryread(l, ('y',), TRY, start_loc)
            end
        elseif _accept_prefix(l, ('y', 'p', 'e'))
            # "type" matched: plain `type`, or `typealias`.
            return _keyword_or_longer(l, ('a', 'l', 'i', 'a', 's'), TYPE, TYPEALIAS, start_loc)
        end
        return readrest(l, start_loc)
    elseif c == 'u'
        return tryread(l, ('s', 'i', 'n', 'g'), USING, start_loc)
    elseif c == 'w'
        return tryread(l, ('h', 'i', 'l', 'e'), WHILE, start_loc)
    end
    # `c` does not start any keyword.
    if !is_identifier_char(c)
        # `c` was consumed by the caller; step back over it unless the stream
        # is at its end.
        # NOTE(review): if `c` was the very last character of the stream,
        # `eof` is already true and nothing is un-read -- confirm whether the
        # caller relies on this.
        skip(l.io, -Int(!eof(l.io)))
        return start_loc, position(l.io), IDENTIFIER
    end
    return readrest(l, start_loc)
end
# Smoke test: lex one keyword with the hand-written `lex_identifier`.
l = Tokenize.Lexers.Lexer(IOBuffer("export"))
c = readchar(l)
lex_identifier(l, c)

# Print every keyword string next to the token kind `lex_identifier` reports.
for kw in keys(Tokens.KEYWORDS)
    l = Tokenize.Lexers.Lexer(IOBuffer(kw))
    c = readchar(l)
    print(kw)
    println(" ", lex_identifier(l, c)[3])
end

# NOTE(review): `@benchmark` is not brought into scope by the imports visible
# in this file; presumably BenchmarkTools was loaded in the session -- confirm.

# Single keyword, hand-written lexer.
b0 = @benchmark begin
    l = Tokenize.Lexers.Lexer(IOBuffer("baremodule"))
    c = readchar(l)
    lex_identifier(l, c)
end

# Single keyword, Tokenize's own `lex_identifier`.
b1 = @benchmark begin
    l = Tokenize.Lexers.Lexer(IOBuffer("baremodule"))
    c = readchar(l)
    Tokenize.Lexers.lex_identifier(l)
end

# All keywords, hand-written lexer.
b01 = @benchmark begin
    for kw in keys(Tokens.KEYWORDS)
        l = Tokenize.Lexers.Lexer(IOBuffer(kw))
        c = readchar(l)
        lex_identifier(l, c)
    end
end

# All keywords, Tokenize's own `lex_identifier`.
b11 = @benchmark begin
    for kw in keys(Tokens.KEYWORDS)
        l = Tokenize.Lexers.Lexer(IOBuffer(kw))
        c = readchar(l)
        Tokenize.Lexers.lex_identifier(l)
    end
end

# Baseline: only constructs the lexer and reads the first character for each
# keyword, with no identifier lexing at all.
b = @benchmark begin
    for kw in keys(Tokens.KEYWORDS)
        l = Tokenize.Lexers.Lexer(IOBuffer(kw))
        c = readchar(l)
    end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment