Skip to content

Instantly share code, notes, and snippets.

@flamendless
Forked from CapsAdmin/tokenizer.lua
Created July 31, 2019 09:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save flamendless/b46664758bc0b96800ad5d0e137a1302 to your computer and use it in GitHub Desktop.
Save flamendless/b46664758bc0b96800ad5d0e137a1302 to your computer and use it in GitHub Desktop.
-- Words reported by the tokenizer as "keyword" tokens.
-- The list is scanned front to back and the first hit wins, so the entries
-- have to be sorted by longest first.
local keywords = { "function", "then", "end", "if" }
-- Punctuation reported by the tokenizer as "symbol" tokens.
-- Scanned front to back with the first hit winning, so two-character
-- operators are listed ahead of single-character ones.
local symbols = { "~=", "==", "(", ")", "," }
--- Report whether `char` begins with an ASCII decimal digit ("0".."9").
-- Only the first byte of `char` is inspected.
-- @tparam string char text to classify (normally a single character)
-- @treturn boolean true for a digit; false otherwise, including for ""
local function is_number(char)
  local byte = char:byte()
  -- string.byte yields no value for the empty string; comparing nil with a
  -- number would raise, so treat "" as "not a digit".
  if byte == nil then
    return false
  end
  return byte >= 48 and byte <= 57 -- 48 is "0", 57 is "9"
end
--- Report whether `char` begins with an identifier character:
-- an ASCII letter ("A".."Z", "a".."z") or an underscore.
-- Only the first byte of `char` is inspected.
-- @tparam string char text to classify (normally a single character)
-- @treturn boolean true for a letter or "_"; false otherwise, including for ""
local function is_identifier(char)
  local byte = char:byte()
  -- string.byte yields no value for the empty string; comparing nil with a
  -- number would raise, so treat "" as "not an identifier character".
  if byte == nil then
    return false
  end
  -- The two letter ranges are tested separately: bytes 91..96 sit between
  -- "Z" and "a" and are punctuation ("[", "\", "]", "^", "_", "`"), of which
  -- only "_" (95) belongs in an identifier.
  if byte >= 65 and byte <= 90 then -- "A".."Z"
    return true
  end
  if byte >= 97 and byte <= 122 then -- "a".."z"
    return true
  end
  return byte == 95 -- "_"
end
--- Tell whether `char` is exactly one whitespace character.
-- Recognised characters: space, horizontal tab, carriage return, line feed.
-- @tparam string char text to classify
-- @treturn boolean true only when `char` equals one of those four strings
local function is_space(char)
  if char == " " or char == "\t" then
    return true
  end
  return char == "\r" or char == "\n"
end
--- Split Lua-like source text into a flat array of tokens.
--
-- Every token is a table `{type = ..., value = ...}`. `type` is one of
-- "keyword", "symbol", "letter", "space", "line_comment", "string" or
-- "number"; `value` is the exact slice of `code` covered by the token, so
-- concatenating all values in order gives back `code`.
--
-- Classification relies on the file-level `keywords` and `symbols` lists and
-- on the `is_identifier`, `is_space` and `is_number` predicates.
--
-- @tparam string code source text to split
-- @treturn table|nil array of tokens, or nil on failure
-- @treturn string|nil error message when the first result is nil (unexpected
--   character or unterminated string)
function tokenize(code)
  local length = #code
  local pos = 1

  -- Character at the cursor plus `offset` (default 0); "" past the end.
  local function get_char(offset)
    offset = offset or 0
    return code:sub(pos + offset, pos + offset)
  end

  -- True when `text` occurs at the cursor. Empty entries never match, so a
  -- stray "" in a lookup list cannot stall the scan.
  local function starts_with(text)
    return text ~= "" and code:sub(pos, pos + #text - 1) == text
  end

  -- Move the cursor forward while `accepts(char)` holds. The bounds check
  -- comes first so the predicates are never handed "" at end of input.
  local function skip_while(accepts)
    while pos <= length and accepts(get_char()) do
      pos = pos + 1
    end
  end

  local function is_comment_char(char)
    return char ~= "\n"
  end

  local function is_number_char(char)
    return is_number(char) or char == "."
  end

  -- Keyword at the cursor, or nil. A keyword only counts when it is not
  -- immediately followed by another identifier character; otherwise words
  -- such as "endpoint" would be split into "end" + "point".
  local function match_keyword()
    for _, val in ipairs(keywords) do
      if starts_with(val) then
        local following = get_char(#val)
        if following == "" or not is_identifier(following) then
          return val
        end
      end
    end
    return nil
  end

  -- Symbol at the cursor, or nil.
  local function match_symbol()
    for _, val in ipairs(symbols) do
      if starts_with(val) then
        return val
      end
    end
    return nil
  end

  -- Consume one token starting at the cursor.
  -- Returns the token type, or nil plus an error message.
  local function read_token()
    local char = get_char()

    local keyword = match_keyword()
    if keyword then
      pos = pos + #keyword
      return "keyword"
    end

    local symbol = match_symbol()
    if symbol then
      pos = pos + #symbol
      return "symbol"
    end

    if is_identifier(char) then
      skip_while(is_identifier)
      return "letter"
    end

    if is_space(char) then
      skip_while(is_space)
      return "space"
    end

    if char == "-" and get_char(1) == "-" then
      -- Runs up to, but not including, the newline (or to end of input).
      skip_while(is_comment_char)
      return "line_comment"
    end

    if char == "\"" then
      local start = pos
      pos = pos + 1 -- skip the opening quote
      while pos <= length do
        local current = get_char()
        if current == "\\" then
          pos = pos + 2 -- skip the backslash and the character it escapes
        elseif current == "\"" then
          pos = pos + 1 -- the closing quote is part of the token
          return "string"
        else
          pos = pos + 1
        end
      end
      return nil, "unterminated string starting at position " .. start
    end

    if is_number(char) then
      pos = pos + 1 -- skip the first digit
      skip_while(is_number_char)
      return "number"
    end

    return nil, "unxpected character '" .. char .. "' at position " .. pos
  end

  local tokens = {}
  while pos <= length do
    local start = pos
    local kind, err = read_token()
    if not kind then
      return nil, err
    end
    tokens[#tokens + 1] = {type = kind, value = code:sub(start, pos - 1)}
  end
  return tokens
end
-- Self-check: tokenize a small sample, print every token, and verify that
-- the token values glued back together reproduce the sample exactly.
local code = [[
function main(a,b,c)
-- prints hello world to stdout
print("hello world!")
if true ~= false and 1234 then
print("aaa")
end
end
]]
local tokens = assert(tokenize(code))
-- Collect the pieces and join them once; appending with `..` inside the loop
-- would re-copy the growing string on every iteration.
local pieces = {}
for pos, v in ipairs(tokens) do
  print(pos, v.type, v.value)
  pieces[pos] = v.value
end
local new_code = table.concat(pieces)
assert(code == new_code, "token values do not reproduce the original source")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment