Last active
April 11, 2022 00:43
-
-
Save VideoCarp/36bca88c6ef61f0a0d993d70ef6df760 to your computer and use it in GitHub Desktop.
A tiny lexer in Elixir.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A small, unoptimized lexer written in the Elixir programming language.
# No mutable values are used.
# This lexer was much, much easier for me to write and debug than when I had done it
# using imperative programming concepts such as loops and mutable values.
# It could be optimized by consuming the input string as it goes, instead of
# indexing into it with String.at/2 on every step.
defmodule Lexer do
  @moduledoc """
  A small, index-based lexer that turns a source string into a flat list of
  `{lexeme, kind}` tokens.

  Recognised token kinds:

    * `:oparen`, `:cparen`, `:obrace`, `:cbrace` - the four bracket characters
    * `:assignment` - a bare equals sign, or an arithmetic operator directly
      followed by an equals sign (compound assignment)
    * `:arithmetic` - one of the operators plus, minus, star, slash, percent
    * `:number` - a run of ASCII digits
    * `:identifier` - a run of ASCII letters, digits and underscores
    * `:str` - the text between two double quotes (the quotes are dropped)

  Every other character (whitespace included) is skipped.
  """

  @typedoc "A token: the matched text paired with its kind."
  @type token :: {String.t(), atom()}

  # Single-character tokens and the kind each one is tagged with.
  @single_char_tokens %{
    ")" => :cparen,
    "(" => :oparen,
    "{" => :obrace,
    "}" => :cbrace,
    "=" => :assignment
  }

  @arithmetic_operators ["+", "-", "*", "/", "%"]

  # char info

  @doc """
  Returns `true` when `char` is exactly one ASCII digit, `false` otherwise.

  Matching is done on a single byte, so `nil` (what `String.at/2` yields past
  the end of the input) and multi-codepoint graphemes such as a digit followed
  by a combining mark are never treated as digits.
  """
  @spec numeric(term()) :: boolean()
  def numeric(<<byte>>) when byte in ?0..?9, do: true
  def numeric(_other), do: false

  @doc """
  Returns `true` when `char` is exactly one ASCII letter, ASCII digit or
  underscore, `false` otherwise (including for `nil`).
  """
  @spec alphanumeric(term()) :: boolean()
  def alphanumeric(<<byte>>)
      when byte in ?a..?z or byte in ?A..?Z or byte in ?0..?9 or byte == ?_,
      do: true

  def alphanumeric(_other), do: false

  @doc """
  Returns `true` when `char` is one of the arithmetic operator characters.
  """
  @spec arithmetic(term()) :: boolean()
  def arithmetic(char), do: char in @arithmetic_operators

  # Handlers

  @doc """
  Collects consecutive digits of `input_strr` starting at index `cursor`,
  appending them to `temp`.

  Returns `{collected, next_cursor}` where `next_cursor` is the index of the
  first character that is not a digit (or the input length at end of input).
  """
  @spec handlenum(integer(), String.t(), String.t()) :: {String.t(), integer()}
  def handlenum(cursor, temp, input_strr) do
    scan_while(cursor, temp, input_strr, &numeric/1)
  end

  @doc """
  Collects the body of a string literal from `input_strr`, starting at index
  `cursor` (the first character after the opening quote) and appending to
  `temp`.

  Returns `{collected, cursor_of_closing_quote}`. If the input ends before a
  closing double quote is found, everything up to the end of the input is
  returned and the cursor equals the input length, instead of raising.
  """
  @spec handlestring(integer(), String.t(), String.t()) :: {String.t(), integer()}
  def handlestring(cursor, temp, input_strr) do
    scan_while(cursor, temp, input_strr, fn char -> char != "\"" end)
  end

  @doc """
  Collects consecutive identifier characters (letters, digits, underscore) of
  `input_strr` starting at index `cursor`, appending them to `temp`.

  Returns `{collected, next_cursor}` where `next_cursor` is the index of the
  first character that does not belong to the identifier.
  """
  @spec handlealpha(integer(), String.t(), String.t()) :: {String.t(), integer()}
  def handlealpha(cursor, temp, input_strr) do
    scan_while(cursor, temp, input_strr, &alphanumeric/1)
  end

  # Shared scanning loop: keeps appending characters to `acc` while `keep?`
  # accepts them. Stops at the first rejected character or at end of input.
  defp scan_while(cursor, acc, input, keep?) do
    case String.at(input, cursor) do
      # String.at/2 returns nil past the end; stop rather than try `acc <> nil`.
      nil ->
        {acc, cursor}

      char ->
        if keep?.(char) do
          scan_while(cursor + 1, acc <> char, input, keep?)
        else
          {acc, cursor}
        end
    end
  end

  @doc """
  Lexes `input_str` into a list of `{lexeme, kind}` tokens in source order.

  ## Parameters

    * `current` - index of the character to examine next (defaults to `0`)
    * `tokenstream` - tokens found so far, most recent first (defaults to `[]`)
    * `len` - index at which lexing stops, normally `String.length(input_str)`
    * `input_str` - the text being lexed

  Typical use is the two-argument form: `Lexer.lex(String.length(src), src)`.

  An unterminated string literal yields a `:str` token holding the rest of the
  input.
  """
  @spec lex(non_neg_integer(), String.t()) :: [token()]
  @spec lex(non_neg_integer(), [token()], non_neg_integer(), String.t()) :: [token()]
  def lex(current \\ 0, tokenstream \\ [], len, input_str)

  def lex(current, tokenstream, len, _input_str) when current >= len do
    # Tokens are prepended while lexing (O(1) per token), so restore source
    # order once at the end.
    Enum.reverse(tokenstream)
  end

  def lex(current, tokenstream, len, input_str) do
    char = String.at(input_str, current)

    cond do
      # single-char tokens
      Map.has_key?(@single_char_tokens, char) ->
        kind = Map.fetch!(@single_char_tokens, char)
        lex(current + 1, [{char, kind} | tokenstream], len, input_str)

      # Tokens requiring repetition
      char == "\"" ->
        # Start after the opening quote; resume after the closing one.
        {token, cursor} = handlestring(current + 1, "", input_str)
        lex(cursor + 1, [{token, :str} | tokenstream], len, input_str)

      # Digits are checked before identifiers because digits also satisfy
      # alphanumeric/1.
      numeric(char) ->
        {token, cursor} = handlenum(current, "", input_str)
        lex(cursor, [{token, :number} | tokenstream], len, input_str)

      alphanumeric(char) ->
        {token, cursor} = handlealpha(current, "", input_str)
        lex(cursor, [{token, :identifier} | tokenstream], len, input_str)

      # Conditional tokens: an operator directly followed by an equals sign is
      # a compound assignment and consumes both characters.
      arithmetic(char) ->
        if String.at(input_str, current + 1) == "=" do
          lex(current + 2, [{char <> "=", :assignment} | tokenstream], len, input_str)
        else
          lex(current + 1, [{char, :arithmetic} | tokenstream], len, input_str)
        end

      true ->
        # ignore all other characters
        lex(current + 1, tokenstream, len, input_str)
    end
  end
end
# interfacing
# Reads one line of source from standard input, lexes it and prints the tokens.
# IO.gets/1 returns :eof or {:error, reason} instead of a string when no line
# can be read, so those cases are reported rather than passed to String.length/1.
case IO.gets("Input your program: ") do
  inp when is_binary(inp) ->
    tokens = Lexer.lex(String.length(inp), inp)
    IO.inspect(tokens)

  :eof ->
    IO.puts(:stderr, "No input received.")

  {:error, reason} ->
    IO.puts(:stderr, "Could not read input: #{inspect(reason)}")
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment