VideoCarp/lexer.ex

## lexer.ex
# A small, unoptimized lexer in the Elixir programming language.
# No mutable values are used.
# This lexer was much, much easier for me to write and debug than when I had done it
# using imperative programming concepts such as loops and mutable values.
# This can be optimized by consuming the input string, instead of indexing.

defmodule Lexer do
    @moduledoc """
    A small, unoptimized lexer that turns a source string into a list of
    `{text, type}` tokens.

    The input is scanned by grapheme index with `String.at/2` and no mutable
    state is used. Characters that do not start a known token are skipped.
    """

    @typedoc "A lexed token: the matched text and an atom naming its kind."
    @type token :: {String.t(), atom()}

    @arithmetic_operators ["+", "-", "*", "/", "%"]

    # char info

    @doc """
    Returns `true` when `char` sorts between `"0"` and `"9"` (inclusive).

    Non-binary values such as `nil` (what `String.at/2` yields past the end of
    the input) are never numeric.
    """
    @spec numeric(term()) :: boolean()
    def numeric(char) do
        is_binary(char) and char >= "0" and char <= "9"
    end

    @doc """
    Returns `true` when `char` is numeric, sorts within `"a".."z"` or
    `"A".."Z"`, or is an underscore.
    """
    @spec alphanumeric(term()) :: boolean()
    def alphanumeric(char) do
        numeric(char) or (char >= "a" and char <= "z") or (char >= "A" and char <= "Z") or
            char == "_"
    end

    @doc """
    Returns `true` when `char` is one of the operators `+`, `-`, `*`, `/`, `%`.
    """
    @spec arithmetic(term()) :: boolean()
    def arithmetic(char), do: char in @arithmetic_operators

    # Handlers

    @doc """
    Appends consecutive numeric characters, starting at index `cursor`, to `temp`.

    Returns `{text, next_cursor}` where `next_cursor` is the index of the first
    character that was not consumed.
    """
    @spec handlenum(integer(), String.t(), String.t()) :: {String.t(), integer()}
    def handlenum(cursor, temp, input_strr) do
        scan_while(cursor, temp, input_strr, &numeric/1)
    end

    @doc """
    Appends characters, starting at index `cursor`, to `temp` until a double
    quote is found.

    Returns `{text, quote_cursor}` where `quote_cursor` is the index of the
    closing quote. If the input ends before a closing quote is found, the text
    collected so far is returned together with the index just past the end of
    the input, instead of raising.
    """
    @spec handlestring(integer(), String.t(), String.t()) :: {String.t(), integer()}
    def handlestring(cursor, temp, input_strr) do
        scan_while(cursor, temp, input_strr, fn character -> character != "\"" end)
    end

    @doc """
    Appends consecutive alphanumeric characters, starting at index `cursor`, to
    `temp`.

    Returns `{text, next_cursor}` where `next_cursor` is the index of the first
    character that was not consumed.
    """
    @spec handlealpha(integer(), String.t(), String.t()) :: {String.t(), integer()}
    def handlealpha(cursor, temp, input_strr) do
        scan_while(cursor, temp, input_strr, &alphanumeric/1)
    end

    # Shared scanning loop for the handlers: keeps consuming while `keep?`
    # accepts the current character. The explicit nil check stops the scan at
    # the end of the input, so `temp <> character` never receives nil.
    defp scan_while(cursor, temp, input, keep?) do
        character = String.at(input, cursor)

        if character != nil and keep?.(character) do
            scan_while(cursor + 1, temp <> character, input, keep?)
        else
            {temp, cursor}
        end
    end

    # Maps a single-character token to its type, or nil for any other value.
    defp single_char_type(")"), do: :cparen
    defp single_char_type("("), do: :oparen
    defp single_char_type("{"), do: :obrace
    defp single_char_type("}"), do: :cbrace
    defp single_char_type("="), do: :assignment
    defp single_char_type(_other), do: nil

    @doc """
    Lexes `input_str` into a list of `{text, type}` tokens in source order.

    `len` is the number of characters to scan (the caller passes
    `String.length(input_str)`). `current` is the index to start from and
    `tokenstream` the tokens collected so far, newest first; they default to
    `0` and `[]`, so `lex(len, input_str)` lexes the whole string.

    Token types produced: `:cparen`, `:oparen`, `:obrace`, `:cbrace`,
    `:assignment` (`=` and operator-plus-`=` such as `+=`), `:str`, `:number`,
    `:identifier` and `:arithmetic`. Any other character is ignored.
    """
    @spec lex(integer(), [token()], integer(), String.t()) :: [token()]
    def lex(current \\ 0, tokenstream \\ [], len, input_str)

    def lex(current, tokenstream, len, _input_str) when current >= len do
        # Tokens are prepended while scanning (O(1) on a linked list), so the
        # accumulator is newest-first and must be reversed once at the end.
        Enum.reverse(tokenstream)
    end

    def lex(current, tokenstream, len, input_str) do
        char = String.at(input_str, current)
        single_type = single_char_type(char)

        cond do
            # single-char tokens
            single_type != nil ->
                lex(current + 1, [{char, single_type} | tokenstream], len, input_str)

            # Tokens requiring repetition
            char == "\"" ->
                {token, cursor} = handlestring(current + 1, "", input_str)
                # cursor points at the closing quote; step over it.
                lex(cursor + 1, [{token, :str} | tokenstream], len, input_str)

            numeric(char) ->
                {token, cursor} = handlenum(current, "", input_str)
                lex(cursor, [{token, :number} | tokenstream], len, input_str)

            alphanumeric(char) ->
                {token, cursor} = handlealpha(current, "", input_str)
                lex(cursor, [{token, :identifier} | tokenstream], len, input_str)

            # Conditional tokens: an operator directly followed by "=" is a
            # compound assignment and consumes both characters.
            arithmetic(char) ->
                if String.at(input_str, current + 1) == "=" do
                    lex(current + 2, [{char <> "=", :assignment} | tokenstream], len, input_str)
                else
                    lex(current + 1, [{char, :arithmetic} | tokenstream], len, input_str)
                end

            true ->
                # ignore all other characters
                lex(current + 1, tokenstream, len, input_str)
        end
    end
end

# interfacing
# IO.gets/1 returns :eof or {:error, reason} instead of a string when stdin is
# closed or cannot be read; only lex when a line was actually read.
inp = IO.gets("Input your program: ")

if is_binary(inp) do
    IO.inspect(Lexer.lex(String.length(inp), inp))
else
    IO.puts(:stderr, "Could not read input: #{inspect(inp)}")
end
	# A small, unoptimized lexer in the Elixir programming language.
	# No mutable values are used.
	# This lexer was much, much easier for me to write and debug than when I had done it
	# using imperative programming concepts such as loops and mutable values.
	# This can be optimized by consuming the input string, instead of indexing.

	defmodule Lexer do
	    @moduledoc """
	    A small, unoptimized lexer that turns a source string into a list of
	    `{text, type}` tokens.

	    The input is scanned by grapheme index with `String.at/2` and no mutable
	    state is used. Characters that do not start a known token are skipped.
	    """

	    @typedoc "A lexed token: the matched text and an atom naming its kind."
	    @type token :: {String.t(), atom()}

	    @arithmetic_operators ["+", "-", "*", "/", "%"]

	    # char info

	    @doc """
	    Returns `true` when `char` sorts between `"0"` and `"9"` (inclusive).

	    Non-binary values such as `nil` (what `String.at/2` yields past the end of
	    the input) are never numeric.
	    """
	    @spec numeric(term()) :: boolean()
	    def numeric(char) do
	        is_binary(char) and char >= "0" and char <= "9"
	    end

	    @doc """
	    Returns `true` when `char` is numeric, sorts within `"a".."z"` or
	    `"A".."Z"`, or is an underscore.
	    """
	    @spec alphanumeric(term()) :: boolean()
	    def alphanumeric(char) do
	        numeric(char) or (char >= "a" and char <= "z") or (char >= "A" and char <= "Z") or
	            char == "_"
	    end

	    @doc """
	    Returns `true` when `char` is one of the operators `+`, `-`, `*`, `/`, `%`.
	    """
	    @spec arithmetic(term()) :: boolean()
	    def arithmetic(char), do: char in @arithmetic_operators

	    # Handlers

	    @doc """
	    Appends consecutive numeric characters, starting at index `cursor`, to `temp`.

	    Returns `{text, next_cursor}` where `next_cursor` is the index of the first
	    character that was not consumed.
	    """
	    @spec handlenum(integer(), String.t(), String.t()) :: {String.t(), integer()}
	    def handlenum(cursor, temp, input_strr) do
	        scan_while(cursor, temp, input_strr, &numeric/1)
	    end

	    @doc """
	    Appends characters, starting at index `cursor`, to `temp` until a double
	    quote is found.

	    Returns `{text, quote_cursor}` where `quote_cursor` is the index of the
	    closing quote. If the input ends before a closing quote is found, the text
	    collected so far is returned together with the index just past the end of
	    the input, instead of raising.
	    """
	    @spec handlestring(integer(), String.t(), String.t()) :: {String.t(), integer()}
	    def handlestring(cursor, temp, input_strr) do
	        scan_while(cursor, temp, input_strr, fn character -> character != "\"" end)
	    end

	    @doc """
	    Appends consecutive alphanumeric characters, starting at index `cursor`, to
	    `temp`.

	    Returns `{text, next_cursor}` where `next_cursor` is the index of the first
	    character that was not consumed.
	    """
	    @spec handlealpha(integer(), String.t(), String.t()) :: {String.t(), integer()}
	    def handlealpha(cursor, temp, input_strr) do
	        scan_while(cursor, temp, input_strr, &alphanumeric/1)
	    end

	    # Shared scanning loop for the handlers: keeps consuming while `keep?`
	    # accepts the current character. The explicit nil check stops the scan at
	    # the end of the input, so `temp <> character` never receives nil.
	    defp scan_while(cursor, temp, input, keep?) do
	        character = String.at(input, cursor)

	        if character != nil and keep?.(character) do
	            scan_while(cursor + 1, temp <> character, input, keep?)
	        else
	            {temp, cursor}
	        end
	    end

	    # Maps a single-character token to its type, or nil for any other value.
	    defp single_char_type(")"), do: :cparen
	    defp single_char_type("("), do: :oparen
	    defp single_char_type("{"), do: :obrace
	    defp single_char_type("}"), do: :cbrace
	    defp single_char_type("="), do: :assignment
	    defp single_char_type(_other), do: nil

	    @doc """
	    Lexes `input_str` into a list of `{text, type}` tokens in source order.

	    `len` is the number of characters to scan (the caller passes
	    `String.length(input_str)`). `current` is the index to start from and
	    `tokenstream` the tokens collected so far, newest first; they default to
	    `0` and `[]`, so `lex(len, input_str)` lexes the whole string.

	    Token types produced: `:cparen`, `:oparen`, `:obrace`, `:cbrace`,
	    `:assignment` (`=` and operator-plus-`=` such as `+=`), `:str`, `:number`,
	    `:identifier` and `:arithmetic`. Any other character is ignored.
	    """
	    @spec lex(integer(), [token()], integer(), String.t()) :: [token()]
	    def lex(current \\ 0, tokenstream \\ [], len, input_str)

	    def lex(current, tokenstream, len, _input_str) when current >= len do
	        # Tokens are prepended while scanning (O(1) on a linked list), so the
	        # accumulator is newest-first and must be reversed once at the end.
	        Enum.reverse(tokenstream)
	    end

	    def lex(current, tokenstream, len, input_str) do
	        char = String.at(input_str, current)
	        single_type = single_char_type(char)

	        cond do
	            # single-char tokens
	            single_type != nil ->
	                lex(current + 1, [{char, single_type} | tokenstream], len, input_str)

	            # Tokens requiring repetition
	            char == "\"" ->
	                {token, cursor} = handlestring(current + 1, "", input_str)
	                # cursor points at the closing quote; step over it.
	                lex(cursor + 1, [{token, :str} | tokenstream], len, input_str)

	            numeric(char) ->
	                {token, cursor} = handlenum(current, "", input_str)
	                lex(cursor, [{token, :number} | tokenstream], len, input_str)

	            alphanumeric(char) ->
	                {token, cursor} = handlealpha(current, "", input_str)
	                lex(cursor, [{token, :identifier} | tokenstream], len, input_str)

	            # Conditional tokens: an operator directly followed by "=" is a
	            # compound assignment and consumes both characters.
	            arithmetic(char) ->
	                if String.at(input_str, current + 1) == "=" do
	                    lex(current + 2, [{char <> "=", :assignment} | tokenstream], len, input_str)
	                else
	                    lex(current + 1, [{char, :arithmetic} | tokenstream], len, input_str)
	                end

	            true ->
	                # ignore all other characters
	                lex(current + 1, tokenstream, len, input_str)
	        end
	    end
	end

	# interfacing
	# IO.gets/1 returns :eof or {:error, reason} instead of a string when stdin is
	# closed or cannot be read; only lex when a line was actually read.
	inp = IO.gets("Input your program: ")

	if is_binary(inp) do
	    IO.inspect(Lexer.lex(String.length(inp), inp))
	else
	    IO.puts(:stderr, "Could not read input: #{inspect(inp)}")
	end