Created
January 10, 2017 20:02
-
-
Save quephird/b06651c328ef6918210710c7a80297d6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Hand-written lexer for a small indentation-sensitive language.
#
# Blocks are opened by a ":" followed by a newline and a deeper run of
# spaces, and closed by returning to a shallower indentation.
class Lexer
  # Words that are emitted as their own token type instead of :IDENTIFIER.
  KEYWORDS = %w[def class if true false nil].freeze

  # Every pattern is anchored with \A because it is matched against the
  # not-yet-consumed remainder of the input.
  IDENTIFIER_REGEX = /\A([a-z]\w*)/
  CONSTANT_REGEX = /\A([A-Z]\w*)/
  NUMBER_REGEX = /\A([0-9]+)/
  STRING_REGEX = /\A"([^"]*)"/
  # ":" + newline + one or more spaces: opens a new block.
  NEW_BLOCK_REGEX = /\A\:\n( +)/m
  # newline + zero or more spaces: same block or a dedent.
  INDENT_REGEX = /\A\n( *)/m
  OPERATOR_REGEX = /\A(\|\||&&|==|!=|<=|>=)/
  SPACE_REGEX = /\A /

  # Convert source text into an array of [type, value] token pairs.
  #
  # The argument is never modified, so frozen strings are accepted.
  #
  # @param code [String] source text to tokenize
  # @return [Array<Array>] tokens such as [:IDENTIFIER, "x"], [:NUMBER, 1],
  #   [:INDENT, 2], [:DEDENT, 0], [:NEWLINE, "\n"] or ["+", "+"]
  # @raise [RuntimeError] when a block is opened without a deeper indent, or
  #   when the indent grows without a preceding ":"
  def tokenize_old(code)
    # Drop the trailing line break on a copy; `chomp!` would alter the
    # caller's string and fail on frozen input.
    code = code.chomp

    # This will hold the generated tokens
    tokens = []

    # Number of spaces in the last indent, plus the enclosing levels
    current_indent = 0
    indent_stack = []

    # Current character position
    i = 0
    while i < code.size
      chunk = code[i..-1]

      if identifier = chunk[IDENTIFIER_REGEX, 1]
        if KEYWORDS.include?(identifier)
          tokens << [identifier.upcase.to_sym, identifier]
        else
          tokens << [:IDENTIFIER, identifier]
        end
        # Skip what we just parsed
        i += identifier.size

      elsif constant = chunk[CONSTANT_REGEX, 1]
        tokens << [:CONSTANT, constant]
        i += constant.size

      elsif number = chunk[NUMBER_REGEX, 1]
        tokens << [:NUMBER, number.to_i]
        i += number.size

      elsif string = chunk[STRING_REGEX, 1]
        tokens << [:STRING, string]
        # Skip two more to exclude the `"`s.
        i += string.size + 2

      elsif indent = chunk[NEW_BLOCK_REGEX, 1]
        # Indent should go up when creating a block
        if indent.size <= current_indent
          raise "Bad indent level, got #{indent.size} indents, " +
                "expected > #{current_indent}"
        end
        current_indent = indent.size
        indent_stack.push(current_indent)
        tokens << [:INDENT, indent.size]
        # Skip the spaces plus the ":" and the newline
        i += indent.size + 2

      elsif indent = chunk[INDENT_REGEX, 1]
        if indent.size == current_indent
          # Nothing to do, we're still in the same block
          tokens << [:NEWLINE, "\n"]
        elsif indent.size < current_indent
          # Close every block that is deeper than the new indent
          while indent.size < current_indent
            indent_stack.pop
            current_indent = indent_stack.last || 0
            tokens << [:DEDENT, indent.size]
          end
          tokens << [:NEWLINE, "\n"]
        else
          # Cannot increase indent level without using ":"
          raise "Missing ':'"
        end
        # Skip the spaces plus the newline
        i += indent.size + 1

      elsif operator = chunk[OPERATOR_REGEX, 1]
        tokens << [operator, operator]
        i += operator.size

      elsif chunk.match(SPACE_REGEX)
        # Single spaces between tokens carry no meaning
        i += 1

      else
        # Any other single character is its own token type
        value = chunk[0, 1]
        tokens << [value, value]
        i += 1
      end
    end

    # Close all blocks that are still open at end of input
    until indent_stack.empty?
      indent_stack.pop
      tokens << [:DEDENT, indent_stack.first || 0]
    end

    tokens
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment