Instantly share code, notes, and snippets.

@belkadan /lexer.rb Secret
Last active Aug 29, 2015

Embed
What would you like to do?
CS Mini-Tutorial: State Machines
## Return true if l is a single letter (or underscore), false if not.
def letter?(l)
case l
when 'a'..'z', 'A'..'Z', '_'
return true
else
return false
end
end
## Return true if l is a single base-10 digit, false if not.
def digit?(l)
case l
when '0'..'9'
return true
else
return false
end
end
## Return true if l is a space, tab, or newline.
def space?(l)
case l
when ' ', '\t', '\n'
return true
else
return false
end
end
## Return true if l is a single or double quote.
def quote?(l)
case l
when '"', "'"
return true
else
return false
end
end
## Splits the input into tokens and yields them to the given block.
#
# After we run out of tokens, yield a single nil to show that we're done.
def lex(input)
# Delete extra spaces from the beginning and end.
input.strip!
until input.empty?
tok = nil # See sanity check below.
case input[0]
when 'a'..'z', 'A'..'Z', '_'
# Split on word boundaries.
i = 0
i += 1 while letter?(input[i])
tok = input.slice!(0...i)
when '"', "'"
# Look for the same kind of quote later in the input.
# A more complicated lexer would give you a way to put double quotes
# inside other double quotes, usually by putting a backslash first: \".
# This one doesn't bother.
end_quote_idx = input.index(input[0], 1)
raise "missing ending quote" if end_quote_idx.nil?
tok = input.slice!(0..end_quote_idx)
when '(', ')', ',', ':', '-', '>'
# Just take the single character.
tok = input.slice!(0)
when '0'..'9'
i = 0
i += 1 while digit?(input[i])
tok = input.slice!(0...i)
else
raise "unexpected character: #{input[0]}"
end
raise "failed to account for a case" if tok.nil?
yield tok
# We ate a word. The next thing is probably one or more spaces. Delete them!
input.lstrip!
end
# We're done; let our consumer know it.
yield nil
end
if __FILE__ == $0
# For each line in the input, pass it to our lexer and print out the tokens it generates.
# Stop when the user puts in a blank line. (Make sure we account for the newline character.)
ARGF.each do |line|
exit if line.chomp.empty?
lex(line) do |token|
puts(token) unless token.nil?
end
puts
end
end
require_relative 'lexer'
## Returns true if the input is a valid state name:
# either a number or quoted text.
def valid_state_name?(token)
return digit?(token[0]) || quote?(token[0])
end
class State
def initialize(name, is_start, is_final)
@name = name
@start = is_start
@final = is_final
@transitions = [] # Start with no transitions.
end
def name
@name
end
def start?
@start
end
def final?
@final
end
def transitions
@transitions
end
end
## All kinds of transition are one of these.
class Transition
def destination_name
return @destination_name
end
def destination_name=(dst)
raise "destination already set" unless @destination_name.nil?
@destination_name = dst
end
end
## Represents "any-transitions" that match against sets.
class AnyTransition < Transition
def initialize(kind)
# Check to make sure this is a kind we know how to handle.
case kind
when "letter", "number", "space", "symbol"
# do nothing
else
raise "unknown set for 'any': #{kind}"
end
@kind = kind
end
# Get a textual representation of this transition.
def to_s
"any #{@kind} -> #{self.destination_name || 'error'}"
end
end
## Represents "plain-transitions" that match against text.
class PlainTransition < Transition
def initialize(quoted_text)
@quoted_text = quoted_text
end
# Get a textual representation of this transition.
def to_s
"#{@quoted_text} -> #{self.destination_name || 'error'}"
end
end
## Parses a "new state" production.
def parse_state(lexer)
# Allowed patterns:
# state 1
# start state 2
# final state 3
# start final state 4
is_start = false
is_final = false
token = lexer.next()
if token == "start"
is_start = true
token = lexer.next()
end
if token == "final"
is_final = true
token = lexer.next()
end
if token == "state"
# Ignore the word "state"; it's optional when you have "start" or "final".
token = lexer.next()
end
raise "expected new state label, but got '#{token}'" unless valid_state_name?(token)
colon = lexer.next()
raise "expected colon after state, but got '#{colon}'" unless colon == ':'
return State.new(token, is_start, is_final)
end
## Parses a "new transition" production.
def parse_transition(lexer)
# Allowed patterns:
# any letter -> 1
# "abc" -> 2
# 'def' -> 3
transition_label = lexer.next()
if transition_label == "any"
kind = lexer.next()
transition = AnyTransition.new(kind)
else
raise "expected transition label, but got '#{transition_label}" unless quote?(transition_label[0])
transition = PlainTransition.new(transition_label)
end
arrow_start = lexer.next()
raise "expected -> after transition label, but got '#{arrow_start}'" unless arrow_start == '-'
arrow_end = lexer.next()
raise "expected -> after transition label, but got '#{arrow_start}#{arrow_end}'" unless arrow_end == '>'
destination_name = lexer.next()
if destination_name == "error"
# Leave the destination empty in the transition to mean "error".
else
raise "expected destination state label, but got '#{destination_name}'" unless valid_state_name?(destination_name)
transition.destination_name = destination_name
end
return transition
end
## Given text input, parse it into a list of states, each with a list of transitions.
def parse(input)
# Use our lex() function from last time,
# but put it in a form where we can get words on demand.
lexer = enum_for(:lex, input)
# Start with no states and no current state.
states = []
current_state = nil
while true
# Look at the first token from the lexer, but don't consume it yet.
first_token = lexer.peek()
# If we're done, jump out of the loop!
break if first_token.nil?
# Are we trying to make a new state or a new transition?
case first_token
when "state", "start", "final"
current_state = parse_state(lexer)
states << current_state
else
new_transition = parse_transition(lexer)
raise "no current state" if current_state.nil?
current_state.transitions << new_transition
end
end
return states
end
if __FILE__ == $0
# Parse the entire input file, then print out each resulting state.
parse(ARGF.read()).each do |state|
# First print the state's name and attributes.
print("#{state.name}")
if state.start? && state.final?
print(" (start, final)")
elsif state.start?
print(" (start)")
elsif state.final?
print(" (final)")
end
# What's the difference between 'print' and 'puts'?
# The latter adds a line break; the former doesn't.
puts(":")
# Then print all its transitions.
state.transitions.each do |transition|
puts("\t#{transition.to_s}")
end
puts
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment