Skip to content

Instantly share code, notes, and snippets.

@nilium
Last active August 29, 2015 13:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nilium/8762881 to your computer and use it in GitHub Desktop.
Save nilium/8762881 to your computer and use it in GitHub Desktop.
# Character-level lexer that turns a stream of characters into Token objects
# of kind :bracket, :number, :quote, :string, :regex, :name or :eof.
#
# The lexer is Enumerable: #each yields tokens until the input is exhausted.
class Lexer
  include Enumerable

  # Freezes the value before registering it as a constant.
  # Delegates to Module#const_set (the original called itself and recursed
  # without end).
  def self.const_set(sym, obj)
    super(sym, obj.freeze)
  end

  # Raised for malformed or prematurely terminated input.
  # Derives from StandardError so that a plain `rescue` (including the one in
  # #next_token) sees it.
  class Error < StandardError; end

  # A single lexed token: its kind, the raw text, and where it started.
  #
  # Predicate calls such as `token.number?` are answered dynamically by
  # comparing the method name (minus the trailing `?`) with #kind.
  # Token = Struct.new(:kind, :value, :position, :line, :column, :__mark)
  class Token
    attr_reader :kind, :value, :position, :line, :column

    class << self
      alias_method :[], :new
    end

    # kind     - Symbol naming the token type.
    # value    - the text collected for the token.
    # position - character offset at which the token started.
    # line     - line number at which the token started.
    # column   - column at which the token started.
    # lexer    - callable supplied by Lexer#emit that records this token's
    #            position as the lexer's rewind mark (may be nil).
    def initialize(kind, value, position, line, column, lexer)
      @kind = kind
      @value = value
      @position = position
      @line = line
      @column = column
      @lexer = lexer
    end

    # Invokes the mark callback (if one was supplied) and returns self.
    def mark
      @lexer.call if @lexer.respond_to?(:call)
      self
    end

    # Answers `<kind>?` predicates; everything else falls through to the
    # default NoMethodError handling.
    def method_missing(m, *args)
      name = m.to_s
      return super unless name.end_with?('?')

      kind == name.chomp('?').to_sym
    end

    # Keeps respond_to? consistent with method_missing.
    def respond_to_missing?(m, include_private = false)
      m.to_s.end_with?('?') || super
    end

    # Renders "(kind position:line:column value)" with padded columns.
    def to_s
      location = "#{position}:#{line}:#{column}".ljust(16)
      "(#{kind.to_s.ljust(7)} #{location} #{value.inspect})"
    end
  end

  # Character sets; each is used as a String whose #include? is tested
  # against the next input character.
  WHITESPACE = " \r\n\t".freeze
  ESCAPE_CHARS = "abfnrtv\\\"?0\n".freeze # not referenced inside this class
  DIGITS = '0123456789'.freeze
  HEX_SEPERATOR = 'xX'.freeze
  EXP_SEPERATOR = 'eE'.freeze
  EXP_PLUSMINUS = '+-'.freeze
  DOT = '.'.freeze
  OPEN_BRACKETS = '([{'.freeze
  CLOSE_BRACKETS = ')]}'.freeze
  BRACKETS = "#{OPEN_BRACKETS}#{CLOSE_BRACKETS}".freeze
  QUOTES = "'#".freeze
  HEX_DIGITS = "#{DIGITS}abcdefABCDEF".freeze
  NUMBER_BOUND = "#{QUOTES}#{BRACKETS}#{WHITESPACE}".freeze
  STRING_SENTINELS = '/"'.freeze
  # Characters that end a name: any whitespace (not just the space
  # character), parentheses and the two quote characters.
  NAME_BOUND = "#{WHITESPACE}()\"'".freeze
  INVALID_INPUT =
    'Invalid lexer input -- expected enumerator, enumerable or string-like'.freeze

  # char_input - an Enumerator, or an object responding to #each_char or
  #              #each, that yields one character at a time.
  #
  # Raises ArgumentError for any other kind of input.
  def initialize(char_input)
    @input =
      if char_input.kind_of?(Enumerator) then char_input
      elsif char_input.respond_to?(:each_char) then char_input.each_char
      elsif char_input.respond_to?(:each) then char_input.each
      else raise ArgumentError, INVALID_INPUT
      end
    @buffer = ''.dup
    @include_peeked = ->(set) { set.include?(peek) }
    @line = 1
    @column = 0
    @position = 0
    # Same order that #emit destructures: position, line, column.
    @token_pos = [@position, @line, @column]
    @mark = 0
  end

  # Yields each token in turn and returns self; without a block returns an
  # Enumerator. Iteration stops as soon as no input is left, so an :eof
  # token is only yielded when whitespace precedes the end of input.
  def each
    return to_enum(:each) unless block_given?

    while has_next?
      token = next_token
      break if token.nil?

      yield token
    end
    self
  end

  # Returns the collected text and starts a fresh buffer.
  def flush
    collected = @buffer
    @buffer = ''.dup
    collected
  end

  # Builds a Token of the given kind from the buffered text and the position
  # recorded at the start of the token.
  def emit(kind)
    pos, line, col = *@token_pos
    Token[kind, flush, pos, line, col, -> { @mark = pos }]
  end

  def eof?
    !has_next?
  end

  def eof
    emit :eof
  end

  # Consumes one bracket character if it is next.
  def bracket?
    accept(BRACKETS)
  end

  def bracket
    emit :bracket
  end

  # Consumes the start of a number: a sign followed by a digit, or a digit
  # when nothing has been buffered yet. A lone sign stays in the buffer and
  # is later picked up by #name?.
  def number?
    (accept(EXP_PLUSMINUS) && accept(DIGITS)) ||
      (@buffer.empty? && accept(DIGITS))
  end

  # Consumes the remainder of a number and emits a :number token.
  #
  # Raises Error when hex digits, fraction digits or exponent digits are
  # missing, or when the number is not followed by a delimiter or the end of
  # input.
  def number
    if last == '0' && accept(HEX_SEPERATOR)
      # Hex literal: digits are mandatory, fraction/exponent do not apply.
      expected('hex digits', but_got: peek.inspect) unless accept_run(HEX_DIGITS)
    else
      accept_run(DIGITS)
      if accept(DOT) && !accept_run(DIGITS)
        expected "decimal number (.[0-9]+)"
      end
      if accept(EXP_SEPERATOR)
        accept(EXP_PLUSMINUS)
        expected "exponent ( /e[+-]?\\d+/i )" unless accept_run(DIGITS)
      end
    end
    # The end of input terminates a number just like a delimiter does.
    unless eof? || check(NUMBER_BOUND)
      expected "end of number", but_got: "continued numberness"
    end
    emit :number
  end

  # Consumes one quote character (' or #) if it is next.
  def quote?
    accept(QUOTES)
  end

  def quote
    emit :quote
  end

  # True when the next character opens a string or regex; consumes nothing.
  def string?
    check(STRING_SENTINELS)
  end

  # Consumes a delimited sequence and emits :string (for ") or :regex
  # (for /). The delimiters are dropped; backslash escapes are kept verbatim
  # (backslash plus the following character).
  #
  # Raises Error when the input ends before the closing delimiter.
  def string
    sentinel = read(buffer: false)
    loop do
      # May legitimately consume nothing: empty string, or an escape directly
      # followed by another escape or the closing delimiter.
      accept_until('\\', sentinel)
      unexpected_eof expected: "closing #{sentinel}" unless has_next?

      if peek == '\\'
        read
        unexpected_eof if read.empty?
      else
        read(buffer: false)
        break
      end
    end
    case sentinel
    when '"' then emit :string
    when '/' then emit :regex
    else emit :charseq
    end
  end

  # Consumes characters up to the next name delimiter; true when the buffer
  # holds any text afterwards (including a sign left behind by #number?).
  def name?
    accept_until(NAME_BOUND) || !@buffer.empty?
  end

  def name
    emit :name
  end

  # Raises Error for premature end of input, optionally naming what was
  # expected.
  def unexpected_eof(expected: nil)
    if expected
      raise Error, "Unexpected end of text - expected #{expected}"
    else
      raise Error, "Unexpected end of text"
    end
  end

  # Raises Error describing what was expected and, unless but_got is
  # nil/false, what was found instead (defaults to the next character).
  def expected(expected, but_got: peek.inspect)
    if but_got
      raise Error, "Expected #{expected}, but got #{but_got}"
    else
      raise Error, "Expected #{expected}"
    end
  end

  # Skips whitespace and lexes exactly one token. The recognizers are tried
  # in a fixed order; several of them consume input as a side effect, so the
  # order matters.
  #
  # On any StandardError the current buffer is reported via warn and the
  # error is re-raised.
  def next_token
    skip_run WHITESPACE
    @token_pos = [@position, @line, @column]
    if eof? then eof
    elsif quote? then quote
    elsif string? then string
    elsif bracket? then bracket
    elsif number? then number
    elsif name? then name
    else raise Error, "Unexpected character: #{peek.inspect}"
    end
  rescue StandardError
    warn "Buffer at error point in lexer: #{@buffer.inspect}"
    raise
  end

  # Consumes one character, updating position/line/column, and appends it to
  # the buffer unless buffer: false. Returns '' at end of input.
  def read(buffer: true)
    char = @input.next
    @buffer << char if buffer
    @position += 1
    if char == "\n"
      @line += 1
      @column = 0
    else
      @column += 1
    end
    char
  rescue StopIteration
    ''
  end

  # Without arguments: discards one character, false at end of input.
  # With character sets: discards one character only if it is in a set.
  def skip(*chars)
    return read(buffer: false) != '' if chars.empty?

    if has_next? && chars.any?(&@include_peeked)
      skip
    else
      false
    end
  end

  # Returns the next character without consuming it, '' at end of input.
  def peek
    @input.peek
  rescue StopIteration
    ''
  end

  # True when input remains and any set satisfies the block (by default:
  # contains the next character). Consumes nothing.
  def check(*chars, &block)
    block = @include_peeked unless block_given?
    has_next? && chars.any?(&block)
  end

  # Last buffered character, or '' when the buffer is empty.
  def last
    (!@buffer.empty? && @buffer[-1]) || ''
  end

  # Consumes (and buffers) one character if it is in any of the sets.
  def accept(*chars)
    if has_next? && chars.any?(&@include_peeked)
      c = read
      yield c if block_given?
      true
    else
      false
    end
  end

  # Consumes characters while they are in any of the sets; true if at least
  # one was buffered.
  def accept_run(*chars)
    before = @buffer.length
    while has_next? && chars.any?(&@include_peeked)
      c = read
      yield c if block_given?
    end
    before < @buffer.length
  end

  # Consumes characters until one in any of the sets (or end of input) is
  # next; true if at least one was buffered.
  def accept_until(*chars)
    before = @buffer.length
    until !has_next? || chars.any?(&@include_peeked)
      c = read
      yield c if block_given?
    end
    before < @buffer.length
  end

  # Discards characters while they are in any of the sets; returns self.
  def skip_run(*chars)
    while skip(*chars) ; end
    self
  end

  def has_next?
    !peek.empty?
  end

  # Restarts the input and fast-forwards to the marked position (start of
  # input when nothing is marked). Position, line, column and buffer are
  # reset first so the skipped characters rebuild them consistently. The
  # mark is cleared afterwards. Returns self.
  def rewind
    @input.rewind
    target = @mark
    @buffer = ''.dup
    @line = 1
    @column = 0
    @position = 0
    @mark = 0
    target.times { skip }
    self
  end

  # Records the current position as the point #rewind returns to.
  def mark
    @mark = @position
  end
end
# Convenience entry point: wraps +input+ in a Lexer and returns the result of
# calling its #each without a block.
def lex(input)
  lexer = Lexer.new(input)
  lexer.each
end
# String extension to get unindented heredoc strings.
class String
  # Returns a copy with +column+ leading characters removed from every line.
  #
  # column - number of characters to drop per line. When omitted, the
  #          smallest leading-whitespace width among the non-blank lines is
  #          used (0 if there are none), so blank lines no longer shrink the
  #          detected indentation.
  #
  # Lines shorter than +column+ become empty but keep their line terminator.
  def unindented(column = nil)
    column ||= begin
      indents = each_line
                .reject { |line| line.strip.empty? }
                .map { |line| line[/\A\s*/].length }
      indents.min || 0
    end

    each_line.map do |line|
      body = line.chomp
      terminator = line[body.length..-1]
      "#{body[column..-1]}#{terminator}"
    end.join
  end
end
# Demo: lex a small sample, print the tokens, then rewind and print again
# until the first number token.
source = <<'EOS'.unindented
(echo 'bar 'baz "some string\" with an escape"
/some \r[foo bar] regex/
'(+
0x1
-02
3.5
4.5e-10
)
)
EOS
ltok = lex(source)

# Peek three times in a row and print each result.
3.times { puts ltok.peek }

ltok.each { |token| puts token }

puts "\nrewinding\n\n"
ltok.rewind

ltok.each do |token|
  puts token
  next unless token.number?

  puts "breaking"
  break
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment