Last active
August 29, 2015 13:55
-
-
Save nilium/8762881 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Character-stream lexer for a small Lisp-like syntax.
#
# Input is consumed through an external Enumerator (one character at a time)
# and turned into Token objects of kind :eof, :quote, :string, :regex,
# :bracket, :number or :name. The lexer is Enumerable over its tokens.
class Lexer
  include Enumerable

  # Defines a constant on the lexer after freezing its value.
  #
  # @param sym [Symbol, String] constant name
  # @param obj [Object] constant value; frozen before being stored
  # @return [Object] the frozen value
  def self.const_set(sym, obj)
    # Delegate to Module#const_set; calling const_set here again would
    # recurse forever.
    super(sym, obj.freeze)
  end

  # Raised for malformed input. Subclasses StandardError so that a plain
  # `rescue` (including the one in #next_token) sees it.
  class Error < ::StandardError; end

  # A single lexed token together with the place it started at.
  class Token
    attr_reader :kind, :value, :position, :line, :column

    class << self
      # Token[...] is shorthand for Token.new(...).
      alias_method :[], :new
    end

    # @param kind [Symbol] token kind (e.g. :name, :number)
    # @param value [String] the buffered text of the token
    # @param position [Integer] character offset where the token started
    # @param line [Integer] line where the token started
    # @param column [Integer] column where the token started
    # @param lexer [#call] callable invoked by #mark (Lexer#emit passes a
    #   lambda that records this token's position as the lexer's mark)
    def initialize(kind, value, position, line, column, lexer)
      @kind = kind
      @value = value
      @position = position
      @line = line
      @column = column
      @lexer = lexer
    end

    # Invokes the callable supplied at construction time.
    #
    # @return [Token] self
    def mark
      @lexer.call
      self
    end

    # Answers `<kind>?` predicates (e.g. `token.number?`) by comparing the
    # method name, minus the trailing '?', against #kind. Anything else is
    # passed up so the usual NoMethodError (with a proper message) is raised.
    def method_missing(m, *args, &block)
      id_s = m.to_s
      return super unless id_s.end_with?('?')

      kind == id_s.chomp('?').to_sym
    end

    # Keeps respond_to? in agreement with #method_missing.
    def respond_to_missing?(m, include_private = false)
      m.to_s.end_with?('?') || super
    end

    # @return [String] "(kind position:line:column value.inspect)" with the
    #   kind padded to 7 and the location padded to 16 characters
    def to_s
      location = "#{position}:#{line}:#{column}"
      "(#{kind.to_s.ljust(7)} #{location.ljust(16)} #{value.inspect})"
    end
  end

  WHITESPACE = " \r\n\t".freeze
  ESCAPE_CHARS = "abfnrtv\\\"?0\n".freeze
  DIGITS = '0123456789'.freeze
  HEX_SEPERATOR = 'xX'.freeze
  EXP_SEPERATOR = 'eE'.freeze
  EXP_PLUSMINUS = '+-'.freeze
  DOT = '.'.freeze
  OPEN_BRACKETS = '([{'.freeze
  CLOSE_BRACKETS = ')]}'.freeze
  BRACKETS = "#{OPEN_BRACKETS}#{CLOSE_BRACKETS}".freeze
  QUOTES = "'#".freeze
  HEX_DIGITS = "#{DIGITS}abcdefABCDEF".freeze
  NUMBER_BOUND = "#{QUOTES}#{BRACKETS}#{WHITESPACE}".freeze
  # Characters that end a name: every whitespace character, every bracket
  # and both quote characters, so names never swallow newlines or brackets.
  NAME_BOUND = "#{WHITESPACE}#{BRACKETS}\"'".freeze
  STRING_SENTINELS = '/"'.freeze
  INVALID_INPUT =
    "Invalid lexer input -- expected enumerator, enumerable or string-like".freeze

  # @param char_input [Enumerator, #each_char, #each] source of characters
  # @raise [ArgumentError] if char_input is none of the accepted shapes
  def initialize(char_input)
    @input =
      case
      when char_input.kind_of?(Enumerator) then char_input
      when char_input.respond_to?(:each_char) then char_input.each_char
      when char_input.respond_to?(:each) then char_input.each
      else raise ArgumentError, INVALID_INPUT
      end
    @buffer = ''.dup
    # Shared predicate: does the given character set contain the next char?
    @include_peeked = ->(set) { set.include?(peek) }
    @line = 1
    @column = 0
    @position = 0
    # Same order that #next_token writes and #emit reads.
    @token_pos = [@position, @line, @column]
    @mark = 0
  end

  # Yields tokens while input remains; without a block returns an Enumerator.
  #
  # @return [Lexer, Enumerator]
  def each(&block)
    return to_enum(:each) unless block_given?

    while has_next?
      token = next_token
      break if token.nil?
      yield token
    end
    self
  end

  # Returns the buffered text and starts a fresh buffer.
  def flush
    temp = @buffer
    @buffer = ''.dup
    temp
  end

  # Builds a token of the given kind from the buffered text and the start
  # position recorded by #next_token.
  def emit(kind)
    pos, line, col = *@token_pos
    Token[kind, flush, pos, line, col, -> { @mark = pos }]
  end

  def eof?
    !has_next?
  end

  def eof
    emit :eof
  end

  def bracket?
    accept(BRACKETS)
  end

  def bracket
    emit :bracket
  end

  # Consumes an optional sign followed by a digit, or a single leading digit.
  # A sign with no digit after it stays in the buffer and ends up in a name.
  def number?
    (accept(EXP_PLUSMINUS) && accept(DIGITS)) ||
      (@buffer.empty? && accept(DIGITS))
  end

  # Consumes the rest of a number whose first digit #number? already read.
  #
  # @raise [Error] on missing hex digits, fraction digits or exponent
  #   digits, or when the number is not followed by a boundary character
  def number
    if last == '0' && accept(HEX_SEPERATOR) && !accept_run(HEX_DIGITS)
      expected 'hex digits', but_got: peek.inspect
    else
      # NOTE(review): this branch also runs after a valid hex literal, so
      # input like "0x1.5" passes the checks below -- confirm intended.
      accept_run(DIGITS)
      if accept('.') && !accept_run(DIGITS)
        expected "decimal number (.[0-9]+)"
      end
    end
    if accept(EXP_SEPERATOR)
      accept(EXP_PLUSMINUS)
      expected "exponent ( /e[+-]?\\d+/i )" unless accept_run(DIGITS)
    end
    # End of input is a valid end of number; #check alone is false there.
    unless eof? || check(NUMBER_BOUND)
      expected "end of number", but_got: "continued numberness"
    end
    emit :number
  end

  def quote?
    accept(QUOTES)
  end

  def quote
    emit :quote
  end

  def string?
    check(STRING_SENTINELS)
  end

  # Consumes a sentinel-delimited literal. The sentinels themselves are not
  # buffered; a backslash and the character after it are buffered verbatim.
  #
  # @return [Token] :string for '"', :regex for '/', :charseq otherwise
  # @raise [Error] if the input ends before the closing sentinel
  def string
    sentinel = read(buffer: false)
    loop do
      # May consume nothing (empty literal, or an escape directly followed
      # by the sentinel or another escape) -- that is not an error.
      accept_until('\\', sentinel)
      case peek
      when '\\'
        unexpected_eof if read.empty? || read.empty?
      when sentinel
        read(buffer: false)
        break
      else
        # peek is '' here: the input ran out before the closing sentinel.
        unexpected_eof expected: "closing #{sentinel}"
      end
    end
    case sentinel
    when '"' then emit :string
    when '/' then emit :regex
    else emit :charseq
    end
  end

  # Consumes characters up to the next name boundary. Also true when the
  # buffer already holds text (e.g. a sign left behind by #number?).
  def name?
    accept_until(NAME_BOUND) || !@buffer.empty?
  end

  def name
    emit :name
  end

  # @param expected [String, nil] description of what should have followed
  # @raise [Error] always
  def unexpected_eof(expected: nil)
    if expected
      raise Error, "Unexpected end of text - expected #{expected}"
    else
      raise Error, "Unexpected end of text"
    end
  end

  # @param expected [String] description of what was expected
  # @param but_got [String, nil] description of what was found; defaults to
  #   the inspected next character
  # @raise [Error] always
  def expected(expected, but_got: peek.inspect)
    if but_got
      raise Error, "Expected #{expected}, but got #{but_got}"
    else
      raise Error, "Expected #{expected}"
    end
  end

  # Skips whitespace, records the start position and lexes one token.
  # On failure the current buffer is written to stderr before re-raising.
  #
  # @return [Token]
  def next_token
    skip_run WHITESPACE
    @token_pos = [@position, @line, @column]
    case
    when eof? then eof
    when quote? then quote
    when string? then string
    when bracket? then bracket
    when number? then number
    when name? then name
    else raise Error, "Unexpected character: #{peek.inspect}"
    end
  rescue
    warn "Buffer at error point in lexer: #{@buffer.inspect}"
    raise
  end

  # Consumes one character, updating position, line and column.
  #
  # @param buffer [Boolean] whether to append the character to the buffer
  # @return [String] the character, or '' at end of input
  def read(buffer: true)
    char = @input.next
    @buffer << char if buffer
    @position += 1
    if char == "\n"
      @line += 1
      @column = 0
    else
      @column += 1
    end
    char
  rescue StopIteration
    ''
  end

  # With no arguments, discards one character (false at end of input).
  # With character sets, discards the next character only if one of the
  # sets contains it.
  def skip(*chars)
    return read(buffer: false) != '' if chars.empty?
    return false unless check(*chars)

    skip
  end

  # @return [String] the next character without consuming it, '' at end
  def peek
    @input.peek
  rescue StopIteration
    ''
  end

  # True when input remains and any of chars satisfies the block (by
  # default: contains the next character).
  def check(*chars, &block)
    block = @include_peeked unless block_given?
    has_next? && chars.any?(&block)
  end

  # @return [String] last buffered character, or '' when the buffer is empty
  def last
    @buffer.empty? ? '' : @buffer[-1]
  end

  # Consumes and buffers the next character if any set in chars contains it.
  def accept(*chars)
    return false unless check(*chars)

    c = read
    yield c if block_given?
    true
  end

  # Consumes characters while they belong to chars.
  #
  # @return [Boolean] whether the buffer grew
  def accept_run(*chars)
    blen = @buffer.length
    while check(*chars)
      c = read
      yield c if block_given?
    end
    blen < @buffer.length
  end

  # Consumes characters until one belonging to chars (or end of input).
  #
  # @return [Boolean] whether the buffer grew
  def accept_until(*chars)
    blen = @buffer.length
    while has_next? && !chars.any?(&@include_peeked)
      c = read
      yield c if block_given?
    end
    blen < @buffer.length
  end

  def skip_run(*chars)
    while skip(*chars); end
    self
  end

  def has_next?
    !peek.empty?
  end

  # Restarts the input. If a mark is set, re-reads up to it so lexing
  # resumes at the marked position, then clears the mark.
  #
  # @return [Lexer] self
  def rewind
    @input.rewind
    # The bookkeeping must restart with the input; otherwise positions keep
    # counting from wherever the previous pass stopped.
    @buffer = ''.dup
    @position = 0
    @line = 1
    @column = 0
    @token_pos = [@position, @line, @column]
    if @mark > 0
      @mark.times { skip }
      @mark = 0
    end
    self
  end

  # Records the current position as the point #rewind returns to.
  def mark
    @mark = @position
  end
end
# Convenience entry point: wraps +input+ in a Lexer and hands back whatever
# Lexer#each returns when called without a block.
def lex(input)
  tokenizer = Lexer.new(input)
  tokenizer.each
end
# String extension to get unindented heredoc strings.
class String
  # Removes common leading indentation (spaces and tabs) from every line.
  #
  # @param column [Integer, nil] number of leading whitespace characters to
  #   strip; when nil, the smallest indentation among non-blank lines is used
  # @return [String] a new string with at most +column+ leading spaces/tabs
  #   removed from each line; line endings and blank lines are preserved
  def unindented(column = nil)
    # Blank lines carry no indentation information, and the newline itself
    # must not be counted as indentation. Falls back to 0 when every line
    # is blank (or the string is empty).
    column ||= each_line
               .reject { |line| line.strip.empty? }
               .map { |line| line[/\A[ \t]*/].length }
               .min || 0
    # Strip only whitespace, so short or less-indented lines never lose
    # their content or their newline.
    each_line.map { |line| line.sub(/\A[ \t]{0,#{column}}/, '') }.join
  end
end
# Demo: lex a small sample program, print its tokens, then rewind and print
# tokens again until the first number is seen.
sample_source = <<'EOS'.unindented
(echo 'bar 'baz "some string\" with an escape"
/some \r[foo bar] regex/
'(+
0x1
-02
3.5
4.5e-10
)
)
EOS
ltok = lex(sample_source)

# Peeking repeatedly shows the same upcoming token each time.
3.times { puts ltok.peek }

ltok.each { |token| puts token }

puts "\nrewinding\n\n"
ltok.rewind

ltok.each do |token|
  puts token
  next unless token.number?

  puts "breaking"
  break
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment