# March 9, 2010
sys: require('sys')
Lexer: require('../lib/lexer').Lexer
# Matches a bare-variable interpolation anchored at the start of the input:
# a literal `$` followed by a name that begins with a letter, `_` or `@` and
# continues with word characters. The name (without the `$`) is capture
# group 1.
INTERPOLATION : /^\$([a-zA-Z_@]\w*)/
# A cut-down lexer used as a scratch harness: it holds the text being lexed,
# a cursor into it, and the list of tokens produced so far.
class PartialLexer

  # Set up lexer state for `chunk`, the text to be lexed.
  #   @i      -- number of characters of the chunk consumed so far.
  #   @line   -- current line number; initialised to 0 so that the
  #              "starting on line ${@line + 1}" error message and the
  #              `line:` option handed to nested lexers are numeric instead
  #              of NaN/undefined.
  #   @chunk  -- the text being lexed.
  #   @tokens -- the token stream, a list of [tag, value] pairs.
  constructor: (chunk) ->
    @i: 0
    @line: 0
    @chunk: chunk
    @tokens: []

  # Append a [tag, value] pair to the token stream.
  token: (tag, value) ->
    @tokens.push([tag, value])

  # Stub: always answers false, so no previous-token tag is ever reported.
  tag: -> false
# Work area
# ---------
# Matches regular expression literals. While lexing, a regular expression is
# difficult to distinguish from division, so we borrow some basic heuristics
# from JavaScript and Ruby.
regex_token: ->
  # Try to lex a regular expression literal at the start of @chunk.
  #
  # Returns false (consuming nothing) when the chunk does not start with a
  # balanced `/.../` group, when the group starts with "/" followed by
  # whitespace or contains a newline, or when the previous tag is listed in
  # NOT_REGEX. Otherwise emits tokens and advances @i past the literal and
  # its flags.
  #
  # A literal without interpolation becomes a single REGEX token. A literal
  # whose body contains a `${ ... }` group is expanded into the tokens for
  # `(new RegExp(<interpolated string>, '<flags>'))`.
  return false unless literal: @balanced_token supress: true, ['/', '/']
  return false if literal.match /^\/\s+|\n/
  return false if include NOT_REGEX, @tag()

  # Collect trailing flag characters one at a time, straight from the chunk.
  known_flags: ['i', 'm', 'g', 'y']
  flags: ''
  while (index: known_flags.indexOf(@chunk.substr(literal.length + flags.length, 1))) >= 0
    flags += known_flags[index]

  # The text between the delimiters. Taken by position rather than by
  # splitting on '/', so the full literal length stays available for
  # advancing the cursor below.
  body: literal.substring 1, literal.length - 1

  # Interpolated only when a `${` is actually present and a `}` follows it.
  # Each indexOf call is parenthesised: without the parentheses the
  # comparison would be swallowed into the first call's argument.
  opening: body.indexOf('${')
  if opening >= 0 and body.indexOf('}', opening) > opening
    # Double every backslash so it survives inside the generated string.
    escaped: body.replace(/\\/g, '\\\\')
    @tokens: @tokens.concat [['(', '('], ['NEW', 'new'], ['IDENTIFIER', 'RegExp'], ['CALL_START', '(']]
    @interpolate_string "\"$escaped\""
    @tokens: @tokens.concat [[',', ','], ['STRING', "'$flags'"], ['CALL_END', ')'], [')', ')']]
  else
    @token 'REGEX', literal + flags
  @i += literal.length + flags.length
# Used methods with no changes
# ----------------------------
# Matches a balanced group such as a single or double-quoted string. Pass in
# a series of delimiters, all of which must be nested correctly within the
# contents of the string. This method allows us to have strings within
# interpolations within strings etc...
balanced_string: (str, supress, delimited...) ->
  # Scan `str` from its first character for a correctly nested group.
  #
  #   str        -- the text to scan.
  #   supress    -- when truthy, an unterminated group yields false instead
  #                 of throwing.
  #   delimited  -- any number of [open, close] delimiter pairs.
  #
  # Returns the matched prefix of `str`, delimiters included, or false when
  # `str` does not begin with an opening delimiter (or is unterminated and
  # `supress` is set). Throws an Error for an unterminated group otherwise.
  #
  # `levels` is the stack of pairs currently open: it is pushed on an opening
  # delimiter and popped on the matching close. Without that bookkeeping no
  # nesting is tracked at all.
  levels: []
  i: 0
  while i < str.length
    for pair in delimited
      [open, close]: pair
      if levels.length and starts str, '\\', i
        # Inside a group a backslash escapes the next character: skip it.
        i += 1
        break
      else if levels.length and starts(str, close, i) and levels[levels.length - 1] is pair
        levels.pop()
        i += close.length - 1
        # When the outermost group closes, step past the closer so that it
        # is part of the returned prefix.
        i += 1 unless levels.length
        break
      else if starts str, open, i
        levels.push(pair)
        i += open.length - 1
        break
    # Nothing open: either the group just closed or none was ever started.
    break unless levels.length
    i += 1
  if levels.length
    throw new Error "SyntaxError: Unterminated ${levels.pop()[0]} starting on line ${@line + 1}" unless supress
    return false
  return false if i is 0
  return str.substring(0, i)
# Matches a token in which the passed delimiter pairs must be correctly
# balanced (i.e. strings, JS literals).
balanced_token: (supress, delimited...) ->
  # Run `balanced_string` against the lexer's own @chunk, forwarding the
  # `supress` flag and every delimiter pair unchanged, and hand back whatever
  # it returns.
  forwarded: [@chunk, supress].concat delimited
  @balanced_string.apply this, forwarded
# Expand variables and expressions inside double-quoted strings using
# ECMA Harmony's interpolation syntax
# for substitution of bare variables as well as arbitrary expressions.
# "Hello $name."
# "Hello ${name.capitalize()}."
# If it encounters an interpolation, this method will recursively create a
# new Lexer, tokenize the interpolated contents, and merge them into the
# token stream.
interpolate_string: (str) ->
  # Tokenize the quoted string literal `str`, expanding interpolations.
  #
  # Strings shorter than three characters, or not starting with a double
  # quote, are emitted unchanged as a single STRING token. Otherwise the
  # string is cut into literal pieces (STRING), bare `$name` references
  # (IDENTIFIER, with a leading `@` rewritten to `this.`) and `${ ... }`
  # expressions (lexed by a fresh Lexer), and the pieces are appended to
  # @tokens joined by '+' tokens.
  #
  # Each alternative is an explicit if/else: without the `else` branches a
  # plain string would be emitted and then scanned a second time, an empty
  # string would follow every expression, and a raw 'TOKENS' entry would be
  # pushed onto the stream.
  if str.length < 3 or not starts str, '"'
    @token 'STRING', str
  else
    lexer: new Lexer()
    tokens: []
    quote: str.substring(0, 1)
    # i is the scan position; pi is where the pending literal piece begins.
    [i, pi]: [1, 1]
    while i < str.length - 1
      if starts str, '\\', i
        # Escaped character: skip it so it cannot start an interpolation.
        i += 1
      else if match: str.substring(i).match INTERPOLATION
        [group, interp]: match
        interp: "this.${ interp.substring(1) }" if starts interp, '@'
        tokens.push ['STRING', "$quote${ str.substring(pi, i) }$quote"] if pi < i
        tokens.push ['IDENTIFIER', interp]
        i += group.length - 1
        pi: i + 1
      else if (expr: @balanced_string str.substring(i), supress: false, ['${', '}'])
        tokens.push ['STRING', "$quote${ str.substring(pi, i) }$quote"] if pi < i
        inner: expr.substring(2, expr.length - 1)
        if inner.length
          # NOTE(review): if Lexer#tokenize appends a trailing terminator
          # token it would need to be dropped here -- confirm against
          # ../lib/lexer.
          nested: lexer.tokenize "($inner)", {rewrite: no, line: @line}
          tokens.push ['TOKENS', nested]
        else
          # An empty `${}` contributes an empty string.
          tokens.push ['STRING', "$quote$quote"]
        i += expr.length - 1
        pi: i + 1
      i += 1
    # Flush whatever literal text follows the last interpolation.
    tokens.push ['STRING', "$quote${ str.substring(pi, i) }$quote"] if pi < i and pi < str.length - 1
    for each, index in tokens
      if each[0] is 'TOKENS'
        @tokens: @tokens.concat each[1]
      else
        @token each[0], each[1]
      @token '+', '+' if index < tokens.length - 1
# Does a list include a value?
include: (list, value) ->
  # Answers true when `list.indexOf(value)` finds `value`, false otherwise.
  position: list.indexOf value
  position isnt -1
# Peek at the beginning of a given string to see if it matches a sequence.
starts: (string, literal, start) ->
  # Answers true when `literal` occurs in `string` at offset `start`.
  # With `start` omitted the comparison is made at the beginning of `string`.
  finish: (start or 0) + literal.length
  literal is string.substring(start, finish)
# Tokens which a regular expression will never immediately follow, but which
# a division operator might.
# See:
# Our list is shorter, due to sans-parentheses method calls.
# Bound to the name `NOT_REGEX` because `regex_token` looks the list up by
# that name (`include NOT_REGEX, @tag()`); a bare comma list defines nothing.
NOT_REGEX: [
  'NUMBER', 'REGEX', '++', '--', 'FALSE', 'NULL', 'TRUE'
]
# Scratch driver: build a PartialLexer over a sample chunk that begins with a
# regular expression literal, then print its token stream one token per line.
# NOTE(review): nothing here invokes a token method, so `a.tokens` is still
# the empty list set by the constructor when it is printed -- confirm whether
# a call such as `a.regex_token()` was intended.
sample: /\d+\s+/.toString() + ') was a Happy Bunny.'
a: new PartialLexer(sample)
puts a.tokens.join('\n')
