# StanAngeloff/partial_lexer.coffee

## partial_lexer.coffee
sys: require('sys')
Lexer: require('../lib/lexer').Lexer

# Matches a bare `$name` (or `$@name`) interpolation anchored at the start of
# a string. The capture group holds the name without the leading `$`.
INTERPOLATION : /^\$([a-zA-Z_@]\w*)/

# A cut-down lexer used as a scratch pad: it keeps only the state (`@i`,
# `@line`, `@chunk`, `@tokens`) and the helper methods that `regex_token`
# needs, so the regex handling can be exercised on its own.
class PartialLexer

  # Begin at the start of `chunk` with an empty token stream. `@line` is set up
  # here because `balanced_string` reports `@line + 1` in its error message and
  # `interpolate_string` hands `@line` to the nested lexer.
  constructor: (chunk) ->
    @i:      0
    @line:   0
    @chunk:  chunk
    @tokens: []

  # Append a `[tag, value]` pair to the token stream.
  token: (tag, value) ->
    @tokens.push([tag, value])

  # Stub consulted by `regex_token` for its `NOT_REGEX` check; always `false`.
  tag: -> false

  # Work area
  # ---------

  # Matches regular expression literals. Lexing regular expressions is difficult
  # to distinguish from division, so we borrow some basic heuristics from
  # JavaScript and Ruby.
  #
  # A literal containing a `${ ... }` interpolation is rewritten into the tokens
  # for `(new RegExp("...", 'flags'))`, with the pattern run through
  # `interpolate_string`; any other literal becomes a single `REGEX` token.
  # Returns `true` after consuming the literal (advancing `@i` past the whole
  # literal, flags included) and `false` when no regex is matched.
  #
  # Notes on the interpolation branch:
  #
  # * both `indexOf` calls are parenthesised, because the bare form
  #   `regex.indexOf '}' > regex.indexOf '${'` passes the comparison itself as
  #   the argument and always produces a truthy -1;
  # * the pattern and flags are cut apart at the last slash, so an escaped `\/`
  #   inside the pattern is not mistaken for the closing delimiter;
  # * `regex` itself is left untouched, so `@i` advances by the full literal;
  # * every backslash in the pattern is doubled, not just the first one.
  regex_token: ->
    return false unless regex: @balanced_token supress: true, ['/', '/']
    return false if regex.match /^\/\s+|\n/
    return false if include NOT_REGEX, @tag()
    flags: ['i', 'm', 'g', 'y']
    regex += flags[index] while (index: flags.indexOf @chunk.substr regex.length, 1) >= 0
    open_at: regex.indexOf('${')
    if open_at >= 0 and regex.indexOf('}', open_at) > open_at
      close_at:  regex.lastIndexOf('/')
      pattern:   regex.substring(1, close_at)
      modifiers: regex.substring(close_at + 1)
      escaped:   pattern.replace(/\\/g, '\\\\')
      @tokens: @tokens.concat [['(', '('], ['NEW', 'new'], ['IDENTIFIER', 'RegExp'], ['CALL_START', '(']]
      @interpolate_string "\"$escaped\""
      @tokens: @tokens.concat [[',', ','], ['STRING', "'$modifiers'"], ['CALL_END', ')'], [')', ')']]
    else
      @token 'REGEX', regex
    @i += regex.length
    true

  # Used methods with no changes
  # ----------------------------

  # Matches a balanced group such as a single or double-quoted string. Pass in
  # a series of delimiters, all of which must be nested correctly within the
  # contents of the string. This method allows us to have strings within
  # interpolations within strings etc...
  #
  # `str` is scanned from its first character and each entry of `delimited` is
  # an `[open, close]` pair. Returns the balanced prefix of `str`, or `false`
  # when `str` does not begin with an opening delimiter. An unterminated group
  # throws an `Error`, unless `supress` is truthy, in which case `false` is
  # returned instead.
  balanced_string: (str, supress, delimited...) ->
    levels: []
    i: 0
    while i < str.length
      for pair in delimited
        [open, close]: pair
        if levels.length and starts str, '\\', i
          i += 1
          break
        else if levels.length and starts(str, close, i) and levels[levels.length - 1] is pair
          levels.pop()
          i += close.length - 1
          i += 1 unless levels.length
          break
        else if starts str, open, i
          levels.push(pair)
          i += open.length - 1
          break
      break unless levels.length
      i += 1
    if levels.length
      throw new Error "SyntaxError: Unterminated ${levels.pop()[0]} starting on line ${@line + 1}" unless supress
      return false
    return false if i is 0
    return str.substring(0, i)

  # Matches a token in which the passed delimiter pairs must be correctly
  # balanced (ie. strings, JS literals). Delegates to `balanced_string`, using
  # the current `@chunk` as the input.
  balanced_token: (supress, delimited...) ->
    @balanced_string @chunk, supress, delimited...

  # Expand variables and expressions inside double-quoted strings using
  # [ECMA Harmony's interpolation syntax](http://wiki.ecmascript.org/doku.php?id=strawman:string_interpolation)
  # for substitution of bare variables as well as arbitrary expressions.
  #
  #     "Hello $name."
  #     "Hello ${name.capitalize()}."
  #
  # If it encounters an interpolation, this method will recursively create a
  # new Lexer, tokenize the interpolated contents, and merge them into the
  # token stream.
  #
  # `str` is the string literal including its quotes. A string shorter than
  # three characters, or one that is not double-quoted, is emitted unchanged as
  # a single `STRING` token. Otherwise the literal pieces and interpolated
  # values are emitted in order, joined by `+` tokens.
  interpolate_string: (str) ->
    if str.length < 3 or not starts str, '"'
      @token 'STRING', str
    else
      lexer:    new Lexer()
      tokens:   []
      quote:    str.substring(0, 1)
      [i, pi]:  [1, 1]
      while i < str.length - 1
        if starts str, '\\', i
          i += 1
        else if match: str.substring(i).match INTERPOLATION
          [group, interp]: match
          interp: "this.${ interp.substring(1) }" if starts interp, '@'
          tokens.push ['STRING', "$quote${ str.substring(pi, i) }$quote"] if pi < i
          tokens.push ['IDENTIFIER', interp]
          i += group.length - 1
          pi: i + 1
        else if (expr: @balanced_string str.substring(i), supress: false, ['${', '}'])
          tokens.push ['STRING', "$quote${ str.substring(pi, i) }$quote"] if pi < i
          inner: expr.substring(2, expr.length - 1)
          if inner.length
            nested: lexer.tokenize "($inner)", {rewrite: no, line: @line}
            nested.pop()
            tokens.push ['TOKENS', nested]
          else
            tokens.push ['STRING', "$quote$quote"]
          i += expr.length - 1
          pi: i + 1
        i += 1
      tokens.push ['STRING', "$quote${ str.substring(pi, i) }$quote"] if pi < i and pi < str.length - 1
      for each, i in tokens
        if each[0] is 'TOKENS'
          @tokens: @tokens.concat each[1]
        else
          @token each[0], each[1]
        @token '+', '+' if i < tokens.length - 1

# Report whether `value` can be found anywhere in `list`.
include: (list, value) ->
  position: list.indexOf value
  position >= 0

# Report whether `literal` occurs in `string` at position `start` (the
# beginning of the string when `start` is omitted).
starts: (string, literal, start) ->
  offset: start or 0
  string.substring(offset, offset + literal.length) is literal

# Token tags after which a `/` is taken to be division and never the start of
# a regular expression.
#
# See: http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions
#
# The list is kept shorter than the one referenced above because method calls
# may be written without parentheses.
NOT_REGEX: [
  'NUMBER', 'REGEX',
  '++', '--',
  'FALSE', 'NULL', 'TRUE'
]


# Smoke test: lex a chunk that opens with a regex literal and print the
# resulting tokens, one `tag,value` pair per line. Output goes through the
# `sys` module required at the top of the file rather than a bare `puts`.
a: new PartialLexer(/\d+\s+/.toString() + ') was a Happy Bunny.')
a.regex_token()
sys.puts a.tokens.join '\n'
	sys: require('sys')
	Lexer: require('../lib/lexer').Lexer

	# Matches a bare `$name` (or `$@name`) interpolation anchored at the start of
	# a string. The capture group holds the name without the leading `$`.
	INTERPOLATION : /^\$([a-zA-Z_@]\w*)/

	# A cut-down lexer used as a scratch pad: it keeps only the state (`@i`,
	# `@line`, `@chunk`, `@tokens`) and the helper methods that `regex_token`
	# needs, so the regex handling can be exercised on its own.
	class PartialLexer

	  # Begin at the start of `chunk` with an empty token stream. `@line` is set up
	  # here because `balanced_string` reports `@line + 1` in its error message and
	  # `interpolate_string` hands `@line` to the nested lexer.
	  constructor: (chunk) ->
	    @i:      0
	    @line:   0
	    @chunk:  chunk
	    @tokens: []

	  # Append a `[tag, value]` pair to the token stream.
	  token: (tag, value) ->
	    @tokens.push([tag, value])

	  # Stub consulted by `regex_token` for its `NOT_REGEX` check; always `false`.
	  tag: -> false

	  # Work area
	  # ---------

	  # Matches regular expression literals. Lexing regular expressions is difficult
	  # to distinguish from division, so we borrow some basic heuristics from
	  # JavaScript and Ruby.
	  #
	  # A literal containing a `${ ... }` interpolation is rewritten into the tokens
	  # for `(new RegExp("...", 'flags'))`, with the pattern run through
	  # `interpolate_string`; any other literal becomes a single `REGEX` token.
	  # Returns `true` after consuming the literal (advancing `@i` past the whole
	  # literal, flags included) and `false` when no regex is matched.
	  #
	  # Notes on the interpolation branch:
	  #
	  # * both `indexOf` calls are parenthesised, because the bare form
	  #   `regex.indexOf '}' > regex.indexOf '${'` passes the comparison itself as
	  #   the argument and always produces a truthy -1;
	  # * the pattern and flags are cut apart at the last slash, so an escaped `\/`
	  #   inside the pattern is not mistaken for the closing delimiter;
	  # * `regex` itself is left untouched, so `@i` advances by the full literal;
	  # * every backslash in the pattern is doubled, not just the first one.
	  regex_token: ->
	    return false unless regex: @balanced_token supress: true, ['/', '/']
	    return false if regex.match /^\/\s+|\n/
	    return false if include NOT_REGEX, @tag()
	    flags: ['i', 'm', 'g', 'y']
	    regex += flags[index] while (index: flags.indexOf @chunk.substr regex.length, 1) >= 0
	    open_at: regex.indexOf('${')
	    if open_at >= 0 and regex.indexOf('}', open_at) > open_at
	      close_at:  regex.lastIndexOf('/')
	      pattern:   regex.substring(1, close_at)
	      modifiers: regex.substring(close_at + 1)
	      escaped:   pattern.replace(/\\/g, '\\\\')
	      @tokens: @tokens.concat [['(', '('], ['NEW', 'new'], ['IDENTIFIER', 'RegExp'], ['CALL_START', '(']]
	      @interpolate_string "\"$escaped\""
	      @tokens: @tokens.concat [[',', ','], ['STRING', "'$modifiers'"], ['CALL_END', ')'], [')', ')']]
	    else
	      @token 'REGEX', regex
	    @i += regex.length
	    true

	  # Used methods with no changes
	  # ----------------------------

	  # Matches a balanced group such as a single or double-quoted string. Pass in
	  # a series of delimiters, all of which must be nested correctly within the
	  # contents of the string. This method allows us to have strings within
	  # interpolations within strings etc...
	  #
	  # `str` is scanned from its first character and each entry of `delimited` is
	  # an `[open, close]` pair. Returns the balanced prefix of `str`, or `false`
	  # when `str` does not begin with an opening delimiter. An unterminated group
	  # throws an `Error`, unless `supress` is truthy, in which case `false` is
	  # returned instead.
	  balanced_string: (str, supress, delimited...) ->
	    levels: []
	    i: 0
	    while i < str.length
	      for pair in delimited
	        [open, close]: pair
	        if levels.length and starts str, '\\', i
	          i += 1
	          break
	        else if levels.length and starts(str, close, i) and levels[levels.length - 1] is pair
	          levels.pop()
	          i += close.length - 1
	          i += 1 unless levels.length
	          break
	        else if starts str, open, i
	          levels.push(pair)
	          i += open.length - 1
	          break
	      break unless levels.length
	      i += 1
	    if levels.length
	      throw new Error "SyntaxError: Unterminated ${levels.pop()[0]} starting on line ${@line + 1}" unless supress
	      return false
	    return false if i is 0
	    return str.substring(0, i)

	  # Matches a token in which the passed delimiter pairs must be correctly
	  # balanced (ie. strings, JS literals). Delegates to `balanced_string`, using
	  # the current `@chunk` as the input.
	  balanced_token: (supress, delimited...) ->
	    @balanced_string @chunk, supress, delimited...

	  # Expand variables and expressions inside double-quoted strings using
	  # [ECMA Harmony's interpolation syntax](http://wiki.ecmascript.org/doku.php?id=strawman:string_interpolation)
	  # for substitution of bare variables as well as arbitrary expressions.
	  #
	  #     "Hello $name."
	  #     "Hello ${name.capitalize()}."
	  #
	  # If it encounters an interpolation, this method will recursively create a
	  # new Lexer, tokenize the interpolated contents, and merge them into the
	  # token stream.
	  #
	  # `str` is the string literal including its quotes. A string shorter than
	  # three characters, or one that is not double-quoted, is emitted unchanged as
	  # a single `STRING` token. Otherwise the literal pieces and interpolated
	  # values are emitted in order, joined by `+` tokens.
	  interpolate_string: (str) ->
	    if str.length < 3 or not starts str, '"'
	      @token 'STRING', str
	    else
	      lexer:    new Lexer()
	      tokens:   []
	      quote:    str.substring(0, 1)
	      [i, pi]:  [1, 1]
	      while i < str.length - 1
	        if starts str, '\\', i
	          i += 1
	        else if match: str.substring(i).match INTERPOLATION
	          [group, interp]: match
	          interp: "this.${ interp.substring(1) }" if starts interp, '@'
	          tokens.push ['STRING', "$quote${ str.substring(pi, i) }$quote"] if pi < i
	          tokens.push ['IDENTIFIER', interp]
	          i += group.length - 1
	          pi: i + 1
	        else if (expr: @balanced_string str.substring(i), supress: false, ['${', '}'])
	          tokens.push ['STRING', "$quote${ str.substring(pi, i) }$quote"] if pi < i
	          inner: expr.substring(2, expr.length - 1)
	          if inner.length
	            nested: lexer.tokenize "($inner)", {rewrite: no, line: @line}
	            nested.pop()
	            tokens.push ['TOKENS', nested]
	          else
	            tokens.push ['STRING', "$quote$quote"]
	          i += expr.length - 1
	          pi: i + 1
	        i += 1
	      tokens.push ['STRING', "$quote${ str.substring(pi, i) }$quote"] if pi < i and pi < str.length - 1
	      for each, i in tokens
	        if each[0] is 'TOKENS'
	          @tokens: @tokens.concat each[1]
	        else
	          @token each[0], each[1]
	        @token '+', '+' if i < tokens.length - 1

	# Does a list include a value? Returns `true` when `list.indexOf` locates
	# `value`, `false` otherwise.
	include: (list, value) ->
	  list.indexOf(value) >= 0

	# Peek at a given string to see if it matches a sequence: returns `true` when
	# `literal` occurs in `string` at position `start` (the beginning of the
	# string when `start` is omitted).
	starts: (string, literal, start) ->
	  string.substring(start, (start or 0) + literal.length) is literal

	# Token tags after which a `/` is taken to be division and never the start of
	# a regular expression.
	#
	# See: http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions
	#
	# The list is kept shorter than the one referenced above because method calls
	# may be written without parentheses.
	NOT_REGEX: [
	  'NUMBER', 'REGEX',
	  '++', '--',
	  'FALSE', 'NULL', 'TRUE'
	]


	# Smoke test: lex a chunk that opens with a regex literal and print the
	# resulting tokens, one `tag,value` pair per line. Output goes through the
	# `sys` module required at the top of the file rather than a bare `puts`.
	a: new PartialLexer(/\d+\s+/.toString() + ') was a Happy Bunny.')
	a.regex_token()
	sys.puts a.tokens.join '\n'