# synth/truncation.rb

## truncation.rb
# Ported from a node script that seemed to do truncation well
# https://github.com/huang47/nodejs-html-truncate/blob/master/lib/truncate.js
# Truncates an HTML fragment to a maximum number of text characters while
# keeping the markup balanced: tags are copied through without counting
# towards the limit, and every tag still open at the cut is closed again.
#
# Recognised options (String or Symbol keys):
#   ellipsis         - text appended when something was cut off (default '...');
#                      pass false or '' to append nothing
#   truncateLastWord - when true (default) cut exactly at the limit; when false
#                      look for a nearby word break inside the slop window
#   slop             - size of the word-break search window
#                      (default: the smaller of DEFAULT_SLOP and max_length)
#   keepImageTag     - when falsy (default) the first <img> tag is removed from
#                      the result
#
# Usage:
#   Trunc.trunc('<p>Hello world</p>', 5) # => "<p>Hello...</p>"
class Trunc
  # Convenience wrapper: builds an instance and truncates in one call.
  #
  # @param str [String, nil] HTML to truncate (nil is treated as '')
  # @param max_length [Integer] maximum number of text characters to keep
  # @param options [Hash] see the class comment
  # @return [String] the truncated HTML
  def self.trunc(str, max_length, options = {})
    new(str, max_length, options).trunc
  end

  # The input exactly as it was handed to the constructor.
  attr_reader :str

  EMPTY_OBJECT = {}.freeze
  EMPTY_STRING = ''.freeze
  DEFAULT_TRUNCATE_SYMBOL = '...'.freeze
  DEFAULT_SLOP = 10
  EXCLUDE_TAGS = %w[img br].freeze # non-closed tags
  KEY_VALUE_REGEX = '([\\w|-]+\\s*=\\s*"[^"]*"\\s*)*'.freeze
  IS_CLOSE_REGEX = '\\s*\\/?\\s*'.freeze
  CLOSE_REGEX = '\\s*\\/\\s*'.freeze
  SELF_CLOSE_REGEX = Regexp.new('<\\/?\\w+\\s*' + KEY_VALUE_REGEX + CLOSE_REGEX + '>')
  HTML_TAG_REGEX = Regexp.new('<\\/?\\w+\\s*' + KEY_VALUE_REGEX + IS_CLOSE_REGEX + '>')
  URL_REGEX = Regexp.new('(((ftp|https?):\/\/)[\-\w@:%_\+.~#?,&\/\/=]+)|((mailto:)?[_.\w\-]+@([\w][\w\-]+\.)+[a-zA-Z]{2,3})') # Simple regexp
  IMAGE_TAG_REGEX = Regexp.new('<img\\s*' + KEY_VALUE_REGEX + IS_CLOSE_REGEX + '>')
  # Ruby regexps have no JavaScript-style "g" flag (Regexp.new rejects it on
  # Ruby 3.2+); successive matches are found by passing a start offset instead.
  WORD_BREAK_REGEX = /\W+/

  # @param string [String, nil] HTML to truncate
  # @param max_length [Integer] maximum number of text characters to keep
  # @param options [Hash] see the class comment
  def initialize(string, max_length, options = {})
    @str = @string = string
    @max_length = max_length
    @options = normalize_options(options)
    @debug = false
    @tag = nil
    @selfClose = nil
    reset_state
  end

  # Runs the truncation. Safe to call repeatedly: all working state is reset
  # at the start of every call.
  #
  # @return [String] the truncated HTML
  def trunc
    reset_state
    remaining = @string.to_s

    loop do
      tag_match = HTML_TAG_REGEX.match(remaining)

      if tag_match.nil?
        # Only plain text is left; nothing more is copied once the budget is spent.
        remaining = consume_tagless_tail(remaining) if @total < @max_length
        break
      end

      tag_text = tag_match[0]
      tag_at = tag_match.begin(0)

      if @total + tag_at > @max_length
        # The text before this tag exceeds the limit: cut inside it and stop.
        @content << leading(remaining, _getEndPosition(remaining, tag_at))
        break
      end

      @total += tag_at
      @content << tag_match.pre_match
      register_tag(tag_text)
      @content << (@selfClose ? @selfClose[0] : tag_text)
      remaining = tag_match.post_match
    end

    ellipsis = @options['ellipsis']
    @content << ellipsis.to_s if ellipsis && remaining.length > @max_length - @total
    @content << _dumpCloseTag(@items)
    @content = _removeImageTag(@content) unless @options['keepImageTag']
    @content
  end

  # Removes the first <img ...> tag from +string+.
  #
  # @param string [String]
  # @return [String] +string+ itself when it contains no image tag, otherwise
  #   a new string without the first one
  def _removeImageTag(string)
    match = IMAGE_TAG_REGEX.match(string)
    return string unless match

    match.pre_match + match.post_match
  end

  # Builds the closing tags for a stack of open tag names, innermost first.
  # Names listed in EXCLUDE_TAGS are skipped. +tags+ is not modified.
  #
  # @param tags [Array<String>] open tag names, outermost first
  # @return [String] e.g. "</b></p>" for ["p", "b"]
  def _dumpCloseTag(tags)
    log "DumpCloseTag: #{tags}"
    html = tags.reverse.reject { |tag| EXCLUDE_TAGS.include?(tag) }
               .map { |tag| '</' + tag + '>' }
               .join
    log "DumpCloseTagReturn: #{html}"
    html
  end

  # Extracts the tag name from an opening tag such as '<a href="x">'.
  #
  # @param string [String] the tag text, starting with '<'
  # @return [String] everything between '<' and the first whitespace or '>'
  # @raise [RuntimeError] when the text contains neither whitespace nor '>'
  def _getTag(string)
    log "getTag: #{string}"
    # Any whitespace ends the name, so attributes on a new line work too.
    tail = string.index(/[\s>]/)

    # TODO:
    # we have to figure out how to handle non-well-formatted HTML case
    raise 'HTML tag is not well-formed : ' + string if tail.nil?

    name = string[1, tail - 1]
    log "getTagReturn: #{name}"
    name
  end

  # Computes how many characters of +string+ may still be copied.
  #
  # With truncateLastWord enabled this is simply the remaining budget
  # (max_length minus the characters already counted). Otherwise a window of
  # `slop` characters around the hard limit is scanned for a non-word run and
  # the cut is moved there; a whitespace character directly before the cut is
  # dropped.
  #
  # @param string [String] the text not yet consumed
  # @param tailPos [Integer, nil] offset of the next tag in +string+, if any;
  #   it caps the search window
  # @return [Integer] number of leading characters of +string+ to keep
  def _getEndPosition(string, tailPos = nil)
    default_pos = @max_length - @total
    return default_pos if @options['truncateLastWord']

    slop = @options['slop']
    short = default_pos < slop
    slop_pos = short ? default_pos : slop - 1
    start_slice = short ? 0 : default_pos - slop
    # The JavaScript original treats a tail position of 0 as "not given".
    tail = tailPos if tailPos && tailPos.positive?
    end_slice = tail || (default_pos + slop)
    # Range form: the second bound is an end index, not a length.
    window = string[start_slice...end_slice].to_s

    position =
      if tail && window.length <= tail
        window.length
      else
        natural_break_position(window, default_pos, slop_pos)
      end

    # Guard position 0: string[-1] would look at the LAST character in Ruby.
    position -= 1 if position.positive? && string[position - 1].to_s.match?(/\s/)
    position
  end

  # Prints +msg+ when @debug is set (it is false unless changed externally).
  def log(msg)
    puts msg if @debug
  end

  private

  # Copies the options into a String-keyed Hash and fills in the defaults,
  # so lookups work for both Symbol and String keys without extra libraries.
  def normalize_options(options)
    normalized = (options || EMPTY_OBJECT).to_h.each_with_object({}) do |(key, value), memo|
      memo[key.to_s] = value
    end
    normalized['ellipsis'] = DEFAULT_TRUNCATE_SYMBOL if normalized['ellipsis'].nil?
    normalized['truncateLastWord'] = true if normalized['truncateLastWord'].nil?
    normalized['slop'] = [DEFAULT_SLOP, @max_length].min if normalized['slop'].nil?
    normalized
  end

  # Clears the per-run working state.
  def reset_state
    @items = []    # stack for saving tags
    @total = 0     # record how many characters we traced so far
    @content = +'' # truncated text storage
    @selfClose = nil
  end

  # First +count+ characters of +text+; '' for a non-positive count.
  def leading(text, count)
    count.positive? ? text[0, count] : ''
  end

  # Updates the open-tag stack and @selfClose for one matched tag.
  def register_tag(tag_text)
    if tag_text[1] == '/'
      log 'Popping items'
      # move out open tag
      @items.pop
      @selfClose = nil
      return
    end

    @selfClose = SELF_CLOSE_REGEX.match(tag_text)
    return if @selfClose

    @tag = _getTag(tag_text)
    # Void tags written without a slash (<br>, <img ...>) never get a closing
    # tag; pushing them would make the next closing tag pop the wrong entry.
    return if EXCLUDE_TAGS.include?(@tag.downcase)

    log "Pushing item: #{@tag}"
    @items.push(@tag)
  end

  # Handles the tag-free remainder of the input and returns what is left
  # unconsumed. Without a URL starting before max_length the text is cut at
  # the end position. Otherwise text is copied up to the end of each URL
  # (kept length is reduced by the characters already counted, as in the
  # script this was ported from) and anything after the last URL is dropped.
  def consume_tagless_tail(text)
    url = URL_REGEX.match(text)
    if url.nil? || url.begin(0) >= @max_length
      @content << leading(text, _getEndPosition(text))
      return text
    end

    while url
      @content << leading(text, url.end(0) - @total)
      text = url.post_match
      url = URL_REGEX.match(text)
    end
    text
  end

  # Scans +window+ for non-word runs and returns the cut position implied by
  # the first run at or after +slop_pos+, or by the last run before it.
  def natural_break_position(window, default_pos, slop_pos)
    position = default_pos
    cursor = 0
    while (word_break = WORD_BREAK_REGEX.match(window, cursor))
      break_at = word_break.begin(0)
      if break_at < slop_pos
        # a natural break position before the hard break position
        position = default_pos - (slop_pos - break_at)
        # keep seeking closer to the hard break position
        # unless a natural break is at position 0
        break if break_at.zero? && default_pos <= 1
      elsif break_at == slop_pos
        # a natural break position exactly at the hard break position
        position = default_pos
        break
      else
        # a natural break position after the hard break position
        position = default_pos + (break_at - slop_pos)
        break
      end
      cursor = word_break.end(0)
    end
    position
  end
end
	# Ported from a node script that seemed to do truncation well
	# https://github.com/huang47/nodejs-html-truncate/blob/master/lib/truncate.js
	# Truncates an HTML fragment to a maximum number of text characters while
	# keeping the markup balanced: tags are copied through without counting
	# towards the limit, and every tag still open at the cut is closed again.
	#
	# Recognised options (String or Symbol keys):
	#   ellipsis         - text appended when something was cut off (default '...');
	#                      pass false or '' to append nothing
	#   truncateLastWord - when true (default) cut exactly at the limit; when false
	#                      look for a nearby word break inside the slop window
	#   slop             - size of the word-break search window
	#                      (default: the smaller of DEFAULT_SLOP and max_length)
	#   keepImageTag     - when falsy (default) the first <img> tag is removed from
	#                      the result
	#
	# Usage:
	#   Trunc.trunc('<p>Hello world</p>', 5) # => "<p>Hello...</p>"
	class Trunc
	  # Convenience wrapper: builds an instance and truncates in one call.
	  #
	  # @param str [String, nil] HTML to truncate (nil is treated as '')
	  # @param max_length [Integer] maximum number of text characters to keep
	  # @param options [Hash] see the class comment
	  # @return [String] the truncated HTML
	  def self.trunc(str, max_length, options = {})
	    new(str, max_length, options).trunc
	  end

	  # The input exactly as it was handed to the constructor.
	  attr_reader :str

	  EMPTY_OBJECT = {}.freeze
	  EMPTY_STRING = ''.freeze
	  DEFAULT_TRUNCATE_SYMBOL = '...'.freeze
	  DEFAULT_SLOP = 10
	  EXCLUDE_TAGS = %w[img br].freeze # non-closed tags
	  # The "*" quantifiers and unescaped "|" below are required; without them
	  # the patterns only match tags padded with exactly one space.
	  KEY_VALUE_REGEX = '([\\w|-]+\\s*=\\s*"[^"]*"\\s*)*'.freeze
	  IS_CLOSE_REGEX = '\\s*\\/?\\s*'.freeze
	  CLOSE_REGEX = '\\s*\\/\\s*'.freeze
	  SELF_CLOSE_REGEX = Regexp.new('<\\/?\\w+\\s*' + KEY_VALUE_REGEX + CLOSE_REGEX + '>')
	  HTML_TAG_REGEX = Regexp.new('<\\/?\\w+\\s*' + KEY_VALUE_REGEX + IS_CLOSE_REGEX + '>')
	  URL_REGEX = Regexp.new('(((ftp|https?):\/\/)[\-\w@:%_\+.~#?,&\/\/=]+)|((mailto:)?[_.\w\-]+@([\w][\w\-]+\.)+[a-zA-Z]{2,3})') # Simple regexp
	  IMAGE_TAG_REGEX = Regexp.new('<img\\s*' + KEY_VALUE_REGEX + IS_CLOSE_REGEX + '>')
	  # Ruby regexps have no JavaScript-style "g" flag (Regexp.new rejects it on
	  # Ruby 3.2+); successive matches are found by passing a start offset instead.
	  WORD_BREAK_REGEX = /\W+/

	  # @param string [String, nil] HTML to truncate
	  # @param max_length [Integer] maximum number of text characters to keep
	  # @param options [Hash] see the class comment
	  def initialize(string, max_length, options = {})
	    @str = @string = string
	    @max_length = max_length
	    @options = normalize_options(options)
	    @debug = false
	    @tag = nil
	    @selfClose = nil
	    reset_state
	  end

	  # Runs the truncation. Safe to call repeatedly: all working state is reset
	  # at the start of every call.
	  #
	  # @return [String] the truncated HTML
	  def trunc
	    reset_state
	    remaining = @string.to_s

	    loop do
	      tag_match = HTML_TAG_REGEX.match(remaining)

	      if tag_match.nil?
	        # Only plain text is left; nothing more is copied once the budget is spent.
	        remaining = consume_tagless_tail(remaining) if @total < @max_length
	        break
	      end

	      tag_text = tag_match[0]
	      tag_at = tag_match.begin(0)

	      if @total + tag_at > @max_length
	        # The text before this tag exceeds the limit: cut inside it and stop.
	        @content << leading(remaining, _getEndPosition(remaining, tag_at))
	        break
	      end

	      @total += tag_at
	      @content << tag_match.pre_match
	      register_tag(tag_text)
	      @content << (@selfClose ? @selfClose[0] : tag_text)
	      remaining = tag_match.post_match
	    end

	    ellipsis = @options['ellipsis']
	    @content << ellipsis.to_s if ellipsis && remaining.length > @max_length - @total
	    @content << _dumpCloseTag(@items)
	    @content = _removeImageTag(@content) unless @options['keepImageTag']
	    @content
	  end

	  # Removes the first <img ...> tag from +string+.
	  #
	  # @param string [String]
	  # @return [String] +string+ itself when it contains no image tag, otherwise
	  #   a new string without the first one
	  def _removeImageTag(string)
	    match = IMAGE_TAG_REGEX.match(string)
	    return string unless match

	    match.pre_match + match.post_match
	  end

	  # Builds the closing tags for a stack of open tag names, innermost first.
	  # Names listed in EXCLUDE_TAGS are skipped. +tags+ is not modified.
	  #
	  # @param tags [Array<String>] open tag names, outermost first
	  # @return [String] e.g. "</b></p>" for ["p", "b"]
	  def _dumpCloseTag(tags)
	    log "DumpCloseTag: #{tags}"
	    html = tags.reverse.reject { |tag| EXCLUDE_TAGS.include?(tag) }
	               .map { |tag| '</' + tag + '>' }
	               .join
	    log "DumpCloseTagReturn: #{html}"
	    html
	  end

	  # Extracts the tag name from an opening tag such as '<a href="x">'.
	  #
	  # @param string [String] the tag text, starting with '<'
	  # @return [String] everything between '<' and the first whitespace or '>'
	  # @raise [RuntimeError] when the text contains neither whitespace nor '>'
	  def _getTag(string)
	    log "getTag: #{string}"
	    # Any whitespace ends the name, so attributes on a new line work too.
	    tail = string.index(/[\s>]/)

	    # TODO:
	    # we have to figure out how to handle non-well-formatted HTML case
	    raise 'HTML tag is not well-formed : ' + string if tail.nil?

	    name = string[1, tail - 1]
	    log "getTagReturn: #{name}"
	    name
	  end

	  # Computes how many characters of +string+ may still be copied.
	  #
	  # With truncateLastWord enabled this is simply the remaining budget
	  # (max_length minus the characters already counted). Otherwise a window of
	  # `slop` characters around the hard limit is scanned for a non-word run and
	  # the cut is moved there; a whitespace character directly before the cut is
	  # dropped.
	  #
	  # @param string [String] the text not yet consumed
	  # @param tailPos [Integer, nil] offset of the next tag in +string+, if any;
	  #   it caps the search window
	  # @return [Integer] number of leading characters of +string+ to keep
	  def _getEndPosition(string, tailPos = nil)
	    default_pos = @max_length - @total
	    return default_pos if @options['truncateLastWord']

	    slop = @options['slop']
	    short = default_pos < slop
	    slop_pos = short ? default_pos : slop - 1
	    start_slice = short ? 0 : default_pos - slop
	    # The JavaScript original treats a tail position of 0 as "not given".
	    tail = tailPos if tailPos && tailPos.positive?
	    end_slice = tail || (default_pos + slop)
	    # Range form: the second bound is an end index, not a length.
	    window = string[start_slice...end_slice].to_s

	    position =
	      if tail && window.length <= tail
	        window.length
	      else
	        natural_break_position(window, default_pos, slop_pos)
	      end

	    # Guard position 0: string[-1] would look at the LAST character in Ruby.
	    position -= 1 if position.positive? && string[position - 1].to_s.match?(/\s/)
	    position
	  end

	  # Prints +msg+ when @debug is set (it is false unless changed externally).
	  def log(msg)
	    puts msg if @debug
	  end

	  private

	  # Copies the options into a String-keyed Hash and fills in the defaults,
	  # so lookups work for both Symbol and String keys without extra libraries.
	  def normalize_options(options)
	    normalized = (options || EMPTY_OBJECT).to_h.each_with_object({}) do |(key, value), memo|
	      memo[key.to_s] = value
	    end
	    normalized['ellipsis'] = DEFAULT_TRUNCATE_SYMBOL if normalized['ellipsis'].nil?
	    normalized['truncateLastWord'] = true if normalized['truncateLastWord'].nil?
	    normalized['slop'] = [DEFAULT_SLOP, @max_length].min if normalized['slop'].nil?
	    normalized
	  end

	  # Clears the per-run working state.
	  def reset_state
	    @items = []    # stack for saving tags
	    @total = 0     # record how many characters we traced so far
	    @content = +'' # truncated text storage
	    @selfClose = nil
	  end

	  # First +count+ characters of +text+; '' for a non-positive count.
	  def leading(text, count)
	    count.positive? ? text[0, count] : ''
	  end

	  # Updates the open-tag stack and @selfClose for one matched tag.
	  def register_tag(tag_text)
	    if tag_text[1] == '/'
	      log 'Popping items'
	      # move out open tag
	      @items.pop
	      @selfClose = nil
	      return
	    end

	    @selfClose = SELF_CLOSE_REGEX.match(tag_text)
	    return if @selfClose

	    @tag = _getTag(tag_text)
	    # Void tags written without a slash (<br>, <img ...>) never get a closing
	    # tag; pushing them would make the next closing tag pop the wrong entry.
	    return if EXCLUDE_TAGS.include?(@tag.downcase)

	    log "Pushing item: #{@tag}"
	    @items.push(@tag)
	  end

	  # Handles the tag-free remainder of the input and returns what is left
	  # unconsumed. Without a URL starting before max_length the text is cut at
	  # the end position. Otherwise text is copied up to the end of each URL
	  # (kept length is reduced by the characters already counted, as in the
	  # script this was ported from) and anything after the last URL is dropped.
	  def consume_tagless_tail(text)
	    url = URL_REGEX.match(text)
	    if url.nil? || url.begin(0) >= @max_length
	      @content << leading(text, _getEndPosition(text))
	      return text
	    end

	    while url
	      @content << leading(text, url.end(0) - @total)
	      text = url.post_match
	      url = URL_REGEX.match(text)
	    end
	    text
	  end

	  # Scans +window+ for non-word runs and returns the cut position implied by
	  # the first run at or after +slop_pos+, or by the last run before it.
	  def natural_break_position(window, default_pos, slop_pos)
	    position = default_pos
	    cursor = 0
	    while (word_break = WORD_BREAK_REGEX.match(window, cursor))
	      break_at = word_break.begin(0)
	      if break_at < slop_pos
	        # a natural break position before the hard break position
	        position = default_pos - (slop_pos - break_at)
	        # keep seeking closer to the hard break position
	        # unless a natural break is at position 0
	        break if break_at.zero? && default_pos <= 1
	      elsif break_at == slop_pos
	        # a natural break position exactly at the hard break position
	        position = default_pos
	        break
	      else
	        # a natural break position after the hard break position
	        position = default_pos + (break_at - slop_pos)
	        break
	      end
	      cursor = word_break.end(0)
	    end
	    position
	  end
	end