Skip to content

Instantly share code, notes, and snippets.

@synth
Created September 4, 2022 01:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save synth/715fd15222395f0c6aae9ec65d69df05 to your computer and use it in GitHub Desktop.
Save synth/715fd15222395f0c6aae9ec65d69df05 to your computer and use it in GitHub Desktop.
Ruby port of truncate.js (from nodejs-html-truncate)
# Ported from a node script that seemed to do truncation well
# https://github.com/huang47/nodejs-html-truncate/blob/master/lib/truncate.js
#
# Truncates a string that may contain HTML markup. Text between tags counts
# towards `max_length`; the tags themselves are copied through without being
# counted, and every tag still open at the cut point is closed again.
#
# Options (String or Symbol keys are both accepted):
#   ellipsis         - text appended when input was left over (default '...');
#                      pass false or '' to append nothing
#   truncateLastWord - when false, look for a word break near the cut point
#                      instead of cutting at exactly `max_length` (default true)
#   slop             - how far around the cut point to look for a word break
#                      (default: the smaller of DEFAULT_SLOP and max_length)
#   keepImageTag     - when truthy, leave the first <img> tag in the result
#   debug            - when truthy, trace tag handling with `puts`
# `truncate_last_word` and `keep_image_tag` are accepted as aliases.
#
# @example
#   Trunc.trunc('<p>Hello world</p>', 5) # => "<p>Hello...</p>"
class Trunc
  EMPTY_OBJECT = {}.freeze
  EMPTY_STRING = ''.freeze
  DEFAULT_TRUNCATE_SYMBOL = '...'.freeze
  DEFAULT_SLOP = 10
  EXCLUDE_TAGS = %w[img br].freeze # non-closed tags
  KEY_VALUE_REGEX = '([\\w|-]+\\s*=\\s*"[^"]*"\\s*)*'.freeze
  IS_CLOSE_REGEX = '\\s*\\/?\\s*'.freeze
  CLOSE_REGEX = '\\s*\\/\\s*'.freeze
  SELF_CLOSE_REGEX = Regexp.new('<\\/?\\w+\\s*' + KEY_VALUE_REGEX + CLOSE_REGEX + '>')
  HTML_TAG_REGEX = Regexp.new('<\\/?\\w+\\s*' + KEY_VALUE_REGEX + IS_CLOSE_REGEX + '>')
  URL_REGEX = Regexp.new('(((ftp|https?):\/\/)[\-\w@:%_\+.~#?,&\/\/=]+)|((mailto:)?[_.\w\-]+@([\w][\w\-]+\.)+[a-zA-Z]{2,3})') # Simple regexp
  IMAGE_TAG_REGEX = Regexp.new('<img\\s*' + KEY_VALUE_REGEX + IS_CLOSE_REGEX + '>')
  # Ruby has no "g" regexp flag; successive matches are walked explicitly in
  # #_getEndPosition instead.
  WORD_BREAK_REGEX = /\W+/
  # snake_case spellings accepted for the camelCase option names.
  OPTION_ALIASES = {
    truncate_last_word: :truncateLastWord,
    keep_image_tag: :keepImageTag
  }.freeze

  # Convenience wrapper: builds an instance and truncates in one call.
  #
  # @param str [String] text, possibly containing HTML tags
  # @param max_length [Integer] number of text characters to keep
  # @param options [Hash] see the class comment
  # @return [String] the truncated markup
  def self.trunc(str, max_length, options = {})
    new(str, max_length, options).trunc
  end

  # @param string [String] text, possibly containing HTML tags
  # @param max_length [Integer] number of text characters to keep
  # @param options [Hash, nil] see the class comment
  def initialize(string, max_length, options = {})
    @string = string
    @max_length = max_length
    @options = normalize_options(options)
    @debug = @options[:debug] ? true : false
    reset_state
  end

  # @return [String] the source string this instance was built with
  def str
    @string
  end

  # Runs the truncation. State is reset on entry, so calling this repeatedly on
  # one instance returns the same result each time.
  #
  # @return [String] the truncated markup
  def trunc
    reset_state
    remaining = @string.to_s

    loop do
      tag_match = HTML_TAG_REGEX.match(remaining)
      unless tag_match
        # Only plain text (and possibly URLs) is left.
        remaining = consume_text_tail(remaining)
        break
      end

      tag = tag_match[0]
      offset = tag_match.begin(0)

      if @total + offset > @max_length
        # exceed given `max_length`, dump everything to clear stack
        @content << remaining[0, _getEndPosition(remaining, offset)]
        break
      end

      @total += offset
      @content << remaining[0, offset]
      @content << register_tag(tag)
      remaining = remaining[(offset + tag.length)..-1]
    end

    ellipsis = @options[:ellipsis]
    @content << ellipsis.to_s if ellipsis && remaining.length > @max_length - @total
    @content << _dumpCloseTag(@items)
    @content = _removeImageTag(@content) unless @options[:keepImageTag]
    @content
  end

  # Removes the first <img ...> tag from +string+, if there is one.
  #
  # @param string [String]
  # @return [String] +string+ itself when it contains no image tag
  def _removeImageTag(string)
    match = IMAGE_TAG_REGEX.match(string)
    return string unless match

    match.pre_match + match.post_match
  end

  # Builds the closing tags for every still-open tag, innermost first.
  # Tags listed in EXCLUDE_TAGS get no closing tag.
  #
  # @param tags [Array<String>] open tag names, outermost first
  # @return [String]
  def _dumpCloseTag(tags)
    log "DumpCloseTag: #{tags}"
    html = tags.reverse
               .reject { |tag| EXCLUDE_TAGS.include?(tag) }
               .map { |tag| "</#{tag}>" }
               .join
    log "DumpCloseTagReturn: #{html}"
    html
  end

  # Extracts the tag name from an opening tag such as `<div class="x">`.
  #
  # @param string [String] a single opening tag
  # @return [String] the tag name
  # @raise [RuntimeError] when the tag has neither whitespace nor a closing '>'
  def _getTag(string)
    log "getTag: #{string}"
    # TODO:
    # we have to figure out how to handle non-well-formatted HTML case
    # The name ends at the first whitespace of any kind, or at '>'.
    tail = string.index(/[\s>]/)
    raise 'HTML tag is not well-formed : ' + string if tail.nil?

    name = string[1, tail - 1]
    log "getTagReturn: #{name}"
    name
  end

  # Works out how many characters of +string+ may still be emitted.
  #
  # With `truncateLastWord` enabled this is simply the remaining budget.
  # Otherwise the position is moved to a word break found in the window of
  # `slop` characters around the hard cut, and one trailing whitespace
  # character is dropped.
  #
  # @param string [String] the text that is about to be cut
  # @param tailPos [Integer, nil] index of the next tag in +string+, if any
  # @return [Integer] a non-negative length
  def _getEndPosition(string, tailPos = nil)
    default_pos = @max_length - @total
    return [default_pos, 0].max if @options[:truncateLastWord]

    slop = @options[:slop]
    short = default_pos < slop
    slop_pos = short ? default_pos : slop - 1
    start_slice = short ? 0 : default_pos - slop
    # A zero tail position is treated as "not given", as in the JS original.
    tail = tailPos && tailPos.nonzero?
    end_slice = tail || (default_pos + slop)
    position = default_pos

    # JS slice(start, end) is an exclusive range, not (start, length).
    substr = string[start_slice...end_slice] || ''

    if tail && substr.length <= tail
      position = substr.length
    else
      search_from = 0
      while (word_break = substr.match(WORD_BREAK_REGEX, search_from))
        break_at = word_break.begin(0)
        if break_at < slop_pos
          # a natural break position before the hard break position
          position = default_pos - (slop_pos - break_at)
          # keep seeking closer to the hard break position
          # unless a natural break is at position 0
          break if break_at.zero? && default_pos <= 1
        elsif break_at == slop_pos
          # a natural break position exactly at the hard break position
          position = default_pos
          break # seek no more
        else
          # a natural break position after the hard break position
          position = default_pos + (break_at - slop_pos)
          break # seek no more
        end
        search_from = word_break.end(0)
      end
    end

    # Guard position >= 1: a negative index would wrap around in Ruby.
    last_char = position >= 1 ? string[position - 1] : nil
    position -= 1 if last_char && last_char.match?(/\s/)
    [position, 0].max
  end

  # Prints +msg+ when the `debug` option was given.
  def log(msg)
    puts msg if @debug
  end

  private

  # Clears the per-run accumulators.
  def reset_state
    @items = []    # stack for saving tags
    @total = 0     # record how many characters we traced so far
    @content = +'' # truncated text storage
  end

  # Converts the caller's options to a Symbol-keyed Hash and fills in defaults.
  def normalize_options(options)
    opts = {}
    (options || EMPTY_OBJECT).to_h.each do |key, value|
      name = key.to_sym
      opts[OPTION_ALIASES.fetch(name, name)] = value
    end
    opts[:ellipsis] = DEFAULT_TRUNCATE_SYMBOL if opts[:ellipsis].nil?
    opts[:truncateLastWord] = true if opts[:truncateLastWord].nil?
    opts[:slop] = [DEFAULT_SLOP, @max_length].min if opts[:slop].nil?
    opts
  end

  # Updates the open-tag stack for +tag+ and returns the text to emit for it.
  def register_tag(tag)
    if tag[1] == '/'
      log "Popping items"
      # move out open tag
      @items.pop
      return tag
    end

    self_close = SELF_CLOSE_REGEX.match(tag)
    return self_close[0] if self_close

    name = _getTag(tag)
    log "Pushing item: #{name}"
    @items.push(name)
    tag
  end

  # Emits the tag-free remainder of the input and returns whatever part of it
  # was left unconsumed (used afterwards to decide on the ellipsis).
  def consume_text_tail(text)
    return text if @total >= @max_length

    url = URL_REGEX.match(text)
    if url.nil? || url.begin(0) >= @max_length
      @content << text[0, _getEndPosition(text)]
      return text
    end

    # A URL starts inside the budget: copy through to the end of each URL
    # rather than cutting it.
    while url
      url_end = url.end(0)
      # NOTE(review): the `- @total` offset is carried over from the original
      # expression; confirm it is intended. Clamped so the length is never
      # negative.
      @content << text[0, [url_end - @total, 0].max]
      text = text[url_end..-1]
      url = URL_REGEX.match(text)
    end
    text
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment