elpaso/gist:9227318

## gistfile1.py

# Precompiled patterns used by the text helpers in this module.

# Either a whole tag (group 1 empty) or a run of words/entities (group 1 set).
re_words = re.compile(
    r'<.*?>|((?:\w[-\w]*|&.*?;)+)',
    re.UNICODE | re.DOTALL,
)
# Either a whole tag (group 1 empty) or one character, newlines included.
re_chars = re.compile(
    r'<.*?>|(.)',
    re.UNICODE | re.DOTALL,
)
# Groups: 1 = '/' of an end tag, 2 = tag name, 3 = trailing '/' of <x />.
re_tag = re.compile(
    r'<(/)?([^ ]+?)(?:(\s*/)| .*?)?>',
    re.DOTALL,
)
# Windows and old-Mac line endings; used in normalize_newlines.
re_newlines = re.compile(r'\r\n|\r')
# An uppercase letter that follows a lowercase one, or an uppercase letter
# that is followed by neither another uppercase letter nor the end of input.
re_camel_case = re.compile(
    r'(((?<=[a-z])[A-Z])|([A-Z](?![A-Z]|$)))'
)

def add_truncation_text(text, truncate=None):
    """
    Combines ``text`` with the truncation marker ``truncate``.

    When ``truncate`` is None the template is looked up through pgettext.
    A template containing ``%(truncated_text)s`` has ``text`` substituted
    into it; any other value is appended to ``text``, unless ``text``
    already ends with it, in which case ``text`` is returned as is.
    """
    placeholder = '%(truncated_text)s'
    if truncate is None:
        truncate = pgettext(
            'String to return when truncating text',
            '%(truncated_text)s...')
    marker = force_text(truncate)
    if placeholder in marker:
        # Template form: the marker decides where the text goes.
        return marker % {'truncated_text': text}
    # Plain suffix form: append it, but never twice in a row.
    already_marked = text.endswith(marker)
    return text if already_marked else '%s%s' % (text, marker)


def html_words(text, length, truncate):
    """
    Inserts truncation text into HTML after a certain number of words (not
    counting tags and comments).

    If ``text`` holds more than ``length`` words, the elements still open
    after the ``length``-th word are closed, the truncation text is added,
    the same elements are reopened with their original start tags
    (attributes included) and the rest of ``text`` follows. No part of the
    input is dropped.

    Newlines in the HTML are preserved.

    Parameters:
        text: the HTML string to process.
        length: number of words that precede the truncation text.
        truncate: truncation text or template; it is handed to
            ``add_truncation_text`` together with an empty string.

    Returns:
        ``''`` if ``length`` is zero or negative, ``text`` unchanged if it
        holds at most ``length`` words, otherwise the HTML with the
        truncation text inserted.
    """
    if length <= 0:
        return ''
    html4_singlets = (
        'br', 'col', 'link', 'base', 'img',
        'param', 'area', 'hr', 'input'
    )
    # Count non-HTML words and keep note of open tags
    pos = 0
    end_text_pos = 0
    words = 0
    # Innermost element first. Each entry is a pair of
    # (lowercased tag name, start tag exactly as written in ``text``).
    open_tags = []
    while words <= length:
        m = re_words.search(text, pos)
        if not m:
            # Checked through whole string
            break
        pos = m.end(0)
        if m.group(1):
            # It's an actual non-HTML word
            words += 1
            if words == length:
                end_text_pos = pos
            continue
        if end_text_pos:
            # The insertion point sits directly after the length-th word;
            # tags found after it are not tracked.
            continue
        start_tag = m.group(0)
        tag = re_tag.match(start_tag)
        if not tag:
            # Don't worry about non tags
            continue
        closing_tag, tagname, self_closing = tag.groups()
        # Element names are always case-insensitive
        tagname = tagname.lower()
        if tagname.startswith(('!', '?')):
            # Comments, doctypes and processing instructions are not
            # elements: they must never be closed or reopened.
            continue
        if self_closing or tagname in html4_singlets:
            continue
        if not closing_tag:
            # Add it to the start of the open tags list
            open_tags.insert(0, (tagname, start_tag))
            continue
        # Check for match in open tags list
        open_names = [name for name, markup in open_tags]
        if tagname in open_names:
            # SGML: An end tag closes, back to the matching start tag,
            # all unclosed intervening start tags with omitted end tags
            open_tags = open_tags[open_names.index(tagname) + 1:]
    if words <= length:
        # Don't try to close tags if we don't need to truncate
        return text
    truncate_text = add_truncation_text('', truncate)
    pieces = [text[:end_text_pos]]
    # Close any tags still open, innermost first
    pieces.extend('</%s>' % name for name, markup in open_tags)
    if truncate_text:
        pieces.append(truncate_text)
    # Reopen the same elements, outermost first, with their original markup
    pieces.extend(markup for name, markup in reversed(open_tags))
    # Add remaining part of the string
    pieces.append(text[end_text_pos:])
    return ''.join(pieces)

	# Set up regular expressions
	# Either a whole tag (group 1 empty) or a run of words/entities (group 1 set).
	re_words = re.compile(r'<.*?>|((?:\w[-\w]*|&.*?;)+)', re.U | re.S)
	# Either a whole tag (group 1 empty) or one character, newlines included.
	re_chars = re.compile(r'<.*?>|(.)', re.U | re.S)
	# Groups: 1 = '/' of an end tag, 2 = tag name, 3 = trailing '/' of <x />.
	re_tag = re.compile(r'<(/)?([^ ]+?)(?:(\s*/)| .*?)?>', re.S)
	re_newlines = re.compile(r'\r\n|\r')  # Used in normalize_newlines
	# An uppercase letter that follows a lowercase one, or an uppercase letter
	# that is followed by neither another uppercase letter nor the end of input.
	re_camel_case = re.compile(r'(((?<=[a-z])[A-Z])|([A-Z](?![A-Z]|$)))')

	def add_truncation_text(text, truncate=None):
	    """
	    Combines ``text`` with the truncation marker ``truncate``.

	    When ``truncate`` is None the template is looked up through pgettext.
	    A template containing ``%(truncated_text)s`` has ``text`` substituted
	    into it; any other value is appended to ``text``, unless ``text``
	    already ends with it, in which case ``text`` is returned as is.
	    """
	    if truncate is None:
	        truncate = pgettext(
	            'String to return when truncating text',
	            '%(truncated_text)s...')
	    truncate = force_text(truncate)
	    if '%(truncated_text)s' in truncate:
	        return truncate % {'truncated_text': text}
	    # The truncation text didn't contain the %(truncated_text)s string
	    # replacement argument so just append it to the text.
	    if text.endswith(truncate):
	        # But don't append the truncation text if the current text already
	        # ends in this.
	        return text
	    return '%s%s' % (text, truncate)


	def html_words(text, length, truncate):
	    """
	    Inserts truncation text into HTML after a certain number of words (not
	    counting tags and comments).

	    If ``text`` holds more than ``length`` words, the elements still open
	    after the ``length``-th word are closed, the truncation text is added,
	    the same elements are reopened with their original start tags
	    (attributes included) and the rest of ``text`` follows. No part of the
	    input is dropped.

	    Newlines in the HTML are preserved.

	    Parameters:
	        text: the HTML string to process.
	        length: number of words that precede the truncation text.
	        truncate: truncation text or template; it is handed to
	            ``add_truncation_text`` together with an empty string.

	    Returns:
	        ``''`` if ``length`` is zero or negative, ``text`` unchanged if it
	        holds at most ``length`` words, otherwise the HTML with the
	        truncation text inserted.
	    """
	    if length <= 0:
	        return ''
	    html4_singlets = (
	        'br', 'col', 'link', 'base', 'img',
	        'param', 'area', 'hr', 'input'
	    )
	    # Count non-HTML words and keep note of open tags
	    pos = 0
	    end_text_pos = 0
	    words = 0
	    # Innermost element first. Each entry is a pair of
	    # (lowercased tag name, start tag exactly as written in ``text``).
	    open_tags = []
	    while words <= length:
	        m = re_words.search(text, pos)
	        if not m:
	            # Checked through whole string
	            break
	        pos = m.end(0)
	        if m.group(1):
	            # It's an actual non-HTML word
	            words += 1
	            if words == length:
	                end_text_pos = pos
	            continue
	        if end_text_pos:
	            # The insertion point sits directly after the length-th word;
	            # tags found after it are not tracked.
	            continue
	        start_tag = m.group(0)
	        tag = re_tag.match(start_tag)
	        if not tag:
	            # Don't worry about non tags
	            continue
	        closing_tag, tagname, self_closing = tag.groups()
	        # Element names are always case-insensitive
	        tagname = tagname.lower()
	        if tagname.startswith(('!', '?')):
	            # Comments, doctypes and processing instructions are not
	            # elements: they must never be closed or reopened.
	            continue
	        if self_closing or tagname in html4_singlets:
	            continue
	        if not closing_tag:
	            # Add it to the start of the open tags list
	            open_tags.insert(0, (tagname, start_tag))
	            continue
	        # Check for match in open tags list
	        open_names = [name for name, markup in open_tags]
	        if tagname in open_names:
	            # SGML: An end tag closes, back to the matching start tag,
	            # all unclosed intervening start tags with omitted end tags
	            open_tags = open_tags[open_names.index(tagname) + 1:]
	    if words <= length:
	        # Don't try to close tags if we don't need to truncate
	        return text
	    truncate_text = add_truncation_text('', truncate)
	    pieces = [text[:end_text_pos]]
	    # Close any tags still open, innermost first
	    pieces.extend('</%s>' % name for name, markup in open_tags)
	    if truncate_text:
	        pieces.append(truncate_text)
	    # Reopen the same elements, outermost first, with their original markup
	    pieces.extend(markup for name, markup in reversed(open_tags))
	    # Add remaining part of the string
	    pieces.append(text[end_text_pos:])
	    return ''.join(pieces)