Skip to content

Instantly share code, notes, and snippets.

@elpaso
Created February 26, 2014 10:35
Show Gist options
  • Save elpaso/9227318 to your computer and use it in GitHub Desktop.
Save elpaso/9227318 to your computer and use it in GitHub Desktop.
Django truncate HTML with smart insert
# Precompiled patterns shared by the text helpers below.

# Matches either a whole tag (group 1 is empty) or a run of words / HTML
# entities (captured in group 1).
re_words = re.compile(
    r'<.*?>|((?:\w[-\w]*|&.*?;)+)',
    re.UNICODE | re.DOTALL,
)

# Matches either a whole tag (group 1 is empty) or one single character
# (captured in group 1).
re_chars = re.compile(
    r'<.*?>|(.)',
    re.UNICODE | re.DOTALL,
)

# Splits one tag into: closing slash, tag name, self-closing slash.
re_tag = re.compile(
    r'<(/)?([^ ]+?)(?:(\s*/)| .*?)?>',
    re.DOTALL,
)

# Windows and old-Mac line endings; used in normalize_newlines.
re_newlines = re.compile(r'\r\n|\r')

# An uppercase letter preceded by a lowercase one, or an uppercase letter
# that is not followed by another uppercase letter or the end of the string.
re_camel_case = re.compile(r'(((?<=[a-z])[A-Z])|([A-Z](?![A-Z]|$)))')
def add_truncation_text(text, truncate=None):
    """
    Combine ``text`` with a truncation marker and return the result.

    ``truncate`` is the marker to use. When it is ``None`` the translatable
    default ``'%(truncated_text)s...'`` is looked up through ``pgettext``.
    The marker is always passed through ``force_text`` before use.

    If the marker contains the ``%(truncated_text)s`` placeholder, ``text``
    is substituted into it. Otherwise the marker is appended to ``text``,
    unless ``text`` already ends with the marker.
    """
    marker = truncate
    if marker is None:
        marker = pgettext(
            'String to return when truncating text',
            '%(truncated_text)s...')
    marker = force_text(marker)

    placeholder = '%(truncated_text)s'
    if placeholder in marker:
        # The marker is a template: let it decide where the text goes.
        return marker % {'truncated_text': text}

    # Plain marker: append it, but never twice in a row.
    already_marked = text.endswith(marker)
    return text if already_marked else '%s%s' % (text, marker)
def html_words(text, length, truncate):
    """
    Insert truncation text into HTML after a given number of words.

    Words are counted outside of tags; tags, comments and declarations do
    not count. If ``text`` holds more than ``length`` words, the result is
    built from these pieces, in order:

    * the HTML up to and including word number ``length``;
    * closing tags for every element still open at that point;
    * the truncation text (``add_truncation_text('', truncate)``);
    * the same elements reopened, outermost first;
    * the untouched remainder of ``text``.

    Nothing is removed from the input, and newlines are preserved.

    Returns ``''`` when ``length`` is zero or negative, and ``text``
    unchanged when it contains at most ``length`` words.

    Note: reopened elements are written as bare ``<name>`` tags, so the
    attributes of the original opening tag are not repeated.
    """
    if length <= 0:
        return ''

    # Elements that never take an end tag: the HTML void elements, plus
    # the legacy 'param'.
    void_elements = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    # Count non-HTML words and keep note of open tags
    pos = 0
    end_text_pos = 0
    words = 0
    open_tags = []  # innermost element first
    while words <= length:
        m = re_words.search(text, pos)
        if not m:
            # Checked through whole string
            break
        pos = m.end(0)
        if m.group(1):
            # It's an actual non-HTML word
            words += 1
            if words == length:
                # Remember where the truncation text must be inserted, but
                # keep scanning to find out whether more words follow.
                end_text_pos = pos
            continue

        # Check for tag
        tag = re_tag.match(m.group(0))
        if not tag or end_text_pos:
            # Don't worry about non tags or tags after our truncate point
            continue
        closing_tag, tagname, self_closing = tag.groups()
        if tagname.startswith(('!', '?')):
            # Comments, doctype declarations and processing instructions
            # are not elements: they must never be closed or reopened.
            continue
        # Element names are always case-insensitive
        tagname = tagname.lower()
        if self_closing or tagname in void_elements:
            continue
        if closing_tag:
            # Check for match in open tags list
            try:
                i = open_tags.index(tagname)
            except ValueError:
                # Stray end tag without a matching start tag: ignore it.
                pass
            else:
                # SGML: An end tag closes, back to the matching start tag,
                # all unclosed intervening start tags with omitted end tags
                open_tags = open_tags[i + 1:]
        else:
            # Add it to the start of the open tags list
            open_tags.insert(0, tagname)

    if words <= length:
        # Don't try to close tags if we don't need to truncate
        return text

    truncate_text = add_truncation_text('', truncate)
    # Close innermost first, then reopen outermost first so the nesting
    # after the inserted text mirrors the nesting before it.
    closing_markup = ''.join('</%s>' % name for name in open_tags)
    reopening_markup = ''.join('<%s>' % name for name in reversed(open_tags))
    return ''.join((
        text[:end_text_pos],
        closing_markup,
        truncate_text,
        reopening_markup,
        text[end_text_pos:],
    ))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment