Created
February 26, 2014 10:35
-
-
Save elpaso/9227318 to your computer and use it in GitHub Desktop.
Django truncate HTML with smart insert
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Set up regular expressions

# Matches either a whole tag (group 1 is None) or one "word": a run of word
# characters/hyphens and/or HTML entities (group 1 holds the word).
re_words = re.compile(r'<.*?>|((?:\w[-\w]*|&.*?;)+)', re.U | re.S)
# Matches either a whole tag (group 1 is None) or one single character.
re_chars = re.compile(r'<.*?>|(.)', re.U | re.S)
# Splits a tag into (closing slash, element name, self-closing slash).
# The name ends at ANY whitespace, not only at a literal space, so that
# attributes separated by a newline or tab are not swallowed into the name.
re_tag = re.compile(r'<(/)?(\S+?)(?:(\s*/)|\s.*?)?>', re.S)
re_newlines = re.compile(r'\r\n|\r')  # Used in normalize_newlines
# Matches the capital letters that start a new word in a camelCase string.
re_camel_case = re.compile(r'(((?<=[a-z])[A-Z])|([A-Z](?![A-Z]|$)))')
def add_truncation_text(text, truncate=None):
    """
    Combine ``text`` with the truncation marker ``truncate``.

    When ``truncate`` is None, the marker is looked up through ``pgettext``
    using the message '%(truncated_text)s...'. The marker is then passed
    through ``force_text``. (Both helpers are expected to be supplied by the
    surrounding module; they are not defined in this snippet.)

    If the marker contains the ``%(truncated_text)s`` placeholder, ``text``
    is substituted into it. Otherwise the marker is appended to ``text``,
    unless ``text`` already ends with it, in which case ``text`` is returned
    as is.
    """
    placeholder = '%(truncated_text)s'
    marker = force_text(
        pgettext(
            'String to return when truncating text',
            '%(truncated_text)s...')
        if truncate is None else truncate
    )

    if placeholder in marker:
        return marker % {'truncated_text': text}

    # No placeholder in the marker: append it, but never twice in a row.
    return text if text.endswith(marker) else '%s%s' % (text, marker)
# Elements that never take an end tag: the HTML4 "singlets" plus the void
# elements introduced by HTML5.
_VOID_ELEMENTS = frozenset((
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'link', 'meta', 'param', 'source', 'track', 'wbr',
))


def html_words(text, length, truncate):
    """
    Insert the truncation text into HTML after ``length`` words.

    Words are counted outside of tags (tags, comments and declarations are
    not counted). At the insertion point every element that is still open is
    closed, the truncation text (built with ``add_truncation_text('',
    truncate)``) is added, the same elements are reopened with their
    original start tags (attributes included), and the rest of the input
    follows unchanged. Newlines in the HTML are preserved.

    Returns '' when ``length`` is zero or negative, and returns ``text``
    untouched when it contains no more than ``length`` words.

    Note: because start tags are repeated verbatim, an element carrying an
    ``id`` attribute that spans the insertion point appears twice with the
    same id in the result.
    """
    if length <= 0:
        return ''

    # Count non-HTML words and keep note of open tags
    pos = 0
    end_text_pos = 0
    words = 0
    # (lower-cased element name, original start-tag markup) pairs, most
    # recently opened element first.
    open_tags = []

    # Scan one word past the limit so we know whether anything follows it.
    while words <= length:
        m = re_words.search(text, pos)
        if not m:
            # Checked through whole string
            break
        pos = m.end(0)
        if m.group(1):
            # It's an actual non-HTML word
            words += 1
            if words == length:
                end_text_pos = pos
            continue
        if end_text_pos:
            # Tags after the insertion point do not affect what we close
            continue
        markup = m.group(0)
        tag = re_tag.match(markup)
        if not tag:
            continue
        closing_tag, tagname, self_closing = tag.groups()
        # Attributes may be separated from the name by a newline or tab;
        # keep only the name itself.
        name_parts = tagname.split()
        if not name_parts or name_parts[0][0] in '!?':
            # Comments, doctypes and processing instructions are not
            # elements and must never be closed or reopened.
            continue
        # Element names are always case-insensitive
        tagname = name_parts[0].lower()
        if self_closing or tagname in _VOID_ELEMENTS:
            continue
        if not closing_tag:
            # Add it to the start of the open tags list
            open_tags.insert(0, (tagname, markup))
            continue
        # SGML: An end tag closes, back to the matching start tag,
        # all unclosed intervening start tags with omitted end tags
        for i, (name, _) in enumerate(open_tags):
            if name == tagname:
                del open_tags[:i + 1]
                break

    if words <= length:
        # Nothing follows the limit, so there is nowhere to insert
        return text

    truncate_text = add_truncation_text('', truncate)
    parts = [text[:end_text_pos]]
    # Close any tags still open, innermost first
    parts.extend('</%s>' % name for name, _ in open_tags)
    if truncate_text:
        parts.append(truncate_text)
    # Reopen the same elements, outermost first, with their attributes
    parts.extend(start_tag for _, start_tag in reversed(open_tags))
    # Add remaining part of the string
    parts.append(text[end_text_pos:])
    return ''.join(parts)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment