Skip to content

Instantly share code, notes, and snippets.

@dcrosta
Created November 4, 2011 17:50
Show Gist options
  • Save dcrosta/1339994 to your computer and use it in GitHub Desktop.
Save dcrosta/1339994 to your computer and use it in GitHub Desktop.
truncates HTML without screwing up tag nesting
import re
from HTMLParser import HTMLParser
whitespace = re.compile('(\w+)')
class HTMLAbbrev(HTMLParser):
def __init__(self, maxlength, *args, **kwargs):
HTMLParser.__init__(self, *args, **kwargs)
self.stack = []
self.maxlength = maxlength
self.length = 0
self.done = False
self.out = []
def emit(self, thing, count=False):
if count:
self.length += len(thing)
if self.length < self.maxlength:
self.out.append(thing)
elif not self.done:
# trim trailing whitespace
self.out[-1] = self.out[-1].rstrip()
# close out tags on the stack
for tag in reversed(self.stack):
self.out.append('</%s>' % tag)
self.done = True
def handle_starttag(self, tag, attrs):
self.stack.append(tag)
attrs = ' '.join('%s="%s"' % (k, v) for k, v in attrs)
self.emit('<%s%s>' % (tag, (' ' + attrs).rstrip()))
def handle_endtag(self, tag):
if tag == self.stack[-1]:
self.emit('</%s>' % tag)
del self.stack[-1]
else:
raise Exception('end tag %r does not match stack: %r' % (tag, self.stack))
def handle_startendtag(self, tag, attrs):
self.stack.append(tag)
attrs = ' '.join('%s="%s"' % (k, v) for k, v in attrs)
self.emit('<%s%s/>' % (tag, (' ' + attrs).rstrip()))
def handle_data(self, data):
for word in whitespace.split(data):
self.emit(word, count=True)
def handle_entityref(self, name):
self.emit('&%s;' % name)
def handle_charref(self, name):
return self.handle_entityref('#%s' % name)
def close(self):
return ''.join(self.out)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment