Skip to content

Instantly share code, notes, and snippets.

@clehner
Created July 26, 2010 04:21
Show Gist options
  • Save clehner/490174 to your computer and use it in GitHub Desktop.
Save clehner/490174 to your computer and use it in GitHub Desktop.
A better blip.append_content() for python wave robots.
from BeautifulSoup import BeautifulSoup, NavigableString
from waveapi import element
import re
import htmlentitydefs
# Marker text that stands in for each <img> tag while the HTML is appended as
# markup; it is then looked up in the blip and replaced by an Image element.
IMAGE_PLACEHOLDER = '***{{((IMAGE_ELEMENT))}}***'
def append_content_to_blip(blip, content, type=None):
    '''
    Append content to a blip, either as plain text or as HTML markup.

    blip    -- target object. ``append`` is called on it for plain text;
               ``append_markup`` and ``first`` are called for HTML.
    content -- the text or HTML string to append.
    type    -- MIME type of content. 'text/plain' is appended as text after
               whitespace normalisation; any other value (including None)
               is treated as HTML.

    Returns None.
    '''
    if type == 'text/plain':
        # Replace characters that Wave breaks on. CRLF is collapsed first so
        # that a Windows line ending becomes one newline rather than two.
        text = content.replace('\t', ' ')
        text = text.replace('\r\n', '\n').replace('\r', '\n')
        blip.append(text)
        return

    imgs = []

    # originally by Pamela Fox (Google)
    # http://google-wave-resources.googlecode.com/svn/trunk/samples/extensions/robots/python/maildigester/handler.py
    def cleanup(soup):
        '''Walk the tree, swapping images for placeholders and fixing links.'''
        for tag in soup:
            if isinstance(tag, NavigableString):
                continue
            if tag.name == 'img':
                imgs.append({'url': tag.get('src'),
                             'width': tag.get('width'),
                             'height': tag.get('height')})
                # replace it with an image element later
                tag.replaceWith(IMAGE_PLACEHOLDER)
                # The tag is now detached from the tree; nothing to descend
                # into.
                continue
            if tag.name == 'a':
                # Anchors without an href (e.g. <a name="top">) are legal, so
                # look the attribute up defensively instead of indexing.
                href = tag.get('href')
                if href is not None:
                    # Decode ampersands that are still entity-encoded in the
                    # link target.
                    tag['href'] = href.replace('&amp;', '&')
            cleanup(tag)

    html = unescape(content)
    soup = BeautifulSoup(html.strip())
    cleanup(soup)
    html = unicode(soup)
    html = html.replace('\t', ' ')
    # Since it's HTML, it should use <br>s instead of line breaks.
    # NOTE(review): dropping newlines outright can join words that were
    # separated only by a line break -- confirm this is intended.
    html = html.replace('\r', '').replace('\n', '')
    blip.append_markup(html)

    # Because append_markup doesn't accept images, we replace img tags in the
    # html with placeholders and then replace them with image elements.
    for img in imgs:
        image = element.Image(url=img['url'],
                              width=img['width'],
                              height=img['height'])
        placeholder = blip.first(IMAGE_PLACEHOLDER)
        # Image elements don't allow links on them.
        # So insert an extra space after images so that a link can still
        # be clicked if it would normally be on the image.
        placeholder.insert_after(' ')
        placeholder.replace(image)
def unescape(text):
    '''
    Replaces HTML entities with unicode characters
    by Fredrik Lundh
    http://effbot.org/zone/re-sub.htm#unescape-html

    text -- string that may contain numeric character references
            ('&#65;', '&#x41;') or named entities ('&amp;').

    Returns the string with every recognised reference replaced by the
    character it denotes. Malformed, out-of-range or unknown references are
    left exactly as they were.
    '''
    # Resolve the entity table and the code-point-to-character function
    # locally so this helper works on both Python 2 and Python 3.
    try:
        from htmlentitydefs import name2codepoint  # Python 2
    except ImportError:
        from html.entities import name2codepoint  # Python 3
    try:
        to_char = unichr  # Python 2: chr() only covers 0-255 there
    except NameError:
        to_char = chr

    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # character reference
            try:
                # The hex marker is accepted in either case (&#x41; / &#X41;).
                if ref[:3].lower() == "&#x":
                    return to_char(int(ref[3:-1], 16))
                return to_char(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: not a number, or outside the Unicode range.
                # OverflowError: number too large for the character
                # constructor to accept at all.
                pass
        else:
            # named entity
            try:
                return to_char(name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment