Skip to content

Instantly share code, notes, and snippets.

@averagehuman
Last active April 13, 2017 16:51
Show Gist options
  • Save averagehuman/0ea1455c2fe76c0b5eac to your computer and use it in GitHub Desktop.
Save averagehuman/0ea1455c2fe76c0b5eac to your computer and use it in GitHub Desktop.
Mark up Twitter hashtags and usernames within an HTML fragment
#!/usr/bin/env python
"""
The twitter-text-python library (https://pypi.python.org/pypi/twitter-text-python) can be used
to urlify text containing @<username>s and #<hashtag>s. It is a bit trickier if you want to do
the same with HTML, but BeautifulSoup makes it straightforward.
"""
from bs4 import BeautifulSoup, NavigableString
from ttp import ttp
# One ttp parser instance is created at import time; ``parse_text`` is its
# bound ``parse`` method, so every call goes through that same instance.
_parser = ttp.Parser()
parse_text = _parser.parse

# Non-exhaustive set of tag names whose text content must be left untouched.
EXCLUDE_TAGS = frozenset((
    'a',
    'style',
    'script',
    'title',
    'link',
))
def _inside_excluded_tag(tag):
    """Return True if *tag* or any of its ancestors is named in EXCLUDE_TAGS."""
    if tag.name.lower() in EXCLUDE_TAGS:
        return True
    # Text nested deeper inside an excluded tag (e.g. ``<a><b>@x</b></a>``)
    # must also be left alone, otherwise we would emit nested anchors.
    return any(parent.name.lower() in EXCLUDE_TAGS for parent in tag.parents)


def transform_html(html):
    """Mark up twitter hashtags and usernames within an HTML fragment.

    Every plain text node that is not inside one of ``EXCLUDE_TAGS`` is passed
    through ``parse_text`` and replaced by the markup found in the ``html``
    attribute of the parse result. Comments, CDATA sections, doctypes and
    whitespace-only strings are left untouched.

    :param html: the HTML fragment to process, as a string.
    :returns: the transformed fragment as a unicode string.

    >>> fragment = '<p>my name is @SorenKQuotes and my site is <a href="http://site.com/index.html#link">here</a></p>'
    >>> for a in BeautifulSoup(fragment, 'html.parser').find_all('a'):
    ...     print(a)
    <a href="http://site.com/index.html#link">here</a>
    >>> fragment = transform_html(fragment)
    >>> for a in BeautifulSoup(fragment, 'html.parser').find_all('a'):
    ...     print(a)
    <a href="https://twitter.com/SorenKQuotes">@SorenKQuotes</a>
    <a href="http://site.com/index.html#link">here</a>
    """
    # Function-scope import so this block does not depend on changes to the
    # file-level import lines.
    from bs4.element import PreformattedString

    # Pin the parser: the default choice depends on what is installed, and
    # lxml/html5lib would wrap a fragment in <html><body> (and bare text in
    # <p>), which corrupts both the input and every re-parsed text snippet.
    soup = BeautifulSoup(html, 'html.parser')

    # find_all() returns a fully built list, so the anchors inserted below are
    # never visited themselves.
    for tag in soup.find_all(True):
        if _inside_excluded_tag(tag):
            continue
        # Iterate over a snapshot: the loop body edits ``tag.contents``.
        for child in list(tag.contents):
            if not isinstance(child, NavigableString):
                continue
            # Comment, CData, Doctype etc. subclass NavigableString but are
            # not visible text; rewriting them would leak them into the page.
            if isinstance(child, PreformattedString):
                continue
            if not child.strip():
                continue
            marked_up = BeautifulSoup(parse_text(child).html, 'html.parser')
            # Move the parsed nodes in one by one instead of splicing a whole
            # BeautifulSoup document object into another tree.
            for node in list(marked_up.contents):
                child.insert_before(node)
            child.extract()

    # decode() returns a unicode string on both Python 2 and Python 3, whereas
    # the ``unicode`` builtin only exists on Python 2.
    return soup.decode()
if __name__ == '__main__':
    # Running the file as a script executes the doctests embedded above.
    from doctest import testmod
    testmod()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment