Last active
March 29, 2024 16:05
-
-
Save mansourmoufid/711b38e65a3ddd0863e4f61827a2e699 to your computer and use it in GitHub Desktop.
Add abbr tags to an HTML document using BeautifulSoup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import functools
import re
# import sys
import typing

import bs4
# Sample document exercising the substitution: bare abbreviations,
# abbreviations inside other tags, already-tagged abbreviations, and
# near-misses (U.S.A., N.U.S.) that must be left alone.
html = '''
<html>
<head><title></title></head>
<body>
<p>U.S.</p>
<p>U.K.</p>
<p><a>U.S.</a></p>
<p><a>U.S.</a> and U.K.</p>
<p>U.S. and U.K.</p>
<p>U.K. and U.S.</p>
<p>U.S., U.K. and Canada.</p>
<p><abbr>U.S.</abbr></p>
<p>U.S.A.</p>
<p>N.U.S.</p>
</body>
</html>
'''
# html = sys.stdin

# Maps each abbreviation, exactly as it appears in the text, to the
# expansion used for the title attribute of its <abbr> tag.
abbreviations = {
    'U.S.': 'United States',
    'U.K.': 'United Kingdom',
}
# def parse_abbreviations(lines):
#     a, b = None, None
#     for i, line in enumerate(lines):
#         if line.startswith('#'):
#             continue
#         text = line.rstrip('\n')
#         if len(text) == 0:
#             yield a, b
#             a, b = None, None
#         elif a is None:
#             a = text
#         elif b is None:
#             b = text
#
# with open('abbr.txt', 'rt') as f:
#     abbreviations = dict(parse_abbreviations(f))
def elements(x):
    """Yield the non-blank text nodes reachable from *x*.

    Starting at *x*, follows ``next_element`` links until one is None,
    yielding every ``bs4.NavigableString`` whose text is more than just
    newlines and spaces.  Passing None yields nothing.

    NOTE(review): the walk is not explicitly stopped at the end of
    *x*'s own subtree; it ends only when ``next_element`` is None.
    Presumably fine because the caller passes <body> -- confirm.
    """
    while x is not None:
        # Only newlines and spaces count as blank here (not tabs).
        if isinstance(x, bs4.NavigableString) and x.string.strip('\n '):
            yield x
        x = x.next_element
def join(delim, xs):
    """Yield the items of *xs* with a delimiter between each pair.

    *xs* may be any iterable (list, iterator, generator).  If *delim*
    is callable it is called with no arguments each time a delimiter is
    needed, so every gap gets a fresh object; otherwise *delim* itself
    is yielded.  No delimiter is emitted before the first item or after
    the last, and an empty *xs* yields nothing.
    """
    items = iter(xs)
    try:
        first = next(items)
    except StopIteration:
        return
    yield first
    for item in items:
        # Emitting the delimiter *before* each later item avoids
        # needing len(xs), which iterators do not support.
        yield delim() if callable(delim) else delim
        yield item
def new_abbr(abbr):
    """Return a new <abbr> tag whose text is *abbr*.

    The tag is created through the module-level ``soup``.  When *abbr*
    is a key of the module-level ``abbreviations`` mapping, the mapped
    value is stored as the tag's ``title`` attribute; otherwise the tag
    has no title.
    """
    tag = soup.new_tag('abbr')
    try:
        tag.attrs['title'] = abbreviations[abbr]
    except KeyError:
        # Unknown abbreviation: emit the tag without a title.
        pass
    tag.string = abbr
    return tag
soup = bs4.BeautifulSoup(html, 'html5lib')

for abbr in list(abbreviations):
    # Match abbr in the string if it's not followed by a word character,
    # and not preceded by a word character or period.
    abbr_exp = re.compile(r'(?<![\w.])' + re.escape(abbr) + r'(?!\w)')
    # elements() in inner loop, or strange things happen: the tree is
    # modified below, so the text nodes are re-collected (and
    # snapshotted with list()) for every abbreviation.
    for e in list(elements(soup.find('body'))):
        if isinstance(e, bs4.Comment):
            continue
        parent = e.parent
        if parent.name == 'abbr':
            # Avoid recursion: this text is already inside an <abbr>.
            continue
        text = str(e.string)
        if not abbr_exp.search(text):
            continue
        # Tag.index() locates the child by identity.  NavigableString
        # is a str, so parent.contents.index(e) would instead return
        # the first sibling that merely has equal text.
        i = parent.index(e)
        e.extract()
        # The pattern has no capture groups, so split() returns only
        # the text around each match; join() puts a fresh <abbr> tag
        # into every gap.
        xs = abbr_exp.split(text)
        ys = join(functools.partial(new_abbr, abbr), xs)
        for j, y in enumerate(ys):
            parent.insert(i + j, y)

# Merge the adjacent text nodes left behind by the splitting above.
soup.smooth()

# Sanity checks: the document must still have a non-empty head and body.
assert soup.find('head') is not None
assert soup.find('body') is not None
assert len(list(soup.head.children)) > 0
assert len(list(soup.body.children)) > 0

print(str(soup))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment