Skip to content

Instantly share code, notes, and snippets.

@mansourmoufid
Last active March 29, 2024 16:05
Show Gist options
  • Save mansourmoufid/711b38e65a3ddd0863e4f61827a2e699 to your computer and use it in GitHub Desktop.
Save mansourmoufid/711b38e65a3ddd0863e4f61827a2e699 to your computer and use it in GitHub Desktop.
Add abbr tags to an HTML document using BeautifulSoup
import functools
import re
# import sys
import typing
import bs4
# Sample document used as input. It covers bare abbreviations, abbreviations
# nested inside another tag, text already wrapped in <abbr>, and look-alikes
# (U.S.A., N.U.S.) that the pattern built further down must not match.
html = '''
<html>
<head><title></title></head>
<body>
<p>U.S.</p>
<p>U.K.</p>
<p><a>U.S.</a></p>
<p><a>U.S.</a> and U.K.</p>
<p>U.S. and U.K.</p>
<p>U.K. and U.S.</p>
<p>U.S., U.K. and Canada.</p>
<p><abbr>U.S.</abbr></p>
<p>U.S.A.</p>
<p>N.U.S.</p>
</body>
</html>
'''
# html = sys.stdin
# Maps each abbreviation to its expansion; new_abbr() copies the expansion
# into the title attribute of the <abbr> tag it creates.
abbreviations = {
'U.S.': 'United States',
'U.K.': 'United Kingdom',
}
# def parse_abbreviations(lines):
# a, b = None, None
# for i, line in enumerate(lines):
# if line.startswith('#'):
# continue
# text = line.rstrip('\n')
# if len(text) == 0:
# yield a, b
# a, b = None, None
# elif a is None:
# a = text
# elif b is None:
# b = text
#
# with open('abbr.txt', 'rt') as f:
# abbreviations = dict(parse_abbreviations(f))
def elements(x):
    """Yield the non-blank text nodes reachable from ``x``.

    Follows the ``next_element`` chain starting at ``x`` until it reaches
    ``None``. Every ``bs4.NavigableString`` met on the way is yielded,
    except those that are empty once newlines and spaces are stripped.
    """
    node = x
    while node is not None:
        is_text = isinstance(node, bs4.NavigableString)
        if is_text and node.string.strip('\n '):
            yield node
        # Read the successor only after the yield has resumed.
        node = node.next_element
def join(delim, xs):
    """Yield the items of ``xs`` with a delimiter between consecutive items.

    Args:
        delim: The delimiter. If it is callable it is called with no
            arguments each time a delimiter is needed, so every gap gets a
            fresh object; otherwise the same value is yielded for each gap.
        xs: Any iterable, including one-shot iterators and generators.

    Yields:
        ``xs[0], delim, xs[1], delim, ..., xs[-1]``. Nothing is yielded for
        an empty iterable, and no delimiter follows the last item.
    """
    items = iter(xs)
    try:
        first = next(items)
    except StopIteration:
        return
    yield first
    # A delimiter is produced only once a following item is known to exist,
    # so the length of ``xs`` never has to be known up front.
    for item in items:
        yield delim() if callable(delim) else delim
        yield item
def new_abbr(abbr):
    """Create an ``<abbr>`` tag whose text is ``abbr``.

    The tag is created through the module-level ``soup``. When the
    module-level ``abbreviations`` mapping has an entry for ``abbr``, its
    value becomes the tag's ``title`` attribute; otherwise no title is set.
    """
    tag = soup.new_tag('abbr')
    tag.string = abbr
    if abbr in abbreviations:
        tag.attrs['title'] = abbreviations[abbr]
    return tag
# Parse the input with the html5lib tree builder.
soup = bs4.BeautifulSoup(html, 'html5lib')
for abbr in abbreviations:
    # Match abbr only if it is not followed by a word character and not
    # preceded by a word character or a period. Compiled once per
    # abbreviation and reused for every text node.
    abbr_exp = re.compile(r'(?<![\w.])' + re.escape(abbr) + r'(?!\w)')
    make_abbr = functools.partial(new_abbr, abbr)
    # Collect the text nodes afresh for every abbreviation: the previous pass
    # extracted and replaced nodes, so an earlier list would hold nodes that
    # are no longer in the tree. The list() snapshot is needed because the
    # loop body edits the tree that elements() is walking.
    for e in list(elements(soup.find('body'))):
        if isinstance(e, bs4.Comment):
            continue
        parent = e.parent
        if parent.name == 'abbr':
            # Already inside an <abbr>; wrapping it again would nest tags.
            continue
        if not abbr_exp.search(str(e.string)):
            continue
        # Tag.index() looks the child up by identity. parent.contents.index()
        # compares by string value, so an earlier sibling with equal text
        # (e.g. a skipped comment) would yield the wrong position.
        i = parent.index(e)
        x = e.extract()
        pieces = abbr_exp.split(x.string)
        # Put the text pieces back where the node was, with a new <abbr>
        # tag between each pair of consecutive pieces.
        for j, y in enumerate(join(make_abbr, pieces)):
            parent.insert(i + j, y)
# Merge adjacent text nodes left behind by the insertions.
soup.smooth()
# Sanity checks: the document must still have a non-empty head and body.
assert soup.find('head') is not None
assert soup.find('body') is not None
assert len(list(soup.head.children)) > 0
assert len(list(soup.body.children)) > 0
print(str(soup))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment