# mansourmoufid/bs4-abbr.py

## bs4-abbr.py
import functools
import re
# import sys
import typing

import bs4


# Sample document covering the cases the script has to handle: bare
# abbreviations, abbreviations nested in another tag or mixed with other
# text, text already wrapped in <abbr>, and look-alikes ("U.S.A.",
# "N.U.S.") that the match pattern used below is written to reject.
html = '''
    <html>
    <head><title></title></head>
    <body>
        <p>U.S.</p>
        <p>U.K.</p>
        <p><a>U.S.</a></p>
        <p><a>U.S.</a> and U.K.</p>
        <p>U.S. and U.K.</p>
        <p>U.K. and U.S.</p>
        <p>U.S., U.K. and Canada.</p>
        <p><abbr>U.S.</abbr></p>
        <p>U.S.A.</p>
        <p>N.U.S.</p>
    </body>
    </html>
'''
# Alternative input (disabled): take the document from standard input
# instead; this also needs the commented-out `import sys` above.
# html = sys.stdin

# Maps each abbreviation to its expansion; new_abbr() stores the expansion
# in the `title` attribute of the <abbr> tag it builds.
abbreviations = {
    'U.S.': 'United States',
    'U.K.': 'United Kingdom',
}
# Alternative source (disabled): load the table from 'abbr.txt'. As written,
# the parser takes the first non-empty line of a record as the abbreviation
# and the second as the expansion, yields the pair when it reaches an empty
# line, and skips lines starting with '#'.
# NOTE(review): a last record that is not followed by an empty line is never
# yielded, and two consecutive empty lines yield (None, None) -- confirm the
# file format before enabling this.
# def parse_abbreviations(lines):
#     a, b = None, None
#     for i, line in enumerate(lines):
#         if line.startswith('#'):
#             continue
#         text = line.rstrip('\n')
#         if len(text) == 0:
#             yield a, b
#             a, b = None, None
#         elif a is None:
#             a = text
#         elif b is None:
#             b = text
#
# with open('abbr.txt', 'rt') as f:
#     abbreviations = dict(parse_abbreviations(f))


def elements(x):
    """Yield the non-blank text nodes found by walking forward from *x*.

    Starting at *x*, the walk follows ``next_element`` links until one of
    them is ``None``. Only ``bs4.NavigableString`` nodes are yielded, and
    only when they contain something other than newlines and spaces.
    """
    node = x
    while node is not None:
        is_text = isinstance(node, bs4.NavigableString)
        if is_text and node.string.strip('\n ') != '':
            yield node
        node = node.next_element


def join(delim, xs):
    """Yield the items of *xs* with a delimiter between consecutive items.

    Parameters:
        delim: the delimiter. If it is callable it is called with no
            arguments each time a delimiter is needed, so every gap gets
            a freshly built value; otherwise the object itself is yielded.
        xs: any iterable (list, iterator, generator, ...).

    Nothing is yielded for an empty *xs*, and no delimiter is produced
    after the last item.
    """
    items = iter(xs)
    try:
        first = next(items)
    except StopIteration:
        # Empty input: nothing to join.
        return
    yield first
    # Emitting the delimiter *before* every later item avoids needing
    # len(xs), so inputs without a length work too.
    for item in items:
        yield delim() if callable(delim) else delim
        yield item


def new_abbr(abbr):
    """Build an ``<abbr>`` tag whose text is *abbr*.

    The tag is created through the module-level ``soup``. When *abbr* is
    a key of ``abbreviations`` its expansion is stored in the tag's
    ``title`` attribute; otherwise no ``title`` is set.
    """
    tag = soup.new_tag('abbr')
    tag.string = abbr
    if abbr in abbreviations:
        tag.attrs['title'] = abbreviations[abbr]
    return tag


soup = bs4.BeautifulSoup(html, 'html5lib')

for abbr in list(abbreviations):

    # Match abbr only when it is not followed by a word character and not
    # preceded by a word character or a period, so "U.S.A." and "N.U.S."
    # are left alone. Compiled once per abbreviation because the pattern
    # is reused for every text node.
    abbr_exp = re.compile(r'(?<![\w.])' + re.escape(abbr) + r'(?!\w)')

    # elements() in inner loop, or strange things happen: the tree is
    # modified below, so the text nodes are collected afresh (and
    # snapshotted with list()) for each abbreviation.
    for e in list(elements(soup.find('body'))):

        if isinstance(e, bs4.Comment):
            continue

        parent = e.parent

        if parent is None:
            # A node in the snapshot may since have been detached from
            # the tree (e.g. merged away by soup.smooth()); skip it
            # instead of failing on parent.name.
            continue

        if parent.name == 'abbr':
            # Avoid recursion
            continue

        if not abbr_exp.search(str(e.string)):
            continue

        # Locate e by identity. parent.contents.index(e) compares by
        # value, so an equal string earlier in the same parent (for
        # example a comment with the same text) would give the wrong
        # insertion point.
        i = parent.index(e)

        x = e.extract()

        # Interleave the text around each match with freshly built <abbr>
        # tags and put the pieces back where the original string was.
        xs = abbr_exp.split(x.string)
        ys = join(functools.partial(new_abbr, abbr), xs)
        for j, y in enumerate(ys):
            parent.insert(i + j, y)

        # Consolidate adjacent text nodes after the edit.
        soup.smooth()

# Sanity checks: the rewritten document still has a populated head and body.
assert soup.find('head') is not None
assert soup.find('body') is not None
assert len(list(soup.head.children)) > 0
assert len(list(soup.body.children)) > 0

print(str(soup))
	import functools
	import re
	# import sys
	import typing

	import bs4


	# Sample document covering the cases the script has to handle: bare
	# abbreviations, abbreviations nested in another tag or mixed with other
	# text, text already wrapped in <abbr>, and look-alikes ("U.S.A.",
	# "N.U.S.") that the match pattern used below is written to reject.
	html = '''
	<html>
	<head><title></title></head>
	<body>
	<p>U.S.</p>
	<p>U.K.</p>
	<p><a>U.S.</a></p>
	<p><a>U.S.</a> and U.K.</p>
	<p>U.S. and U.K.</p>
	<p>U.K. and U.S.</p>
	<p>U.S., U.K. and Canada.</p>
	<p><abbr>U.S.</abbr></p>
	<p>U.S.A.</p>
	<p>N.U.S.</p>
	</body>
	</html>
	'''
	# Alternative input (disabled): take the document from standard input
	# instead; this also needs the commented-out `import sys` above.
	# html = sys.stdin

	# Maps each abbreviation to its expansion; new_abbr() stores the expansion
	# in the `title` attribute of the <abbr> tag it builds.
	abbreviations = {
	'U.S.': 'United States',
	'U.K.': 'United Kingdom',
	}
	# Alternative source (disabled): load the table from 'abbr.txt'. As
	# written, the parser takes the first non-empty line of a record as the
	# abbreviation and the second as the expansion, yields the pair when it
	# reaches an empty line, and skips lines starting with '#'.
	# NOTE(review): a last record that is not followed by an empty line is
	# never yielded -- confirm the file format before enabling this.
	# def parse_abbreviations(lines):
	#     a, b = None, None
	#     for i, line in enumerate(lines):
	#         if line.startswith('#'):
	#             continue
	#         text = line.rstrip('\n')
	#         if len(text) == 0:
	#             yield a, b
	#             a, b = None, None
	#         elif a is None:
	#             a = text
	#         elif b is None:
	#             b = text
	#
	# with open('abbr.txt', 'rt') as f:
	#     abbreviations = dict(parse_abbreviations(f))


	def elements(x):
	    """Yield the non-blank text nodes found by walking forward from *x*.

	    Starting at *x*, the walk follows ``next_element`` links until one
	    of them is ``None``. Only ``bs4.NavigableString`` nodes are yielded,
	    and only when they contain something other than newlines and spaces.
	    """
	    while x is not None:
	        if isinstance(x, bs4.NavigableString):
	            if len(x.string.strip('\n ')) > 0:
	                yield x
	        x = x.next_element


	def join(delim, xs):
	    """Yield the items of *xs* with a delimiter between consecutive items.

	    Parameters:
	        delim: the delimiter. If it is callable it is called with no
	            arguments each time a delimiter is needed, so every gap
	            gets a freshly built value; otherwise the object itself is
	            yielded.
	        xs: any iterable (list, iterator, generator, ...).

	    Nothing is yielded for an empty *xs*, and no delimiter is produced
	    after the last item.
	    """
	    items = iter(xs)
	    try:
	        first = next(items)
	    except StopIteration:
	        # Empty input: nothing to join.
	        return
	    yield first
	    # Emitting the delimiter *before* every later item avoids needing
	    # len(xs), so inputs without a length work too.
	    for item in items:
	        yield delim() if callable(delim) else delim
	        yield item


	def new_abbr(abbr):
	    """Build an ``<abbr>`` tag whose text is *abbr*.

	    The tag is created through the module-level ``soup``. When *abbr*
	    is a key of ``abbreviations`` its expansion is stored in the tag's
	    ``title`` attribute; otherwise no ``title`` is set.
	    """
	    new = soup.new_tag('abbr')
	    if abbr in abbreviations:
	        new.attrs['title'] = abbreviations[abbr]
	    new.string = abbr
	    return new


	soup = bs4.BeautifulSoup(html, 'html5lib')

	for abbr in list(abbreviations):

	    # Match abbr only when it is not followed by a word character and
	    # not preceded by a word character or a period, so "U.S.A." and
	    # "N.U.S." are left alone. Compiled once per abbreviation because
	    # the pattern is reused for every text node.
	    abbr_exp = re.compile(r'(?<![\w.])' + re.escape(abbr) + r'(?!\w)')

	    # elements() in inner loop, or strange things happen: the tree is
	    # modified below, so the text nodes are collected afresh (and
	    # snapshotted with list()) for each abbreviation.
	    for e in list(elements(soup.find('body'))):

	        if isinstance(e, bs4.Comment):
	            continue

	        parent = e.parent

	        if parent is None:
	            # A node in the snapshot may since have been detached from
	            # the tree (e.g. merged away by soup.smooth()); skip it
	            # instead of failing on parent.name.
	            continue

	        if parent.name == 'abbr':
	            # Avoid recursion
	            continue

	        if not abbr_exp.search(str(e.string)):
	            continue

	        # Locate e by identity. parent.contents.index(e) compares by
	        # value, so an equal string earlier in the same parent (for
	        # example a comment with the same text) would give the wrong
	        # insertion point.
	        i = parent.index(e)

	        x = e.extract()

	        # Interleave the text around each match with freshly built
	        # <abbr> tags and put the pieces back where the original
	        # string was.
	        xs = abbr_exp.split(x.string)
	        ys = join(functools.partial(new_abbr, abbr), xs)
	        for j, y in enumerate(ys):
	            parent.insert(i + j, y)

	        # Consolidate adjacent text nodes after the edit.
	        soup.smooth()

	# Sanity checks: the rewritten document still has a populated head and
	# body.
	assert soup.find('head') is not None
	assert soup.find('body') is not None
	assert len(list(soup.head.children)) > 0
	assert len(list(soup.body.children)) > 0

	print(str(soup))