# ljdelight/htmlparser.py

## htmlparser.py

import logging
from bs4 import BeautifulSoup, NavigableString


def modify_node(soup, node):
    """Turn the line breaks inside a text node into ``<br>`` tags.

    The node's text is split into lines. The node is then removed from its
    parent and, in its place, every line containing non-whitespace
    characters is inserted, each followed by a new ``<br>`` tag and a
    newline string. Whitespace-only lines are dropped.

    Nothing is changed when the text holds no line break at all, or when
    the node is directly followed by a ``<br>`` element.

    Args:
        soup: Object whose ``new_tag("br")`` creates the break elements.
        node: Text node to rewrite. Its ``parent``, ``string``,
            ``next_sibling`` and ``extract()`` are used. After a rewrite
            the node is detached from the tree and must not be used again.
    """
    parent = node.parent
    text = str(node.string)
    tokens = text.splitlines()

    # splitlines() drops the line terminators, so a token can never end in
    # "\n". Compare against the raw text instead: a single token identical
    # to the whole text (or no token at all) means there was no line break.
    if not tokens or (len(tokens) == 1 and tokens[0] == text):
        logging.debug("Skipping token, br not needed: %s", text)
        return

    # getattr() keeps this safe for a missing sibling (None) and for
    # siblings that do not expose a ``name`` attribute.
    if getattr(node.next_sibling, "name", None) == "br":
        logging.debug(
            "FollowingSibling is a break, br not needed: %s", tokens[0]
        )
        return

    # Look the position up only now that we know the node will be replaced.
    str_index = parent.index(node)
    logging.debug("Index: %s", str_index)

    # Remove the node from the parent. It is now disjoint from the primary
    # soup tree, so it must not be referenced below.
    node.extract()

    for token_ in tokens:
        if not token_.strip():
            # Blank line: nothing to emit.
            continue
        logging.debug("Adding token, br, newline: %s", token_)
        parent.insert(str_index, token_)
        parent.insert(str_index + 1, soup.new_tag("br"))
        parent.insert(str_index + 2, "\n")
        str_index += 3


def main(filename="license_tidy.html", output_filename="output.html"):
    """Insert ``<br>`` tags for the line breaks found in ``<span>`` text.

    Parses *filename*, passes every non-empty text child of every
    ``<span>`` element to ``modify_node`` and writes the resulting document
    to *output_filename*. Only ``<span>`` elements are processed.

    Args:
        filename: Path of the UTF-8 HTML file to read.
        output_filename: Path of the UTF-8 HTML file to write.
    """
    logging.basicConfig(level=logging.INFO)

    with open(filename, "r", encoding="utf-8") as f_handle:
        # NOTE(review): no parser is named here, so the choice is left to
        # bs4. Consider passing one explicitly once it is confirmed which
        # parser the existing output was produced with.
        f_soup = BeautifulSoup(f_handle)

    for span_ in f_soup.find_all(name="span"):
        logging.debug("Found span")
        # Snapshot the current children: modify_node() inserts new children
        # into this span, so we MUST NOT iterate over the live view.
        children = list(span_.children)
        logging.debug("Span children: %s", len(children))
        for child_ in children:
            if isinstance(child_, NavigableString) and child_.string:
                logging.debug("Modifying text node child")
                modify_node(f_soup, child_)

    with open(output_filename, "w", encoding="utf-8") as out:
        out.write(f_soup.decode(formatter="html"))


# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()

	import logging
	from bs4 import BeautifulSoup, NavigableString


	def modify_node(soup, node):
	    """Turn the line breaks inside a text node into ``<br>`` tags.

	    The node's text is split into lines. The node is then removed from its
	    parent and, in its place, every line containing non-whitespace
	    characters is inserted, each followed by a new ``<br>`` tag and a
	    newline string. Whitespace-only lines are dropped.

	    Nothing is changed when the text holds no line break at all, or when
	    the node is directly followed by a ``<br>`` element.

	    Args:
	        soup: Object whose ``new_tag("br")`` creates the break elements.
	        node: Text node to rewrite. Its ``parent``, ``string``,
	            ``next_sibling`` and ``extract()`` are used. After a rewrite
	            the node is detached from the tree and must not be used again.
	    """
	    parent = node.parent
	    text = str(node.string)
	    tokens = text.splitlines()

	    # splitlines() drops the line terminators, so a token can never end in
	    # "\n". Compare against the raw text instead: a single token identical
	    # to the whole text (or no token at all) means there was no line break.
	    if not tokens or (len(tokens) == 1 and tokens[0] == text):
	        logging.debug("Skipping token, br not needed: %s", text)
	        return

	    # getattr() keeps this safe for a missing sibling (None) and for
	    # siblings that do not expose a ``name`` attribute.
	    if getattr(node.next_sibling, "name", None) == "br":
	        logging.debug(
	            "FollowingSibling is a break, br not needed: %s", tokens[0]
	        )
	        return

	    # Look the position up only now that we know the node will be replaced.
	    str_index = parent.index(node)
	    logging.debug("Index: %s", str_index)

	    # Remove the node from the parent. It is now disjoint from the primary
	    # soup tree, so it must not be referenced below.
	    node.extract()

	    for token_ in tokens:
	        if not token_.strip():
	            # Blank line: nothing to emit.
	            continue
	        logging.debug("Adding token, br, newline: %s", token_)
	        parent.insert(str_index, token_)
	        parent.insert(str_index + 1, soup.new_tag("br"))
	        parent.insert(str_index + 2, "\n")
	        str_index += 3


	def main(filename="license_tidy.html", output_filename="output.html"):
	    """Insert ``<br>`` tags for the line breaks found in ``<span>`` text.

	    Parses *filename*, passes every non-empty text child of every
	    ``<span>`` element to ``modify_node`` and writes the resulting document
	    to *output_filename*. Only ``<span>`` elements are processed.

	    Args:
	        filename: Path of the UTF-8 HTML file to read.
	        output_filename: Path of the UTF-8 HTML file to write.
	    """
	    logging.basicConfig(level=logging.INFO)

	    with open(filename, "r", encoding="utf-8") as f_handle:
	        # NOTE(review): no parser is named here, so the choice is left to
	        # bs4. Consider passing one explicitly once it is confirmed which
	        # parser the existing output was produced with.
	        f_soup = BeautifulSoup(f_handle)

	    for span_ in f_soup.find_all(name="span"):
	        logging.debug("Found span")
	        # Snapshot the current children: modify_node() inserts new children
	        # into this span, so we MUST NOT iterate over the live view.
	        children = list(span_.children)
	        logging.debug("Span children: %s", len(children))
	        for child_ in children:
	            if isinstance(child_, NavigableString) and child_.string:
	                logging.debug("Modifying text node child")
	                modify_node(f_soup, child_)

	    with open(output_filename, "w", encoding="utf-8") as out:
	        out.write(f_soup.decode(formatter="html"))


	# Run the conversion only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()