Skip to content

Instantly share code, notes, and snippets.

@ljdelight
Created August 25, 2014 13:56
Show Gist options
  • Save ljdelight/7d1d7cfd151d866bd806 to your computer and use it in GitHub Desktop.
Save ljdelight/7d1d7cfd151d866bd806 to your computer and use it in GitHub Desktop.
import logging
from bs4 import BeautifulSoup, NavigableString
def modify_node(soup, node):
    """Replace a multi-line text node with its lines, each followed by a <br>.

    For every line of the node's text that has non-whitespace content, three
    siblings are inserted where the node used to be: the line text, a new
    ``br`` tag, and a newline string. Blank lines inside the text are dropped.

    The node is left untouched when any of these holds:

    * it has no parent (it is not attached to a tree);
    * its text is empty or whitespace only, so there is nothing to put a
      break after;
    * its text is a single line that does not end in a line break;
    * its next sibling is already a ``br`` tag.

    Args:
        soup: Object whose ``new_tag(name)`` creates the ``br`` elements
            (the ``BeautifulSoup`` document the node belongs to).
        node: Text node to rewrite. It must expose ``parent``, ``string``,
            ``next_sibling`` and ``extract()``.

    Returns:
        None. The parent's children are modified in place.
    """
    parent = node.parent
    if parent is None:
        return

    text = str(node.string)
    tokens = text.splitlines()

    # Whitespace-only (or empty) text has no visible line to break after.
    # Returning here also guarantees tokens[0] exists below and keeps the
    # whitespace node in the tree instead of silently deleting it.
    if not any(token_.strip() for token_ in tokens):
        return

    # splitlines() strips the line terminators, so a trailing newline must be
    # looked for in the raw text, not in the tokens.
    if len(tokens) == 1 and not text.endswith(("\n", "\r")):
        logging.debug("Skipping token, br not needed: %s", tokens[0])
        return

    # getattr: the sibling may be None or a node type without a ``name``.
    if getattr(node.next_sibling, "name", None) == "br":
        logging.debug(
            "FollowingSibling is a break, br not needed: %s", tokens[0]
        )
        return

    str_index = parent.index(node)
    logging.debug("Index: %s", str_index)

    # Remove the node from the parent. It is now disjoint from the primary
    # soup tree, so it must not be referenced as a tree member after this.
    node.extract()

    for token_ in tokens:
        if not token_.strip():
            continue
        logging.debug("Adding token, br, newline: %s", token_)
        parent.insert(str_index, token_)
        parent.insert(str_index + 1, soup.new_tag("br"))
        parent.insert(str_index + 2, "\n")
        str_index += 3
def main(filename="license_tidy.html", output_filename="output.html",
         parser=None):
    """Insert <br> tags after the text lines inside every <span> of a file.

    The input file is parsed, each direct text child of every ``span``
    element is passed to ``modify_node``, and the resulting document is
    written out with the ``"html"`` output formatter.

    Args:
        filename: Path of the UTF-8 HTML file to read.
        output_filename: Path of the UTF-8 HTML file to write.
        parser: Parser name handed to ``BeautifulSoup`` as ``features``
            (for example ``"html.parser"``). ``None`` keeps the library's
            automatic parser selection, which can differ between
            environments depending on which parsers are installed.
    """
    logging.basicConfig(level=logging.INFO)

    with open(filename, "r", encoding="utf-8") as f_handle:
        f_soup = BeautifulSoup(f_handle, features=parser)

    for span_ in f_soup.find_all(name="span"):
        logging.debug("Found span")
        # Snapshot the current children: modify_node inserts new siblings
        # into this span, so iterating over the live generator is unsafe.
        children = list(span_.children)
        logging.debug("Span children: %s", len(children))
        for child_ in children:
            if isinstance(child_, NavigableString) and child_.string:
                logging.debug("Modifying text node child")
                modify_node(f_soup, child_)

    with open(output_filename, "w", encoding="utf-8") as out:
        out.write(f_soup.decode(formatter="html"))
# Run the conversion only when executed as a script, so that importing this
# module (e.g. to reuse modify_node) does not read or write any files.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment