Created
August 25, 2014 13:56
-
-
Save ljdelight/7d1d7cfd151d866bd806 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
from bs4 import BeautifulSoup, NavigableString | |
def modify_node(soup, node):
    """Split a multi-line text node into per-line text followed by ``<br>``.

    The text of ``node`` is split into lines.  The node is removed from its
    parent and, for every line that contains non-whitespace characters, three
    children are inserted at the node's former position: the line text, a new
    ``<br>`` tag and a ``"\\n"`` string.  Blank lines are dropped.

    The node is left untouched when:

    * the text is a single line with no trailing newline,
    * the text contains nothing but whitespace (nothing to break up), or
    * the node is immediately followed by a ``<br>`` tag already.

    Args:
        soup: Object providing ``new_tag(name)``; used to create ``<br>`` tags.
        node: Text node to rewrite.  Must expose ``parent``, ``string``,
            ``next_sibling`` and ``extract()``; its parent must expose
            ``index()`` and ``insert()``.

    Returns:
        None.  The parent of ``node`` is modified in place.
    """
    parent = node.parent
    text = str(node.string)
    lines = text.splitlines()

    # str.splitlines() strips the line terminators, so the trailing-newline
    # test must look at the raw text, never at the split result.
    if len(lines) == 1 and not text.endswith("\n"):
        logging.debug("Skipping token, br not needed: %s", lines[0])
        return

    # Empty or whitespace-only text has no line to put a <br> after; keep the
    # node in place instead of extracting it and inserting nothing.
    if not any(line.split() for line in lines):
        logging.debug("Skipping whitespace-only text node")
        return

    # The sibling may be missing or may be an object without a ``name``
    # attribute, so read the attribute defensively.
    if getattr(node.next_sibling, "name", None) == "br":
        logging.debug(
            "FollowingSibling is a break, br not needed: %s", lines[0]
        )
        return

    position = parent.index(node)
    logging.debug("Index: %s", position)

    # Remove the node from the parent.  It is now detached from the primary
    # soup tree, so it must not be referenced below.
    node.extract()

    for line in lines:
        if not line.split():
            # Blank / whitespace-only line: nothing to emit.
            continue
        logging.debug("Adding token, br, newline: %s", line)
        parent.insert(position, line)
        parent.insert(position + 1, soup.new_tag("br"))
        parent.insert(position + 2, "\n")
        position += 3
def main():
    """Add explicit ``<br>`` tags to the span text of ``license_tidy.html``.

    Reads ``license_tidy.html`` from the current working directory, hands
    every non-empty text node that is a direct child of a ``<span>`` element
    to ``modify_node``, and writes the resulting document to ``output.html``.
    Only ``<span>`` elements are processed.
    """
    filename = "license_tidy.html"
    logging.basicConfig(level=logging.INFO)

    with open(filename, "r", encoding="utf-8") as f_handle:
        # Name the parser explicitly so the resulting tree does not depend on
        # which optional parsers happen to be installed on this machine.
        f_soup = BeautifulSoup(f_handle, "html.parser")

    for span_ in f_soup.find_all(name="span"):
        logging.debug("Found span")
        # Snapshot the current children: modify_node() inserts new children
        # into this span, so we MUST NOT iterate over the live view.
        children = list(span_.children)
        logging.debug("Span children: %s", len(children))
        for child_ in children:
            if isinstance(child_, NavigableString) and child_.string:
                logging.debug("Modifying text node child")
                modify_node(f_soup, child_)

    with open("output.html", "w", encoding="utf-8") as out:
        out.write(f_soup.decode(formatter="html"))
# Run the conversion only when executed as a script, so importing this module
# (e.g. to reuse modify_node) does not read or write any files.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment