Skip to content

Instantly share code, notes, and snippets.

@randomradio
Last active July 9, 2018 05:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save randomradio/153161b135707288714dffa01b0046db to your computer and use it in GitHub Desktop.
Save randomradio/153161b135707288714dffa01b0046db to your computer and use it in GitHub Desktop.
Google document cleanup
import bs4
import cssutils
import logging
import urlparse
class HTML_cleaner(object):
    """Clean up HTML exported from a Google document.

    The markup is parsed into a BeautifulSoup tree stored on ``self.soup``;
    every cleanup method below modifies that tree in place.
    """

    # Class-level defaults so the attributes exist even before __init__ or
    # collect_styles() rebinds them on the instance.
    soup = None
    highlighting_selectors = {}

    def __init__(self, html_str, parser="html.parser"):
        """Parse *html_str* with the given BeautifulSoup *parser*."""
        self.soup = bs4.BeautifulSoup(html_str, parser)
        self.highlighting_selectors = {}

    def clean_up(self, steps=None):
        """Clean up the soup with the given steps and return the cleaned soup.

        kwargs:
        steps -- names of the cleanup methods to run, in order. Each one is
                 called without arguments. If empty or None, the default
                 pipeline runs through all steps.
        """
        if not steps:
            self.collect_styles()
            self.remove_unused_classes(
                preserved=self.highlighting_selectors.keys())
            self.remove_empty_elements()
            self.fix_heading_strongs()
            self.unwrap_valina_spans()
            self.remove_link_redirects()
            return self.soup

        for step in steps:
            # Look the step up on the class (not the instance) so that data
            # attributes such as ``soup`` can never be mistaken for a step.
            handler = None
            if not step.startswith('_'):
                handler = getattr(type(self), step, None)
            if not callable(handler):
                logging.warning("Step not found")
                continue
            # Run the step; the steps mutate self.soup in place, so their
            # return value must not replace the tree.
            handler(self)
        return self.soup

    def collect_styles(self):
        """Find highlight rules in the document's style sheets and remove the style tags.

        A rule counts as a highlight when its only declaration is
        ``background-color``. Returns (and stores on
        ``self.highlighting_selectors``) a dictionary mapping the selector
        text, e.g. ``.c3``, to the colour value.
        Style tags without a single string body are skipped and left in place.
        """
        selectors = {}
        for style_tag in self.soup.find_all('style', type='text/css'):
            if not style_tag.string:
                continue
            sheet = cssutils.parseString(style_tag.string)
            for rule in sheet:
                if rule.type != cssutils.css.CSSRule.STYLE_RULE:
                    continue
                if rule.style.length != 1:
                    continue
                # list() keeps the indexing valid whether keys() returns a
                # list or a view.
                property_name = list(rule.style.keys())[0]
                if property_name != 'background-color':
                    continue
                color = rule.style.getProperty(property_name).value
                for selector in rule.selectorList:
                    if ',' not in selector.selectorText:
                        selectors[selector.selectorText] = color
            style_tag.decompose()
        self.highlighting_selectors = selectors
        return selectors

    def remove_ids(self):
        """Placeholder step; currently does nothing."""
        pass

    def remove_unused_classes(self, preserved=None):
        """Drop the ``class`` attribute from tags that use no preserved class.

        kwargs:
        preserved -- iterable of selectors in ``.name`` form (as produced by
                     collect_styles). A tag keeps its whole class attribute
                     when at least one of its classes is listed here.
        """
        keep = set(preserved or ())
        for tag in self.soup.find_all(True):
            if not tag.has_attr('class'):
                continue
            if not any('.%s' % name in keep for name in tag['class']):
                del tag['class']

    def remove_empty_elements(self):
        """Detach every element whose text is empty or whitespace only."""
        for el in self.soup.find_all(True):
            # An element without contents also has no text, so this single
            # check covers childless void tags as well.
            if not el.get_text().strip():
                el.extract()

    def unwrap_valina_spans(self):
        """Unwrap plain ("vanilla") spans, i.e. spans without a class attribute."""
        for span in self.soup.find_all('span'):
            if not span.has_attr('class'):
                span.unwrap()

    def remove_link_redirects(self):
        """Point Google redirect links straight at their target URL.

        Anchors without an href attribute are left untouched.
        """
        for anchor in self.soup.find_all('a', href=True):
            anchor['href'] = self._resolve_redirect(anchor['href'])

    @staticmethod
    def _resolve_redirect(href):
        """Return the ``q`` target of a Google redirect URL, or *href* unchanged."""
        if not href.startswith('https://www.google.com/url?q='):
            return href
        # Imported here so the helper works on both Python 3 and Python 2.
        try:
            from urllib import parse as url_lib
        except ImportError:
            import urlparse as url_lib
        # parse_qs drops blank values, so a bare "?q=" yields no 'q' entry.
        targets = url_lib.parse_qs(url_lib.urlparse(href).query).get('q')
        return targets[0] if targets else href

    def fix_heading_strongs(self):
        """If a strong tag is a heading's only child, strip it to rely on the CSS heading style for consistent font weight."""
        headings = self.soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        for heading in headings:
            # Doubly nested strong tags often appear, so keep unwrapping
            # until the heading's only child is no longer a strong tag.
            while (len(heading.contents) == 1
                   and getattr(heading.contents[0], 'name', None) == 'strong'):
                heading.contents[0].unwrap()
import bs4
import cssutils
import urlparse
import logging
from HTMLParser import HTMLParser
# Tag names that the empty-tag filter keeps even when they contain no text.
ALLOWED_EMPTY_TAGS = ['td', 'br']
class BaseSanitizer(object):
    """Sanitize exported document HTML held in a BeautifulSoup tree.

    The markup is parsed once in __init__; sanitize() then runs the cleanup
    passes, each of which modifies ``self.soup`` in place.
    """

    def __init__(self, raw_html):
        """Normalize *raw_html* (None is treated as empty) and parse it."""
        self.soup = bs4.BeautifulSoup(
            self._normalize_markup(raw_html), 'html.parser')

    @staticmethod
    def _normalize_markup(raw_html):
        """Return *raw_html* with non-breaking spaces and bare <br> tags normalized."""
        if raw_html is None:
            return ''
        # NOTE(review): the previous code replaced a plain space with a plain
        # space (a no-op); this assumes a non-breaking space was intended.
        raw_html = raw_html.replace('&nbsp;', ' ')
        if not isinstance(raw_html, bytes):
            # Only for text input: mixing a unicode literal into a Python 2
            # byte string could trigger an implicit decode error.
            raw_html = raw_html.replace(u'\u00a0', u' ')
        return raw_html.replace('<br>', '<br/>')

    def sanitize(self):
        """Run every cleanup pass in order and return the cleaned soup."""
        self.strip_styles()
        self.strip_unused_spans()
        self.strip_comments()
        self.filter_html_remove_empty_tags()
        self.remove_element_ids()
        self.remove_element_classes()
        self.remove_link_redirects()
        self.fix_heading_strongs()
        return self.soup

    def remove_link_redirects(self):
        """Point Google redirect links straight at their target URL.

        Anchors without an href attribute are left untouched.
        """
        for anchor in self.soup.find_all('a', href=True):
            anchor['href'] = self._resolve_redirect(anchor['href'])

    @staticmethod
    def _resolve_redirect(href):
        """Return the ``q`` target of a Google redirect URL, or *href* unchanged."""
        if not href.startswith('https://www.google.com/url?q='):
            return href
        # Imported here so the helper works on both Python 3 and Python 2.
        try:
            from urllib import parse as url_lib
        except ImportError:
            import urlparse as url_lib
        # parse_qs drops blank values, so a bare "?q=" yields no 'q' entry.
        targets = url_lib.parse_qs(url_lib.urlparse(href).query).get('q')
        return targets[0] if targets else href

    def fix_heading_strongs(self):
        """If a strong tag is a heading's only child, strip it to rely on the CSS heading style for consistent font weight."""
        headings = self.soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        for heading in headings:
            # Doubly nested strong tags often appear, so keep unwrapping
            # until the heading's only child is no longer a strong tag.
            while (len(heading.contents) == 1
                   and getattr(heading.contents[0], 'name', None) == 'strong'):
                heading.contents[0].unwrap()

    def strip_unused_spans(self):
        """Remove every span: destroy those without text, unwrap the rest."""
        # Re-query after each change instead of walking a snapshot, so a span
        # destroyed together with its parent is never touched again. Every
        # pass removes one span, so the loop terminates.
        span = self.soup.find('span')
        while span is not None:
            if span.get_text().strip():
                span.unwrap()
            else:
                span.decompose()
            span = self.soup.find('span')

    def strip_comments(self):
        """Destroy every div and sup element together with its contents."""
        for tag_name in ('div', 'sup'):
            # Re-query after each removal so nested elements that were already
            # destroyed with their ancestor are never visited.
            element = self.soup.find(tag_name)
            while element is not None:
                element.decompose()
                element = self.soup.find(tag_name)

    def strip_styles(self):
        """Turn bold/italic styling into strong/em tags and drop inline styles.

        Classes whose style rule sets ``font-weight: 700`` or
        ``font-style: italic`` are collected from the text/css style sheets
        (selectors starting with ``h`` are skipped as headers). Each element
        that is bold or italic, by class or by inline style, is renamed to
        strong/em and wrapped in a new tag carrying its original name. The
        inline ``style`` attribute is removed from every element.
        The style tags themselves are not removed by this method.
        """
        bold_classes = set()
        italic_classes = set()
        for style_tag in self.soup.find_all('style', type='text/css'):
            if not style_tag.string:
                continue
            for rule in cssutils.parseString(style_tag.string):
                if rule.type != rule.STYLE_RULE:
                    continue
                selector = rule.selectorText.replace('.', '')
                is_header_rule = selector.startswith('h')
                for declaration in rule.style:
                    if (declaration.name == 'font-weight'
                            and declaration.value == '700'):
                        if not is_header_rule:
                            bold_classes.add(selector)
                        break
                    if (declaration.name == 'font-style'
                            and declaration.value == 'italic'):
                        if not is_header_rule:
                            italic_classes.add(selector)
                        break

        for el in self.soup.find_all(True):
            # Drop whitespace so "font-weight: 700" matches like
            # "font-weight:700".
            compact_style = ''.join((el.get('style') or '').split())
            classes = set(el.get('class') or ())
            original_name = el.name
            if 'font-weight:700' in compact_style or classes & bold_classes:
                el.name = 'strong'
                el.wrap(self.soup.new_tag(original_name))
            elif ('font-style:italic' in compact_style
                  or classes & italic_classes):
                el.name = 'em'
                el.wrap(self.soup.new_tag(original_name))
            del el['style']

    def filter_html_remove_empty_tags(self):
        """Destroy elements that have no text and no children.

        Tags listed in ALLOWED_EMPTY_TAGS are never destroyed; when they have
        no text their content is replaced by a single BEL character
        (``'\\007'``) instead.
        """
        # Removing an empty child sometimes creates an empty parent.
        # Repeat until a pass removes nothing.
        removed_any = True
        while removed_any:
            removed_any = False
            for element in self.soup.find_all(True):
                if element.get_text().strip():
                    continue
                if element.name in ALLOWED_EMPTY_TAGS:
                    element.string = '\007'
                elif not any(element.contents):
                    element.decompose()
                    removed_any = True

    def remove_element_ids(self):
        """Remove the id attribute from every element, including empty ids."""
        for node in self.soup.find_all(True):
            if node.has_attr('id'):
                del node['id']

    def remove_element_classes(self):
        """Remove the class attribute from every element."""
        for tag in self.soup.find_all(True):
            # Delete the HTML attribute itself; setattr() on the tag object
            # only creates a Python attribute and leaves the markup unchanged.
            if tag.has_attr('class'):
                del tag['class']

    def remove_trailing_brs(self):
        """Unwrap br elements that are nested inside another br element.

        NOTE(review): despite the name, nothing here checks whether a br is
        in trailing position -- confirm the intended behavior.
        """
        for outer_br in self.soup.find_all('br'):
            for nested_br in outer_br.find_all('br'):
                nested_br.unwrap()
if __name__ == '__main__':
    # Manual smoke test: sanitize the sample document and print the result.
    # The context manager closes the file even if reading fails.
    with open("./renderer_test_data/summary.html") as html_file:
        doc = html_file.read()
    sanitizer = BaseSanitizer(doc)
    clean_soup = sanitizer.sanitize()
    # Call form works on both Python 2 and Python 3 for a single argument.
    print(clean_soup)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment