randomradio/cleaner_v2.py

## cleaner_v2.py
import bs4
import cssutils
import logging
import urlparse


class HTML_cleaner(object):
    """Clean up an HTML document held in a BeautifulSoup tree.

    The document is parsed once in ``__init__``; every cleaning step mutates
    ``self.soup`` in place. ``clean_up`` runs either the default pipeline or
    a caller-supplied list of step names.
    """

    # Class-level defaults kept for backward compatibility; __init__ gives
    # every instance its own values.
    soup = None
    highlighting_selectors = {}

    def __init__(self, html_str, parser="html.parser"):
        """Parse ``html_str`` with the given BeautifulSoup ``parser``."""
        self.soup = bs4.BeautifulSoup(html_str, parser)
        # Per-instance dict so instances never share the class-level one.
        self.highlighting_selectors = {}

    def clean_up(self, steps=None):
        """Clean up the soup with the given steps and return it.

        kwargs:
        steps -- iterable of method names to run in order; when empty or
                 None the default pipeline is run.

        A name that does not resolve to a callable attribute of this
        instance is skipped with a logged warning.
        """
        if not steps:
            self.collect_styles()
            self.remove_unused_classes(
                preserved=list(self.highlighting_selectors))
            self.remove_empty_elements()
            self.fix_heading_strongs()
            self.unwrap_valina_spans()
            self.remove_link_redirects()
            return self.soup

        for step in steps:
            # Look the step up on the instance and call it. The lookup is
            # kept separate from the call so that an AttributeError raised
            # inside a step is not mistaken for a missing step.
            method = getattr(self, step, None)
            if not callable(method):
                logging.warning("Step not found")
                continue
            method()

        return self.soup

    def collect_styles(self):
        """Collect highlight colours from <style> tags, then remove the tags.

        A selector is recorded when its rule declares exactly one property
        and that property is ``background-color``. Only
        ``<style type="text/css">`` tags with a non-empty ``.string`` are
        parsed and decomposed; other style tags are left in the tree.

        Returns a dict mapping selector text (e.g. ``'.c3'``) to the
        declared colour value; the same dict is stored on
        ``self.highlighting_selectors``.
        """
        highlighting_selectors = {}

        for style_tag in self.soup.find_all('style', type='text/css'):
            if not style_tag.string:
                continue

            sheet = cssutils.parseString(style_tag.string)
            for rule in sheet:
                if rule.type != cssutils.css.CSSRule.STYLE_RULE:
                    continue
                # These checks do not depend on the selector, so they are
                # done once per rule.
                if rule.style.length != 1:
                    continue
                property_name = list(rule.style.keys())[0]
                if property_name != 'background-color':
                    continue
                colour = rule.style.getProperty(property_name).value

                for selector in rule.selectorList:
                    selector_text = selector.selectorText
                    if ',' in selector_text:
                        continue
                    highlighting_selectors[selector_text] = colour

            style_tag.decompose()

        self.highlighting_selectors = highlighting_selectors
        return highlighting_selectors

    def remove_ids(self):
        """Placeholder step; currently does nothing."""
        pass

    def remove_unused_classes(self, preserved=None):
        """Drop the ``class`` attribute from tags that use no preserved class.

        kwargs:
        preserved -- iterable of class selectors *including* the leading dot
                     (e.g. ``'.c3'``), as produced by ``collect_styles``.

        A tag with at least one preserved class keeps its whole ``class``
        attribute, including any non-preserved names.
        """
        keep = set(preserved or ())
        for tag in self.soup.find_all(True):
            tag_classes = tag.attrs.get('class')
            if tag_classes is None:
                continue
            if not any('.%s' % name in keep for name in tag_classes):
                del tag['class']

    def remove_empty_elements(self):
        """Detach every element whose text is empty or whitespace-only.

        This includes void elements such as ``<br>`` and ``<img>``, which
        never carry text.
        """
        for el in self.soup.find_all():
            if el.is_empty_element or not el.get_text().strip():
                el.extract()

    def unwrap_valina_spans(self):
        """Replace every <span> without a ``class`` attribute by its children."""
        for span in self.soup.find_all('span'):
            if 'class' not in span.attrs:
                span.unwrap()

    def remove_link_redirects(self):
        """Rewrite ``https://www.google.com/url?q=...`` links to their target.

        Anchors without an ``href`` and redirect links whose ``q`` parameter
        is missing or blank are left untouched.
        """
        redirect_prefix = 'https://www.google.com/url?q='
        for a in self.soup.find_all('a'):
            href = a.get('href')
            # <a name="..."> style anchors have no href at all.
            if not href or not href.startswith(redirect_prefix):
                continue
            query = urlparse.urlparse(href).query
            # parse_qs omits blank values, so 'q' may be absent.
            targets = urlparse.parse_qs(query).get('q')
            if targets:
                a['href'] = targets[0]

    def fix_heading_strongs(self):
        """If a strong tag is a heading's only child, strip it to rely on the CSS heading style for consistent font weight."""
        headings = self.soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

        for heading in headings:
            # Doubly nested strong tags often appear, so keep unwrapping
            # until the only child is no longer a <strong>.
            while True:
                children = heading.contents
                if len(children) != 1:
                    break
                if getattr(children[0], 'name', None) != 'strong':
                    break
                children[0].unwrap()

## doc_cleaner.py
import bs4
import cssutils
import urlparse
import logging
from HTMLParser import HTMLParser

# Tag names that filter_html_remove_empty_tags keeps even when they hold no
# text; such tags get a '\007' placeholder string instead of being removed.
ALLOWED_EMPTY_TAGS = ['td', 'br']

class BaseSanitizer(object):
    """Reduce an HTML document to bare structural markup.

    The document is parsed once in ``__init__``; every step mutates
    ``self.soup`` in place. ``sanitize`` runs the full pipeline and returns
    the soup.
    """

    def __init__(self, raw_html):
        """Parse ``raw_html`` into ``self.soup`` using 'html.parser'.

        ``None`` is treated as an empty document. Before parsing, every
        ``&nbsp;`` entity is replaced by a space and every ``<br>`` by
        ``<br/>``.
        """
        raw_html = '' if raw_html is None else raw_html
        raw_html = raw_html.replace('&nbsp;', ' ')
        raw_html = raw_html.replace('<br>', '<br/>')

        self.soup = bs4.BeautifulSoup(raw_html, 'html.parser')

    def sanitize(self):
        """Run every sanitizing step in order and return the soup.

        Order matters: ``strip_styles`` reads ``class`` and ``style``
        attributes, so it runs before the steps that unwrap spans and remove
        classes.
        """
        self.strip_styles()
        self.strip_unused_spans()
        self.strip_comments()
        self.filter_html_remove_empty_tags()
        self.remove_element_ids()
        self.remove_element_classes()
        self.remove_link_redirects()
        self.fix_heading_strongs()
        return self.soup

    def remove_link_redirects(self):
        """Rewrite ``https://www.google.com/url?q=...`` links to their target.

        Anchors without an ``href`` and redirect links whose ``q`` parameter
        is missing or blank are left untouched.
        """
        redirect_prefix = 'https://www.google.com/url?q='
        for a in self.soup.find_all('a'):
            href = a.get('href')
            # <a name="..."> style anchors have no href at all.
            if not href or not href.startswith(redirect_prefix):
                continue
            query = urlparse.urlparse(href).query
            # parse_qs omits blank values, so 'q' may be absent.
            targets = urlparse.parse_qs(query).get('q')
            if targets:
                a['href'] = targets[0]

    def fix_heading_strongs(self):
        """If a strong tag is a heading's only child, strip it to rely on the CSS heading style for consistent font weight."""
        headings = self.soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

        for heading in headings:
            # Doubly nested strong tags often appear, so keep unwrapping
            # until the only child is no longer a <strong>.
            while True:
                children = heading.contents
                if len(children) != 1:
                    break
                if getattr(children[0], 'name', None) != 'strong':
                    break
                children[0].unwrap()

    def strip_unused_spans(self):
        """Remove every <span>: drop text-less ones, unwrap the rest."""
        # Reverse document order handles nested spans before their
        # ancestors, so a span destroyed together with its parent is never
        # visited afterwards.
        for span in reversed(self.soup.find_all('span')):
            if span.get_text().strip():
                span.unwrap()
            else:
                span.decompose()

    def strip_comments(self):
        """Destroy every <div> and every <sup>, together with their contents."""
        # Reverse document order: nested matches are destroyed before their
        # ancestors, so no already-decomposed node is touched again.
        for div in reversed(self.soup.find_all('div')):
            div.decompose()

        for sup in reversed(self.soup.find_all('sup')):
            sup.decompose()

    @staticmethod
    def _selector_names(rule):
        """Return the rule's selectors with dots removed, minus 'h...' ones.

        Each selector of a grouped rule (``.c1, .c2``) is reported
        separately. Any selector starting with ``h`` is skipped; this is how
        heading selectors (h1..h6) are excluded.
        """
        names = []
        for selector in rule.selectorList:
            name = selector.selectorText.strip().replace('.', '')
            if name[0:1] != 'h':
                names.append(name)
        return names

    def _wrap_contents(self, el, tag_name):
        """Move all children of ``el`` into a new ``tag_name`` child of ``el``.

        ``el`` keeps its own name and attributes (href, colspan, ...); only
        its contents end up inside the new tag.
        """
        wrapper = self.soup.new_tag(tag_name)
        # Copy the child list first: appending re-parents each child and
        # mutates el.contents while we iterate.
        for child in list(el.contents):
            wrapper.append(child)
        el.append(wrapper)

    def strip_styles(self):
        """Turn bold/italic styling into <strong>/<em> and drop ``style`` attrs.

        Bold and italic class names are collected from
        ``<style type="text/css">`` sheets (``font-weight: 700`` and
        ``font-style: italic`` rules). An element counts as bold or italic
        when it carries such a class or when its inline style contains
        ``font-weight:700`` / ``font-style:italic``. Bold takes precedence:
        an element that is both only gets <strong>. Finally the ``style``
        attribute is removed from every element. The <style> tags themselves
        are not removed here.
        """
        bold_selector_set = set()
        italic_selector_set = set()

        for style_tag in self.soup.find_all('style', type='text/css'):
            if not style_tag.string:
                continue
            sheet = cssutils.parseString(style_tag.string)
            for rule in sheet:
                if rule.type != rule.STYLE_RULE:
                    continue
                for prop in rule.style:
                    # The first bold or italic declaration decides the rule;
                    # later declarations of the same rule are not examined.
                    if prop.name == 'font-weight' and prop.value == '700':
                        bold_selector_set.update(self._selector_names(rule))
                        break
                    if prop.name == 'font-style' and prop.value == 'italic':
                        italic_selector_set.update(self._selector_names(rule))
                        break

        for el in self.soup.find_all():
            style = el.get('style')
            class_set = set(el.get('class') or [])

            if (style and 'font-weight:700' in style) or \
                    class_set.intersection(bold_selector_set):
                self._wrap_contents(el, 'strong')
            elif (style and 'font-style:italic' in style) or \
                    class_set.intersection(italic_selector_set):
                self._wrap_contents(el, 'em')

            del el['style']

    def filter_html_remove_empty_tags(self):
        """Remove elements that have neither text nor children.

        Tags listed in ALLOWED_EMPTY_TAGS are never removed; when their text
        is empty their content is replaced by a single '\\007' placeholder
        character. An element whose text is whitespace-only but which still
        has non-empty children is kept.
        """
        # Removing an empty child sometimes creates an empty parent
        # Repeat until no results found
        stripped = True
        while stripped:
            stripped = False
            for element in self.soup.find_all():
                if element.get_text().strip():
                    continue
                # Empty strings are falsy and therefore ignored here.
                contents = [c for c in element.contents if c]
                if element.name in ALLOWED_EMPTY_TAGS:
                    element.string = '\007'
                elif not contents:
                    element.decompose()
                    stripped = True

    def remove_element_ids(self):
        """Delete every non-empty ``id`` attribute in the document."""
        for node in self.soup.find_all(True):
            if node.get('id'):
                del node['id']

    def remove_element_classes(self):
        """Delete the ``class`` attribute from every element."""
        for tag in self.soup.find_all(True):
            # Item deletion edits the HTML attribute; setattr() would only
            # create a Python attribute on the Tag object.
            del tag['class']

    def remove_trailing_brs(self):
        """Unwrap every <br> that is nested inside another <br>.

        NOTE(review): despite the name, this does not look at whether a <br>
        is trailing; confirm the intended behaviour before relying on it.
        """
        for node in self.soup.find_all('br'):
            for node_in_node in node.find_all('br'):
                node_in_node.unwrap()


if __name__ == '__main__':
    # Manual smoke test: sanitize the sample document and print the result.
    # The context manager closes the file even if reading fails.
    with open("./renderer_test_data/summary.html") as html_file:
        doc = html_file.read()
    sanitizer = BaseSanitizer(doc)
    clean_soup = sanitizer.sanitize()
    # Parenthesised single-argument print gives the same output on Python 2
    # and Python 3.
    print(clean_soup)
	import bs4
	import cssutils
	import logging
	import urlparse


	class HTML_cleaner(object):
	    """Clean up an HTML document held in a BeautifulSoup tree.

	    The document is parsed once in ``__init__``; every cleaning step mutates
	    ``self.soup`` in place. ``clean_up`` runs either the default pipeline or
	    a caller-supplied list of step names.
	    """

	    # Class-level defaults kept for backward compatibility; __init__ gives
	    # every instance its own values.
	    soup = None
	    highlighting_selectors = {}

	    def __init__(self, html_str, parser="html.parser"):
	        """Parse ``html_str`` with the given BeautifulSoup ``parser``."""
	        self.soup = bs4.BeautifulSoup(html_str, parser)
	        # Per-instance dict so instances never share the class-level one.
	        self.highlighting_selectors = {}

	    def clean_up(self, steps=None):
	        """Clean up the soup with the given steps and return it.

	        kwargs:
	        steps -- iterable of method names to run in order; when empty or
	                 None the default pipeline is run.

	        A name that does not resolve to a callable attribute of this
	        instance is skipped with a logged warning.
	        """
	        if not steps:
	            self.collect_styles()
	            self.remove_unused_classes(
	                preserved=list(self.highlighting_selectors))
	            self.remove_empty_elements()
	            self.fix_heading_strongs()
	            self.unwrap_valina_spans()
	            self.remove_link_redirects()
	            return self.soup

	        for step in steps:
	            # Look the step up on the instance and call it. The lookup is
	            # kept separate from the call so that an AttributeError raised
	            # inside a step is not mistaken for a missing step.
	            method = getattr(self, step, None)
	            if not callable(method):
	                logging.warning("Step not found")
	                continue
	            method()

	        return self.soup

	    def collect_styles(self):
	        """Collect highlight colours from <style> tags, then remove the tags.

	        A selector is recorded when its rule declares exactly one property
	        and that property is ``background-color``. Only
	        ``<style type="text/css">`` tags with a non-empty ``.string`` are
	        parsed and decomposed; other style tags are left in the tree.

	        Returns a dict mapping selector text (e.g. ``'.c3'``) to the
	        declared colour value; the same dict is stored on
	        ``self.highlighting_selectors``.
	        """
	        highlighting_selectors = {}

	        for style_tag in self.soup.find_all('style', type='text/css'):
	            if not style_tag.string:
	                continue

	            sheet = cssutils.parseString(style_tag.string)
	            for rule in sheet:
	                if rule.type != cssutils.css.CSSRule.STYLE_RULE:
	                    continue
	                # These checks do not depend on the selector, so they are
	                # done once per rule.
	                if rule.style.length != 1:
	                    continue
	                property_name = list(rule.style.keys())[0]
	                if property_name != 'background-color':
	                    continue
	                colour = rule.style.getProperty(property_name).value

	                for selector in rule.selectorList:
	                    selector_text = selector.selectorText
	                    if ',' in selector_text:
	                        continue
	                    highlighting_selectors[selector_text] = colour

	            style_tag.decompose()

	        self.highlighting_selectors = highlighting_selectors
	        return highlighting_selectors

	    def remove_ids(self):
	        """Placeholder step; currently does nothing."""
	        pass

	    def remove_unused_classes(self, preserved=None):
	        """Drop the ``class`` attribute from tags that use no preserved class.

	        kwargs:
	        preserved -- iterable of class selectors *including* the leading dot
	                     (e.g. ``'.c3'``), as produced by ``collect_styles``.

	        A tag with at least one preserved class keeps its whole ``class``
	        attribute, including any non-preserved names.
	        """
	        keep = set(preserved or ())
	        for tag in self.soup.find_all(True):
	            tag_classes = tag.attrs.get('class')
	            if tag_classes is None:
	                continue
	            if not any('.%s' % name in keep for name in tag_classes):
	                del tag['class']

	    def remove_empty_elements(self):
	        """Detach every element whose text is empty or whitespace-only.

	        This includes void elements such as ``<br>`` and ``<img>``, which
	        never carry text.
	        """
	        for el in self.soup.find_all():
	            if el.is_empty_element or not el.get_text().strip():
	                el.extract()

	    def unwrap_valina_spans(self):
	        """Replace every <span> without a ``class`` attribute by its children."""
	        for span in self.soup.find_all('span'):
	            if 'class' not in span.attrs:
	                span.unwrap()

	    def remove_link_redirects(self):
	        """Rewrite ``https://www.google.com/url?q=...`` links to their target.

	        Anchors without an ``href`` and redirect links whose ``q`` parameter
	        is missing or blank are left untouched.
	        """
	        redirect_prefix = 'https://www.google.com/url?q='
	        for a in self.soup.find_all('a'):
	            href = a.get('href')
	            # <a name="..."> style anchors have no href at all.
	            if not href or not href.startswith(redirect_prefix):
	                continue
	            query = urlparse.urlparse(href).query
	            # parse_qs omits blank values, so 'q' may be absent.
	            targets = urlparse.parse_qs(query).get('q')
	            if targets:
	                a['href'] = targets[0]

	    def fix_heading_strongs(self):
	        """If a strong tag is a heading's only child, strip it to rely on the CSS heading style for consistent font weight."""
	        headings = self.soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

	        for heading in headings:
	            # Doubly nested strong tags often appear, so keep unwrapping
	            # until the only child is no longer a <strong>.
	            while True:
	                children = heading.contents
	                if len(children) != 1:
	                    break
	                if getattr(children[0], 'name', None) != 'strong':
	                    break
	                children[0].unwrap()