# codekiln/beautiful_soup_utils.py

## beautiful_soup_utils.py
from bs4 import BeautifulSoup

def get_text_from_html(html_str):
    r"""
    Return the text content of an HTML string with whitespace collapsed.

    Content pasted from Word often carries <style> tags whose bodies
    contain HTML comments; bsoup 4's get_text() does not skip over those,
    so every <style> element is removed before the text is extracted.

    Adjacent whitespace is then truncated to one character and
    leading/trailing whitespace is dropped;
    \r\n[space][tab][space][space] between two words would become [space].

    :param html_str: string of html; ``None`` or an empty string is
        treated as an empty document
    :return: text string in which every whitespace run is a single
        space, or "" when there is no text
    """
    if not html_str:
        # Nothing to parse; also avoids handing None to BeautifulSoup.
        return ""

    # Name the parser explicitly: without it bs4 picks whichever parser
    # happens to be installed (and warns), so results could differ
    # between environments.
    soup = BeautifulSoup(html_str, "html.parser")

    # The removed tags are never used again, so destroy them outright.
    for style in soup.find_all("style"):
        style.decompose()

    # split() with no arguments splits on any whitespace run and drops
    # empty pieces, so whitespace-only text collapses to "".
    return " ".join(soup.get_text().split())
	from bs4 import BeautifulSoup

	def get_text_from_html(html_str):
	    r"""
	    Return the text content of an HTML string with whitespace collapsed.

	    Content pasted from Word often carries <style> tags whose bodies
	    contain HTML comments; bsoup 4's get_text() does not skip over those,
	    so every <style> element is removed before the text is extracted.

	    Adjacent whitespace is then truncated to one character and
	    leading/trailing whitespace is dropped;
	    \r\n[space][tab][space][space] between two words would become [space].

	    :param html_str: string of html; ``None`` or an empty string is
	        treated as an empty document
	    :return: text string in which every whitespace run is a single
	        space, or "" when there is no text
	    """
	    if not html_str:
	        # Nothing to parse; also avoids handing None to BeautifulSoup.
	        return ""

	    # Name the parser explicitly: without it bs4 picks whichever parser
	    # happens to be installed (and warns), so results could differ
	    # between environments.
	    soup = BeautifulSoup(html_str, "html.parser")

	    # The removed tags are never used again, so destroy them outright.
	    for style in soup.find_all("style"):
	        style.decompose()

	    # split() with no arguments splits on any whitespace run and drops
	    # empty pieces, so whitespace-only text collapses to "".
	    return " ".join(soup.get_text().split())