# csvoss/extract.py

## extract.py
"""
Parse out text, links, images, and more from an HTML file.

Modified from extract.py in https://github.com/fephsun/dialup.

For example:
    import extract
    e = extract.ParsedWebpage("http://en.wikipedia.org/wiki/Frog")
    print e.title
    print e.text
"""

import copy
import json
import re
from urlparse import urljoin

import bs4
import requests

class ParsedWebpage(object):
    """Fetch a web page and reduce it to a title and human-readable text.

    The page at ``url`` is downloaded with ``requests`` and parsed with
    Beautiful Soup's ``html.parser`` backend.  ``<script>``, ``<style>`` and
    ``<form>`` elements, comments and the doctype are removed, and every
    ``<img>`` is replaced by a short sentence built from its ``alt`` text.

    Attributes:
        url: The URL that was requested.
        html: The response body exactly as returned by ``response.text``.
        soup: ``bs4.BeautifulSoup`` tree of the cleaned-up page.
        title: ``.string`` of the page's ``<title>`` element, or ``None``
            when the page has no ``<title>`` element.
        text: Every text node left in ``soup``, joined with single spaces.
    """

    # Elements whose contents should never be read out to the user.
    _UNWANTED_TAGS = ("script", "style", "form")

    # Fallback patterns applied to the serialised tree.  Comment nodes are
    # already removed through the parse tree; these catch whatever is still
    # present in the markup.  DOTALL lets a comment spanning several lines
    # match as well.
    _COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
    _DOCTYPE_RE = re.compile(r"<!DOCTYPE[^>]*>")

    def __init__(self, url):
        """Download ``url`` and populate ``html``, ``soup``, ``title``, ``text``.

        Args:
            url: Address of the page to fetch.

        Raises:
            Any exception raised by ``requests.get`` or by the parser is
            propagated unchanged; nothing is caught here.
        """
        self.url = url

        # Raw HTML
        response = requests.get(url)
        self.html = response.text

        self.soup = bs4.BeautifulSoup(self.html, "html.parser")

        # Delete <script>, <style> and <form> tags, comments, and <!DOCTYPE>.
        # The original author found that a single pass leaves some "sticky"
        # cases behind, so the clean-up is deliberately run twice.
        for _ in range(2):
            self.soup = self._strip_unreadable(self.soup)

        # This should be something acceptable to read to the user as the
        # webpage's title.  A page without a <title> element yields None
        # instead of raising AttributeError.
        title_tag = self.soup.title
        self.title = title_tag.string if title_tag is not None else None

        # Replace images with descriptions of those images.
        self.soup = self._describe_images(self.soup)

        # This should be the human-readable text of the page.
        self.text = ' '.join(self.soup.find_all(text=True))

    @classmethod
    def _strip_unreadable(cls, soup):
        """Return a re-parsed copy of ``soup`` without non-readable content.

        ``soup`` is modified in place (the unwanted nodes are extracted)
        before it is serialised, filtered and parsed again.
        """
        for tag_name in cls._UNWANTED_TAGS:
            for tag in soup.find_all(tag_name):
                tag.extract()

        is_comment = lambda node: isinstance(node, bs4.Comment)
        for comment in soup.find_all(text=is_comment):
            comment.extract()

        # soup.decode() is the unicode serialisation of the tree; it works
        # on both Python 2 and Python 3.
        markup = cls._COMMENT_RE.sub("", soup.decode())
        markup = cls._DOCTYPE_RE.sub("", markup)
        return bs4.BeautifulSoup(markup, "html.parser")

    @staticmethod
    def _describe_image(alt):
        """Return the sentence that stands in for an image with ``alt`` text.

        A missing or empty ``alt`` gives the generic " An image. ".
        """
        description = " An image"
        if alt:
            description += " of %s" % alt
        return description + ". "

    @classmethod
    def _describe_images(cls, soup):
        """Return a re-parsed copy of ``soup`` with each <img> described.

        The replacement is made on the parse tree rather than by editing the
        serialised HTML, so alt text containing ``<`` or ``&`` is escaped on
        output instead of being re-interpreted as markup.
        """
        for img in soup.find_all("img"):
            replacement = bs4.NavigableString(cls._describe_image(img.get("alt")))
            img.replace_with(replacement)

        # Parsing the serialised tree again merges each description with the
        # text around it, so it does not become a separate text node.
        return bs4.BeautifulSoup(soup.decode(), "html.parser")