Skip to content

Instantly share code, notes, and snippets.

@csvoss
Created November 6, 2015 04:09
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save csvoss/c5b90daf5d4dfa6b300b to your computer and use it in GitHub Desktop.
Save csvoss/c5b90daf5d4dfa6b300b to your computer and use it in GitHub Desktop.
Convert a URL to text
"""
Parse out text, links, images, and more from an HTML file.
Modified from extract.py in https://github.com/fephsun/dialup.
For example:
import extract
e = extract.ParsedWebpage("http://en.wikipedia.org/wiki/Frog")
print e.title
print e.text
"""
import copy
import json
import re
from urlparse import urljoin
import bs4
import requests
class ParsedWebpage(object):
    """Download a web page and reduce it to a title and readable text.

    Constructing an instance performs the HTTP request and all parsing.

    Attributes:
        url: The URL that was requested.
        html: The raw response body (``response.text`` from ``requests``).
        soup: The final ``BeautifulSoup`` tree, with scripts, styles, forms,
            comments and the doctype removed and images replaced by text.
        title: The string inside the page's ``<title>``; ``None`` when the
            page has no ``<title>`` or it does not hold a single string.
        text: Every remaining text node of the page, joined by single spaces.
    """

    # The original author observed that a single clean-up pass leaves some
    # "sticky" cases behind, so the clean-up is applied this many times.
    _CLEANUP_PASSES = 2

    # Tags whose whole subtree is discarded before text extraction.
    _UNWANTED_TAGS = ('script', 'style', 'form')

    # DOTALL so that comments spanning several lines are matched as well.
    _COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
    _DOCTYPE_RE = re.compile(r"<!DOCTYPE[^>]*>")

    # An <img> tag, optionally followed by text and an explicit </img>.
    # The \b keeps longer tag names that merely start with "img" from matching.
    _IMG_RE = re.compile(r"<img\b[^>]*>(?:[^<]*</img>)?")

    def __init__(self, url, timeout=None):
        """Fetch *url* and populate the attributes described on the class.

        Args:
            url: Address of the page to download.
            timeout: Passed through to ``requests.get``; the default of
                ``None`` keeps the previous behaviour of waiting indefinitely.
        """
        self.url = url

        # Raw HTML
        response = requests.get(url, timeout=timeout)
        self.html = response.text
        self.soup = bs4.BeautifulSoup(self.html, "html.parser")

        # Delete <script>, <style> and <form> tags, comments, and <!DOCTYPE>.
        for _ in range(self._CLEANUP_PASSES):
            self.soup = self._strip_non_content(self.soup)

        # This should be something acceptable to read to the user
        # as the webpage's title.  Pages without a <title> yield None
        # instead of raising AttributeError.
        title_tag = self.soup.title
        self.title = title_tag.string if title_tag is not None else None

        # Replace images with descriptions of those images.
        # soup.decode() returns the same text as unicode(soup) did, but is
        # available on both Python 2 and Python 3.
        described_html = self._IMG_RE.sub(
            self._describe_image, self.soup.decode())
        self.soup = bs4.BeautifulSoup(described_html, "html.parser")

        # This should be the human-readable text of the page.
        self.text = ' '.join(self.soup.find_all(text=True))

    @classmethod
    def _strip_non_content(cls, soup):
        """Return a re-parsed copy of *soup* without non-content markup.

        Note that *soup* itself is modified: the unwanted tags and comment
        nodes are extracted from it before it is serialized.
        """
        for tag_name in cls._UNWANTED_TAGS:
            # find_all() returns a list, so extracting while looping is safe.
            for tag in soup.find_all(tag_name):
                tag.extract()

        is_comment = lambda text: isinstance(text, bs4.Comment)
        for comment in soup.find_all(text=is_comment):
            comment.extract()

        # Textual fallback for anything the tree-based removal left behind.
        markup = cls._COMMENT_RE.sub("", soup.decode())
        markup = cls._DOCTYPE_RE.sub("", markup)
        return bs4.BeautifulSoup(markup, "html.parser")

    @staticmethod
    def _describe_image(match):
        """Turn a regex match of an ``<img>`` tag into a spoken description.

        Returns `` An image. `` or, when the tag has a non-empty ``alt``
        attribute, `` An image of <alt>. ``.
        """
        img = bs4.BeautifulSoup(match.group(), "html.parser").img
        alt = img.get("alt") if img is not None else None

        description = " An image"
        if alt:
            # The description is spliced back into markup that gets parsed
            # again, so escape characters the parser would otherwise treat
            # as the start of a tag or entity.
            safe_alt = (alt.replace("&", "&amp;")
                           .replace("<", "&lt;")
                           .replace(">", "&gt;"))
            description += " of %s" % safe_alt
        return description + '. '
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment