Skip to content

Instantly share code, notes, and snippets.

@acdha
Last active September 25, 2015 13:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save acdha/987031a637b3a8b544d8 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# encoding: utf-8
"""Attempting to mop up Thomas XML
`pip install requests beautifulsoup4 html5lib lxml`
"""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from urlparse import urljoin
import sys
import re
import requests
from bs4 import BeautifulSoup
base_url = sys.argv[1]
# If you spoof a known browser, you get usable Printer-Friendly links:
resp = requests.get(base_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT x.y; rv:10.0) Gecko/20100101 Firefox/10.0'})
raw_html = resp.content
raw_html = re.sub(r'<bgcolor[^>]+>', '', raw_html, flags=re.IGNORECASE)
# Yes, really:
# raw_html = re.sub(r'<a href="/\[<a.+\s/`/C\?query:([^"]+)" Printer Friendly</a>',
# '<a href="C?\\1">Printer Friendly</a>',
# raw_html,
# flags=re.IGNORECASE)
soup = BeautifulSoup(raw_html)
# See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#pretty-printing
print(soup.prettify(formatter='minimal').encode('utf-8'))
for link in soup.find_all('a'):
print(urljoin(base_url, link.get('href')))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment