Skip to content

Instantly share code, notes, and snippets.

@voroninman
Last active February 18, 2017 16:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save voroninman/fdd91e936722450617a215c8c927e8fd to your computer and use it in GitHub Desktop.
Save voroninman/fdd91e936722450617a215c8c927e8fd to your computer and use it in GitHub Desktop.
Download and save PostgreSQL documentation as a single HTML-page replacing URLs with HTML anchors
"""
Download and save PostgreSQL documentation as a single
HTML-page replacing URLs with HTML anchors.
To convert the resulting HTML file to an e-book use
appropriate online services.
Be careful when opening it in a browser: the resulting HTML
file can be very large.
"""
import re
import requests
# Shared HTTP session, reused for each page download in the crawl loop.
_session = requests.Session()
# Documentation version; used in both the base URL and the output file name.
version = '9.6'
# URL prefix to which each page's file name is appended to form its URL.
base_url = 'https://www.postgresql.org/docs/{0}/static/'.format(version)
# The crawl ends as soon as a page's "Next" link points at this file, so this
# page itself is never downloaded or written to the output.
stop_at_page_file = 'client-interfaces.html'
def extract_page_html(html):
    """Return the main content of one documentation page.

    The content is everything from the first ``<h1`` tag up to, but not
    including, the last ``<div class="NAVFOOTER">`` marker found in *html*.

    Args:
        html: Full HTML source of a single documentation page.

    Returns:
        The extracted HTML fragment as a string.

    Raises:
        ValueError: If *html* does not contain an ``<h1`` tag followed by
            the navigation-footer marker (for example, an error page).
    """
    # The greedy ``.+`` combined with DOTALL spans newlines and runs up to the
    # last footer marker. Capturing only the part before the marker avoids
    # slicing the match by a hard-coded marker length.
    match = re.search(
        r'(<h1.+)<div class=\"NAVFOOTER\">',
        html,
        flags=re.DOTALL)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            'page content not found: expected an <h1> element followed by '
            '<div class="NAVFOOTER">')
    return match.group(1)
def search_next_page_file(html):
    """Find the file name that the page's "Next" navigation link points to.

    Searches *html* for a double-quoted value that is immediately followed by
    an ``accesskey="N">Next</a>`` attribute and link text.

    Args:
        html: Full HTML source of a single documentation page.

    Returns:
        The quoted value preceding the ``accesskey`` attribute, or ``None``
        when the page contains no such "Next" link.
    """
    found = re.search(
        r'([^\"]+)\"\s+accesskey=\s*\"N\">Next</a>',
        html)
    return found.group(1) if found else None
def replace_links(html):
    """Turn links to sibling ``*.html`` pages into in-document anchors.

    Every ``href="page.html"`` or ``href="page.html#fragment"`` whose target
    is a bare file name (word characters and hyphens only) becomes
    ``href="#page.html"``; any ``#fragment`` part is dropped. Other links are
    left unchanged.

    Args:
        html: HTML fragment whose links should be rewritten.

    Returns:
        The HTML with matching links rewritten.
    """
    page_link = re.compile(r'href=\s*"([\w\-]+\.html)(?:#[\w\-]+)?"')
    anchor_link = r'href="#\1"'
    return page_link.sub(anchor_link, html)
# Crawl the documentation page by page, following each page's "Next" link,
# and append every page's content to a single output HTML file.
#
# The file is opened in text mode with an explicit UTF-8 encoding and is
# written as text (Python 3). Writing ``page_html.encode('utf8')`` to a
# text-mode file raises TypeError, and omitting ``encoding`` makes the output
# depend on the platform's default encoding.
with open('postgresql-{0}-docs.html'.format(version), 'w',
          encoding='utf-8') as f:
    # Declare the charset so browsers and converters decode the file as UTF-8.
    f.write('<!doctype html><html><head><meta charset="utf-8"></head><body>')
    next_page_file = 'index.html'
    while True:
        page_file = next_page_file
        url = base_url + page_file
        # A timeout keeps a stalled connection from hanging the crawl forever;
        # raise_for_status() stops on HTTP errors instead of trying to parse
        # an error page.
        response = _session.get(url, timeout=30)
        response.raise_for_status()
        html = response.text
        page_html = replace_links(extract_page_html(html))
        # Anchor named after the page file: the target of rewritten links.
        f.write('<a name="{0}"></a>'.format(page_file))
        f.write(page_html)
        print(url)
        next_page_file = search_next_page_file(html)
        # Stop when there is no "Next" link or the configured stop page is
        # reached; the stop page itself is not included.
        if not next_page_file or next_page_file == stop_at_page_file:
            break
    # Close the elements opened before the loop so the document is complete.
    f.write('</body></html>')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment