Wikipedia article downloader (HTML/PDF)
#!/usr/bin/python3
# Example usage:
# ./wikipedia-dl.py https://en.wikipedia.org/wiki/Wikipedia
# You can also specify which format you want, for example:
# ./wikipedia-dl.py https://en.wikipedia.org/wiki/Wikipedia html
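#
# This script relies on the third-party packages "requests" and "lxml";
# assuming a standard pip setup, `pip install requests lxml` should be
# enough to provide both. The output file is written to the current
# directory and named after the article title (e.g. Wikipedia.pdf or
# Wikipedia.html for the example above).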
from sys import argv
from urllib.parse import urlparse
import requests
import lxml.html


def get_pagename_d(url):
    # Return (article title, parsed URL) for a Wikipedia article URL.
    # The title is everything in the path after the '/wiki/' prefix;
    # the ':_' -> ': ' replacement restores a space after a namespace colon.
    page = urlparse(url)
    _page_name = page.path.split('/')
    page_name = '/'.join(_page_name[2:]).replace(':_', ': ')
    return (page_name, page)


def download_pdf(url):
    # Ask the wiki's Special:ElectronPdf page to render the article as a PDF.
    page_name, page = get_pagename_d(url)
    new_uri = '{uri.scheme}://{uri.netloc}/wiki/Special:ElectronPdf'.format(uri=page)
    printable_pdf = requests.post(
        new_uri,
        allow_redirects=True,
        params={'action': 'redirect-to-electron',
                'page': page_name}).content
    # Replace '/' in subpage titles so the result is a valid filename.
    with open(page_name.replace('/', ' - ') + '.pdf', 'wb') as f:
        f.write(printable_pdf)


def download_html(url):
    # Fetch the printable version of the article and rewrite relative links
    # to absolute ones so the saved page still resolves images and styles.
    page_name, page = get_pagename_d(url)
    uri_scheme = '{uri.scheme}://{uri.netloc}/'.format(uri=page)
    printable_html = requests.get(
        uri_scheme + 'w/index.php',
        params={
            'title': page_name,
            'printable': 'yes'
        }
    ).content.decode()
    printable_html = lxml.html.make_links_absolute(printable_html, uri_scheme)
    with open(page_name.replace('/', ' - ') + '.html', 'wb') as f:
        f.write(printable_html.encode('utf-8'))


if __name__ == '__main__':
    if len(argv) < 2:
        print('usage: ./wikipedia-dl.py <article URL> [html|pdf]')
    elif len(argv) >= 3 and argv[2] == 'html':
        download_html(argv[1])
    else:
        # Default to PDF when no format (or an unrecognised one) is given.
        download_pdf(argv[1])