Skip to content

Instantly share code, notes, and snippets.

@mrabbitt
Created November 20, 2012 21:26
Show Gist options
  • Save mrabbitt/4121269 to your computer and use it in GitHub Desktop.
Save mrabbitt/4121269 to your computer and use it in GitHub Desktop.
Script to download all PDF files for a particular version of Splunk documentation.
#!/usr/bin/env python
'''
Script to download all PDF files for a particular version of Splunk documentation.
Requirements:
requests: http://docs.python-requests.org
beautifulsoup4: http://www.crummy.com/software/BeautifulSoup/
(pip install -U requests beautifulsoup4)
Author: Michael Rabbitt (https://github.com/mrabbitt)
'''
from __future__ import unicode_literals, print_function
import os.path
import shutil
import re
import sys
import tempfile
import requests
from bs4 import BeautifulSoup
# Matches the href of a manual link on the documentation index page and
# captures, in order: the dotted numeric version, the next path segment, and
# the remainder of the path.
manual_link_regex = re.compile(r'^/Documentation/Splunk/([\d.]+)/([^/]+)/(.+)$')
# Captures the quoted value of a `filename="..."` parameter, as found in a
# Content-Disposition header.
filename_regex = re.compile(r'filename="([^"]+)"')
def getDownloadFileName(response, default_name):
    '''Determine the file name to save a download under.

    Looks for a quoted ``filename="..."`` parameter in the response's
    Content-Disposition header and returns it, reduced to its final path
    component. Reverts to `default_name` if the header is missing or empty,
    carries no quoted file name, or the name is empty once directories are
    stripped.

    response: object exposing a dict-like `headers` attribute.
    default_name: file name returned when the headers suggest none.
    '''
    # `.get` is used instead of `has_key`, which Python 3 mappings do not
    # provide.
    disposition = response.headers.get('content-disposition')
    if not disposition:
        return default_name
    # The header value usually begins with a disposition type such as
    # `attachment; `, so the pattern must be searched for rather than
    # matched at the start of the string.
    match = filename_regex.search(disposition)
    if not match:
        return default_name
    # The name is supplied by the server: keep only the last path component
    # so it cannot point outside the directory the caller joins it to.
    suggested = os.path.basename(match.group(1).replace('\\', '/'))
    return suggested or default_name
def main(download_directory, target_version):
    '''Download the PDF of every manual listed for one Splunk version.

    Fetches the documentation index page for `target_version`, takes the
    first link inside each `div.manualmodule` element, and saves that
    manual's "pdfbook" export into `download_directory`.

    download_directory: directory the downloaded PDF files are moved into.
    target_version: version segment placed in the documentation URL.

    Raises requests.HTTPError if the index page request fails. A manual whose
    link is missing or unrecognised, or whose PDF request fails, is reported
    on stderr and skipped.
    '''
    response = requests.get('http://docs.splunk.com/Documentation/Splunk/{0}'.format(target_version))
    # Without this check an error page would be parsed, yield no manuals, and
    # the script would finish as if it had succeeded.
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    page = BeautifulSoup(response.text, 'html.parser')
    manual_links = [div.find('a') for div in page.find_all('div', class_='manualmodule')]
    for manual_link in manual_links:
        # `find` returns None for a div that contains no anchor.
        if manual_link is None:
            continue
        href = manual_link.attrs.get('href', '')
        match = manual_link_regex.match(href)
        if match is None:
            print('Skipping unrecognised manual link: {0}'.format(href), file=sys.stderr)
            continue
        doc_description = manual_link.text.strip()
        (version, section, docname) = match.groups()
        pdf_url = 'http://docs.splunk.com/index.php?title=Documentation:Splunk:{0}:{1}:{2}&action=pdfbook'.format(section, docname, version)
        print('Downloading "{0}" from <{1}>...'.format(doc_description, pdf_url))
        pdf_response = requests.get(pdf_url)
        # Do not store an HTTP error body on disk under a .pdf name.
        if not pdf_response.ok:
            print('Download failed with HTTP status {0}; skipping.'.format(pdf_response.status_code), file=sys.stderr)
            continue
        file_name = getDownloadFileName(pdf_response, 'Splunk-{0}-{1}.pdf'.format(version, section))
        # Write to a temporary file first so the target path only ever
        # receives a fully written file.
        with tempfile.NamedTemporaryFile(suffix=file_name, delete=False) as temp_file:
            temp_file.write(pdf_response.content)
        target_path = os.path.join(download_directory, file_name)
        print('Moving temporary file to {0}'.format(target_path))
        shutil.move(temp_file.name, target_path)
    print('Complete')
if __name__ == '__main__':
    # Command line: PATH_TO_DOWNLOAD_DIR [VERSION]
    # VERSION is optional; when omitted, the 'latest' documentation is
    # requested.
    if len(sys.argv) in (2, 3):
        download_directory = sys.argv[1]
        target_version = sys.argv[2] if len(sys.argv) == 3 else 'latest'
        if os.path.isdir(download_directory):
            main(download_directory, target_version)
        else:
            print('No such directory: {0}'.format(download_directory), file=sys.stderr)
            # Non-zero status lets calling scripts detect the failure.
            sys.exit(1)
    else:
        print('''Usage: {0} PATH_TO_DOWNLOAD_DIR [VERSION]'''.format(os.path.basename(__file__)), file=sys.stderr)
        sys.exit(2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment