Skip to content

Instantly share code, notes, and snippets.

@mlopezr
Last active November 19, 2017 10:04
Show Gist options
  • Save mlopezr/4c29afcab53a08877b4052aa8784058d to your computer and use it in GitHub Desktop.
Save mlopezr/4c29afcab53a08877b4052aa8784058d to your computer and use it in GitHub Desktop.
Scrape Alexa Skills Kit Documentation to eBook

Scrape Alexa Skills Kit Documentation to eBook

Downloads all sections of the online documentation available at https://developer.amazon.com/public/solutions/alexa/alexa-skills-kit/docs/alexa-skills-kit-interface-reference and converts them to an eBook.

Prerequisites

pip3 install beautifulsoup4 requests html5lib

Install Pandoc using apt install pandoc or brew install pandoc or equivalent.

Use

Run in a new directory:

python3 ask-doc-scrape.py

Convert it to EPUB using ebook-convert from Calibre:

ebook-convert toc.html ask-doc.epub --breadth-first --level1-toc '//*[@class="chapter-level1"]' --level2-toc '//*[@class="chapter-level2"]' --level3-toc '//*[@class="chapter-level3"]'

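Or convert from HTML to EPUB using Pandoc (metadata.xml is an EPUB metadata file that you must provide yourself; the script does not generate it):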

pandoc -t epub3 --epub-metadata=metadata.xml --toc --toc-depth 1 -o ask-docs.epub docs_*.html

You can optionally convert it to Kindle format using Calibre or Kindlegen.

import os
import requests
import re
from bs4 import BeautifulSoup, Doctype
# Scrape the Alexa Skills Kit documentation into flat local HTML files plus a
# "toc.html" index, ready to be converted to an eBook.
#
# The navigation sidebar of one documentation page supplies the list of
# links. Each link is handled according to its href:
#   * "/doc..."  -> the page is downloaded and its main article is saved
#   * "#"        -> a content-less chapter page is written (used for the TOC)
#   * otherwise  -> the link is skipped (e.g. external links to GitHub)

prefix = "https://developer.amazon.com"
# Any page in the docs will do; only its navigation sidebar is used.
url = prefix + "/docs/custom-skills/request-and-response-json-reference.html"

# Matches "docs/" followed by a run of URL characters. Compiled once here
# instead of once per downloaded page.
DOC_LINK_PATTERN = re.compile(r'docs/[-a-zA-Z0-9@:%_+.~#?&/=]*')

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30

# Sidebar nesting class -> heading tag used for that chapter in toc.html.
CHAPTER_LEVELS = (('level1', 'h2'), ('level2', 'h3'), ('level3', 'h4'))


def flatten_doc_links(body):
    """Return *body* with '/' replaced by '_' inside every "docs/..." run.

    This mirrors the flattened file names produced by ``flat_filename`` so
    that links between articles point at files in the same directory.

    NOTE(review): the match starts at "docs", so the leading '/' of an href
    such as "/docs/a/b.html" is kept ("/docs_a_b.html"), and any "docs/..."
    text outside of links is rewritten too. Confirm this is what the eBook
    converter expects before tightening the pattern.
    """
    return DOC_LINK_PATTERN.sub(
        lambda match: match.group(0).replace('/', '_'), body)


def chapter_style(classes):
    """Map the CSS classes of a sidebar entry to ``(chapter_class, heading)``.

    *classes* is the collection of class names on the link's parent element.
    The first of level1/level2/level3 that is present wins; when none is
    present the chapter class is empty and the heading is ``h5``.
    """
    for level, heading in CHAPTER_LEVELS:
        if level in classes:
            return 'chapter-' + level, heading
    return '', 'h5'


def flat_filename(relative_path):
    """Turn "/docs/a/b.html" into the flat file name "docs_a_b.html"."""
    return relative_path[1:].replace('/', '_')


def main():
    """Download the documentation and write toc.html, articles and chapters.

    All files are written to the current working directory as UTF-8.

    Raises:
        requests.HTTPError: if the page holding the sidebar cannot be fetched.
    """
    # Imported here so the file's import header does not need to change.
    from html import escape

    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Without the sidebar there is nothing to scrape, so fail loudly.
    response.raise_for_status()
    sidebar_page = BeautifulSoup(response.text, "html5lib")
    links = sidebar_page.select('#docnavsidebar a')

    with open("toc.html", "w", encoding="utf-8") as tocfile:
        tocfile.write("<html>\n<head><meta charset='utf-8'></head>\n"
                      "<body><h1>Table of contents</h1>\n")
        for i, link in enumerate(links):
            # Anchors without an href fall through to the "skip" branch.
            relative_path = link.get('href', '')
            # Link text is inserted into markup, so escape &, < and >.
            title = escape(link.text)

            if relative_path.startswith('/doc'):
                print("Processing " + relative_path)
                response = requests.get(prefix + relative_path,
                                        timeout=REQUEST_TIMEOUT)
                if not response.ok:
                    # One broken page should not abort the whole run.
                    print("Skipping " + relative_path + " (HTTP "
                          + str(response.status_code) + ")")
                    continue
                # Get main article only
                page = BeautifulSoup(response.text, "html5lib")
                articles = page.select('div.mainColumn')
                if not articles:
                    print("Skipping " + relative_path
                          + " (no main article found)")
                    continue
                body = flatten_doc_links(articles[0].decode_contents())
                # Save the content within a HTML body with a flattened
                # directory structure
                filename = flat_filename(relative_path)
                with open(filename, "w", encoding="utf-8") as article_file:
                    article_file.write(
                        "<html><head><meta charset='utf-8'></head><body>\n")
                    article_file.write(body)
                    article_file.write("\n</body></html>")
                tocfile.write("<a href='" + filename + "'>" + title
                              + "</a><br>\n")

            # Chapter headers without content go into an empty HTML to get a
            # chapter needed for the ebook TOC
            elif relative_path == '#':
                chapterclass, hlevel = chapter_style(
                    link.parent.get('class', []))
                chapter_filename = "chapter" + str(i) + ".html"
                with open(chapter_filename, "w",
                          encoding="utf-8") as chapterfile:
                    chapterfile.write(
                        "<html><head><meta charset='utf-8'></head><body>"
                        "<h1 class='" + chapterclass + "'>" + title
                        + "</h1></body></html>")
                tocfile.write("<" + hlevel + "><a href='" + chapter_filename
                              + "'>" + title + "</a></" + hlevel + "><br>\n")

            # Skip everything else such as external links to GitHub
            else:
                print("Skipping " + relative_path)
        tocfile.write("</body>\n</html>\n")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment