mlopezr/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Scrape Alexa Skills Kit Documentation to eBook

Downloads every section of the online documentation available at https://developer.amazon.com/public/solutions/alexa/alexa-skills-kit/docs/alexa-skills-kit-interface-reference and converts them into an eBook.
Prerequisites

pip3 install beautifulsoup4 requests

Install Pandoc using `apt install pandoc`, `brew install pandoc`, or the equivalent for your platform.
Use

Run in a new directory:
python3 ask-doc-scrape.py

Convert it to EPUB using ebook-convert from Calibre:
ebook-convert toc.html ask-doc.epub --breadth-first --level1-toc '//*[@class="chapter-level1"]' --level2-toc '//*[@class="chapter-level2"]' --level3-toc '//*[@class="chapter-level3"]'

Or convert from HTML to EPUB using Pandoc:
pandoc -t epub3 --epub-metadata=metadata.xml --toc --toc-depth 1 -o ask-docs.epub article*.html

Optionally, convert the resulting EPUB to Kindle format using Calibre or KindleGen.

  
## ask-doc-scrape.py
import os
import requests
import re
from bs4 import BeautifulSoup, Doctype

# Scrape the Alexa Skills Kit documentation into flat local HTML files plus a
# toc.html index that an eBook converter can consume.

# Site root; the site-relative hrefs found in the sidebar are appended to it.
prefix = "https://developer.amazon.com"
# Get any page in the docs (we just want the navigation sidebar)
url = prefix + "/docs/custom-skills/request-and-response-json-reference.html"

# Matches a 'docs/...' run of URL characters. Written as a raw string so the
# '\/' and '\+' escapes reach the regex engine instead of being treated as
# (invalid) Python string escapes. Compiled once rather than once per page.
DOC_LINK_PATTERN = re.compile(r'docs\/([-a-zA-Z0-9@:%_\+.~#?&//=]*)')


def flatten_doc_links(body):
  """Return body with '/' replaced by '_' inside every 'docs/...' run.

  Saved pages use a flattened naming scheme (slashes become underscores), so
  references inside the article text are rewritten the same way.

  NOTE(review): only the run starting at 'docs' is rewritten, so a '/' that
  precedes 'docs' in the source HTML is left in place, while saved filenames
  drop that leading slash. Confirm rewritten links resolve as intended.
  """
  return DOC_LINK_PATTERN.sub(lambda m: m.group(0).replace('/', '_'), body)


def chapter_style(css_classes):
  """Map a sidebar entry's CSS classes to (chapterclass, hlevel).

  The first of 'level1', 'level2', 'level3' found in css_classes wins.
  Entries with none of them get an empty class and the 'h5' heading tag.
  """
  for level, heading in (('level1', 'h2'), ('level2', 'h3'), ('level3', 'h4')):
    if level in css_classes:
      return 'chapter-' + level, heading
  return '', 'h5'


def main():
  """Download every doc page linked from the sidebar and write toc.html.

  Each '/doc...' link is saved as one HTML file in the current directory;
  each '#' link (a chapter header without content) becomes a stub
  chapter<N>.html. Every other link is skipped.
  """
  sidebar = BeautifulSoup(requests.get(url).text, "html5lib")
  links = sidebar.select('#docnavsidebar a')

  with open("toc.html", "w", encoding="utf-8") as tocfile:
    tocfile.write("<html>\n<body><h1>Table of contents</h1>\n")

    for i, link in enumerate(links):
      # Anchors without an href fall through to the "Skipping" branch.
      relative_path = link.get('href', '')

      if relative_path.startswith('/doc'):
        print("Processing " + relative_path)
        response = requests.get(prefix + relative_path)

        # Get main article only. select_one returns None when nothing
        # matches, so a page without the container is skipped instead of
        # raising IndexError.
        page = BeautifulSoup(response.text, "html5lib")
        article = page.select_one('div.mainColumn')
        if article is None:
          print("Skipping " + relative_path)
          continue

        body = flatten_doc_links(article.decode_contents())

        # Save the content within a HTML body with a flattened directory structure
        filename = relative_path[1:].replace('/', '_')
        with open(filename, "w", encoding="utf-8") as outfile:
          outfile.write("<html><head></head><body>\n")
          outfile.write(body)
          outfile.write("\n</body></html>")
        tocfile.write("<a href='" + filename + "'>" + link.text + "</a><br>\n")

      # Chapter headers without content go into an empty HTML to get a chapter needed for the ebook TOC
      elif relative_path == '#':  # equality, not identity: 'is' on str is unreliable
        chapterclass, hlevel = chapter_style(link.parent.get('class', []))
        with open("chapter" + str(i) + ".html", "w", encoding="utf-8") as chapterfile:
          chapterfile.write("<html><head></head><body><h1 class='" + chapterclass + "'>" + link.text + "</h1></body></html>")
        tocfile.write("<" + hlevel + "><a href='chapter" + str(i) + ".html'>" + link.text + "</a></" + hlevel + "><br>\n")

      # Skip everything else such as external links to GitHub
      else:
        print("Skipping " + relative_path)

    tocfile.write("</body>\n</html>\n")


if __name__ == "__main__":
  main()
	import os
	import requests
	import re
	from bs4 import BeautifulSoup, Doctype

	# Scrape the Alexa Skills Kit documentation into flat local HTML files plus a
# toc.html index that an eBook converter can consume. The block structure of
# this copy is restored here, since its indentation had been flattened.

# Site root; the site-relative hrefs found in the sidebar are appended to it.
prefix = "https://developer.amazon.com"
# Get any page in the docs (we just want the navigation sidebar)
url = prefix + "/docs/custom-skills/request-and-response-json-reference.html"

# Matches a 'docs/...' run of URL characters. Written as a raw string so the
# '\/' and '\+' escapes reach the regex engine instead of being treated as
# (invalid) Python string escapes. Compiled once rather than once per page.
DOC_LINK_PATTERN = re.compile(r'docs\/([-a-zA-Z0-9@:%_\+.~#?&//=]*)')


def flatten_doc_links(body):
  """Return body with '/' replaced by '_' inside every 'docs/...' run.

  Saved pages use a flattened naming scheme (slashes become underscores), so
  references inside the article text are rewritten the same way.

  NOTE(review): only the run starting at 'docs' is rewritten, so a '/' that
  precedes 'docs' in the source HTML is left in place, while saved filenames
  drop that leading slash. Confirm rewritten links resolve as intended.
  """
  return DOC_LINK_PATTERN.sub(lambda m: m.group(0).replace('/', '_'), body)


def chapter_style(css_classes):
  """Map a sidebar entry's CSS classes to (chapterclass, hlevel).

  The first of 'level1', 'level2', 'level3' found in css_classes wins.
  Entries with none of them get an empty class and the 'h5' heading tag.
  """
  for level, heading in (('level1', 'h2'), ('level2', 'h3'), ('level3', 'h4')):
    if level in css_classes:
      return 'chapter-' + level, heading
  return '', 'h5'


def main():
  """Download every doc page linked from the sidebar and write toc.html.

  Each '/doc...' link is saved as one HTML file in the current directory;
  each '#' link (a chapter header without content) becomes a stub
  chapter<N>.html. Every other link is skipped.
  """
  sidebar = BeautifulSoup(requests.get(url).text, "html5lib")
  links = sidebar.select('#docnavsidebar a')

  with open("toc.html", "w", encoding="utf-8") as tocfile:
    tocfile.write("<html>\n<body><h1>Table of contents</h1>\n")

    for i, link in enumerate(links):
      # Anchors without an href fall through to the "Skipping" branch.
      relative_path = link.get('href', '')

      if relative_path.startswith('/doc'):
        print("Processing " + relative_path)
        response = requests.get(prefix + relative_path)

        # Get main article only. select_one returns None when nothing
        # matches, so a page without the container is skipped instead of
        # raising IndexError.
        page = BeautifulSoup(response.text, "html5lib")
        article = page.select_one('div.mainColumn')
        if article is None:
          print("Skipping " + relative_path)
          continue

        body = flatten_doc_links(article.decode_contents())

        # Save the content within a HTML body with a flattened directory structure
        filename = relative_path[1:].replace('/', '_')
        with open(filename, "w", encoding="utf-8") as outfile:
          outfile.write("<html><head></head><body>\n")
          outfile.write(body)
          outfile.write("\n</body></html>")
        tocfile.write("<a href='" + filename + "'>" + link.text + "</a><br>\n")

      # Chapter headers without content go into an empty HTML to get a chapter needed for the ebook TOC
      elif relative_path == '#':  # equality, not identity: 'is' on str is unreliable
        chapterclass, hlevel = chapter_style(link.parent.get('class', []))
        with open("chapter" + str(i) + ".html", "w", encoding="utf-8") as chapterfile:
          chapterfile.write("<html><head></head><body><h1 class='" + chapterclass + "'>" + link.text + "</h1></body></html>")
        tocfile.write("<" + hlevel + "><a href='chapter" + str(i) + ".html'>" + link.text + "</a></" + hlevel + "><br>\n")

      # Skip everything else such as external links to GitHub
      else:
        print("Skipping " + relative_path)

    tocfile.write("</body>\n</html>\n")


if __name__ == "__main__":
  main()