Skip to content

Instantly share code, notes, and snippets.

@pkhanpara
Created March 1, 2021 06:12
Show Gist options
  • Save pkhanpara/84259b37a1ef469738ba2a40f1f7c791 to your computer and use it in GitHub Desktop.
Save pkhanpara/84259b37a1ef469738ba2a40f1f7c791 to your computer and use it in GitHub Desktop.
Turn AWS service FAQs into PDF books. Useful for the Solutions Architect or other AWS certification exams.
#!/usr/bin/env python3
import requests
import sys
import json
from bs4 import BeautifulSoup
import pdfkit
import os
from PyPDF2 import PdfFileMerger
# Root of the AWS website; product and FAQ URLs are built by appending to it.
DOC_URL_BASE = 'https://aws.amazon.com/'

# Output directory for the generated PDFs.  Paths below it are built by
# plain string concatenation, so the trailing slash matters.
faq_dir = './pdf_faqs/'

# (category id, directory name) pairs.  The ids are the ``c=`` query values
# found in the href attributes on https://aws.amazon.com/products/.
intresting_categories = [
    (1, 'analytics'),
    (2, 'app_integration'),
    (7, 'compute_containers'),
    (9, 'database'),
    (19, 'networking_content_delivery'),
    (17, 'migration'),
    (22, 'security'),
    (23, 'serverless_computing'),
    (24, 'storage'),
    (14, 'management_tools'),
]

# Space-separated names of services for which no FAQ PDF is generated.
blacklist = 'lightsail serverlessrepo timestream cloudendure-migration'
def create_pdfs_for_allservices(chknsoup):
    """Render the FAQ page of every listed AWS service to a PDF file.

    For each ``(id, name)`` pair in ``intresting_categories`` the directory
    ``faq_dir + name`` is created.  Every ``<a href>`` in *chknsoup* whose
    href contains ``?c=<id>&pt=`` is treated as a link to a service of that
    category; the service name is the first path segment of the href.  The
    page ``DOC_URL_BASE + service + '/faqs'`` is then rendered with pdfkit
    to ``<category dir>/<service>.pdf``.

    Services named in ``blacklist`` are skipped.  A page that fails to
    render is reported on stderr and does not stop the run.

    Args:
        chknsoup: Parsed products page.  Only ``find_all('a', href=True)``
            is called on it.
    """
    # Exact-name lookup: a substring test against the raw blacklist string
    # would also reject any service whose name is merely part of an entry.
    blocked = set(blacklist.split())

    # Use the document that was passed in (not a module-level global) and
    # collect its anchors once instead of re-scanning it per category.
    anchors = chknsoup.find_all('a', href=True)

    # exist_ok lets the script be re-run over an existing output tree.
    os.makedirs(faq_dir, exist_ok=True)

    for i, cat_name in intresting_categories:
        print('==========:' + cat_name)
        dir_name = faq_dir + cat_name
        os.makedirs(dir_name, exist_ok=True)

        marker = '?c=' + str(i) + '&pt='
        handled = set()  # services already attempted for this category
        for a in anchors:
            href = a['href']
            if marker not in href:
                continue

            # The service name is the first path segment, e.g. '/<service>/...'.
            segments = href.split('/')
            service = segments[1] if len(segments) > 1 else ''
            if not service or service in blocked or service in handled:
                continue
            handled.add(service)

            try:
                pdfkit.from_url(DOC_URL_BASE + service + '/faqs',
                                dir_name + '/' + service + '.pdf')
            except Exception as exc:
                # Best effort: keep going, but make the failure visible.
                print('failed to render FAQ for ' + service + ': ' + str(exc),
                      file=sys.stderr)
def merge_pdfs(faq_dir):
    """Merge the PDFs of each category directory into one PDF per category.

    For every sub-directory ``<faq_dir>/<category>`` that contains at least
    one ``.pdf`` file, the files are appended in sorted name order and the
    result is written to ``<faq_dir>/<category>.pdf``.

    Entries of *faq_dir* that are not directories (for example merged PDFs
    left by an earlier run) and category directories without any PDF are
    skipped.

    Args:
        faq_dir: Directory holding one sub-directory per category.  A
            trailing slash is accepted but not required.
    """
    for category in sorted(os.listdir(faq_dir)):
        category_dir = os.path.join(faq_dir, category)
        if not os.path.isdir(category_dir):
            continue

        # Sorted so the page order of the merged document is reproducible;
        # os.listdir returns entries in arbitrary order.
        pdf_files = sorted(
            name for name in os.listdir(category_dir)
            if name.lower().endswith('.pdf')
        )
        if not pdf_files:
            continue

        merger = PdfFileMerger()
        try:
            for pdf_file in pdf_files:
                merger.append(os.path.join(category_dir, pdf_file))
            merger.write(os.path.join(faq_dir, category + '.pdf'))
        finally:
            # Release the merger's file handles even if a merge step fails.
            merger.close()
if __name__ == "__main__":
    # Fetch the AWS products index.  The timeout keeps the script from
    # hanging on a stalled connection, and raise_for_status() stops an HTTP
    # error page from being parsed as if it were the product list.
    r = requests.get(DOC_URL_BASE + 'products/', timeout=30)
    r.raise_for_status()
    data = r.text
    soup = BeautifulSoup(data, features="html.parser")

    # Render one PDF per service, then merge them into one PDF per category.
    create_pdfs_for_allservices(soup)
    merge_pdfs(faq_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment