Skip to content

Instantly share code, notes, and snippets.

@MainasuK
Created August 28, 2017 00:02
Show Gist options
  • Save MainasuK/4700fd0d3e5adfbf8e27198272fdf950 to your computer and use it in GitHub Desktop.
Save MainasuK/4700fd0d3e5adfbf8e27198272fdf950 to your computer and use it in GitHub Desktop.
Apple document pages spider
import requests
import sys
from bs4 import BeautifulSoup
from sys import stdin
# usage:
# echo 'https://developer.apple.com/library/content/documentation/NetworkingInternetWeb/Conceptual/SafariAppExtension_PG/index.html' | python3 document_urls.py
# Use this script to get the page links for *one* Apple programming guide document
def parse_next_page_url(currentURL):
    """Return the absolute URL of the page following *currentURL*, or "" if none.

    Fetches the page and looks for a "next" link in either of the two layouts
    used by Apple's documentation archive:

    * an ``<a class="nextLink">`` element (old-style documents), or
    * a ``<p class="next-link">`` wrapper around an ``<a>`` (post-2016 documents).

    The next URL found is printed (progress feedback for the caller) and
    returned; an empty string signals "no next page" / parse failure.
    """
    response = requests.get(currentURL)
    soup = BeautifulSoup(response.content, 'html.parser')

    # a tag style (old style document)
    # https://developer.apple.com/library/content/documentation/AudioVideo/Conceptual/MediaPlaybackGuide/Contents/Resources/en.lproj/RevisionHistory.html
    for aTag in soup.findAll('a', {'class': 'nextLink'}):
        try:
            href = aTag['href']
            # Strip any '#fragment', THEN drop the leading '../' segment.
            # BUG FIX: the original re-split `href` here, discarding the
            # fragment strip, so '#//apple_ref/...' anchors leaked into URLs.
            nextHTML = href.split('#')[0]
            nextHTML = nextHTML.split('/', 1)[1]
            # Old-style hrefs are relative to the parent directory.
            base = currentURL.rsplit('/', 2)[0]
            nextURL = ''.join([base, '/', nextHTML])
            print(nextURL)
            return nextURL
        except IndexError:
            return ""

    # p Tag style (some document after 2016)
    # https://developer.apple.com/library/content/documentation/NetworkingInternetWeb/Conceptual/SafariAppExtension_PG/index.html
    for pTag in soup.findAll('p', {'class': 'next-link'}):
        try:
            # The last <a> inside the wrapper is the "next" link.
            aTag = pTag.findAll('a').pop()
            href = aTag['href']
            nextHTML = href.split('#')[0]
            # New-style hrefs are relative to the current directory.
            base = currentURL.rsplit('/', 1)[0]
            nextURL = ''.join([base, '/', nextHTML])
            print(nextURL)
            return nextURL
        except IndexError:
            return ""

    # Debug
    # print(response.content)
    # print('Parse failed for: ' + currentURL)
    return ""
def main(argv=None):
    """Read a starting document URL from stdin, follow every "next" link,
    and write the collected URLs (one per line) to document_urls.txt.
    """
    if argv is None:
        argv = sys.argv

    # p tag style:
    # currentURL = 'https://developer.apple.com/library/content/documentation/NetworkingInternetWeb/Conceptual/SafariAppExtension_PG/index.html'
    page = stdin.readline().strip()
    collected = [page]

    # Walk the "next" chain until parse_next_page_url signals the end ("").
    page = parse_next_page_url(page)
    while page != "":
        collected.append(page)
        page = parse_next_page_url(page)

    with open('document_urls.txt', 'w') as out:
        for url in collected:
            out.write("%s\n" % url)


if __name__ == "__main__":
    sys.exit(main())
@MainasuK
Copy link
Author

MainasuK commented Aug 28, 2017

Automator script:

Save URLs as PDFs (via Safari):
https://www.dropbox.com/s/mrotz0ymmvwfi5q/Save%20URLs%20as%20PDFs.workflow.zip?dl=0

merge PDF
https://www.dropbox.com/s/1tsu2rt5jo02apk/Merge%20PDF.workflow.zip?dl=0

Before using that script, modify the literal string to your language first. :D

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment