Skip to content

Instantly share code, notes, and snippets.

@MainasuK
Created August 28, 2017 00:02
Show Gist options
  • Save MainasuK/4700fd0d3e5adfbf8e27198272fdf950 to your computer and use it in GitHub Desktop.
Save MainasuK/4700fd0d3e5adfbf8e27198272fdf950 to your computer and use it in GitHub Desktop.
Apple document pages spider
import requests
import sys
from bs4 import BeautifulSoup
from sys import stdin
# usage:
# echo 'https://developer.apple.com/library/content/documentation/NetworkingInternetWeb/Conceptual/SafariAppExtension_PG/index.html' | python3 document_urls.py
# Use this script to get the page links for *one* Apple programming guide document
def parse_next_page_url(currentURL):
    """Return the absolute URL of the page following *currentURL*, or "" if none.

    Fetches the page and looks for a "next" link in either of the two layouts
    used by Apple's documentation archive:

    * an ``<a class="nextLink">`` element (old-style documents), or
    * a ``<p class="next-link">`` wrapper around an ``<a>`` (post-2016 documents).

    The next URL found is printed (progress feedback for the caller) and
    returned; an empty string signals "no next page" / parse failure.
    """
    response = requests.get(currentURL)
    soup = BeautifulSoup(response.content, 'html.parser')

    # a tag style (old style document)
    # https://developer.apple.com/library/content/documentation/AudioVideo/Conceptual/MediaPlaybackGuide/Contents/Resources/en.lproj/RevisionHistory.html
    for aTag in soup.findAll('a', {'class': 'nextLink'}):
        try:
            href = aTag['href']
            # Strip any '#fragment', THEN drop the leading '../' segment.
            # BUG FIX: the original re-split `href` here, discarding the
            # fragment strip, so '#//apple_ref/...' anchors leaked into URLs.
            nextHTML = href.split('#')[0]
            nextHTML = nextHTML.split('/', 1)[1]
            # Old-style hrefs are relative to the parent directory.
            base = currentURL.rsplit('/', 2)[0]
            nextURL = ''.join([base, '/', nextHTML])
            print(nextURL)
            return nextURL
        except IndexError:
            return ""

    # p Tag style (some document after 2016)
    # https://developer.apple.com/library/content/documentation/NetworkingInternetWeb/Conceptual/SafariAppExtension_PG/index.html
    for pTag in soup.findAll('p', {'class': 'next-link'}):
        try:
            # The last <a> inside the wrapper is the "next" link.
            aTag = pTag.findAll('a').pop()
            href = aTag['href']
            nextHTML = href.split('#')[0]
            # New-style hrefs are relative to the current directory.
            base = currentURL.rsplit('/', 1)[0]
            nextURL = ''.join([base, '/', nextHTML])
            print(nextURL)
            return nextURL
        except IndexError:
            return ""

    # Debug
    # print(response.content)
    # print('Parse failed for: ' + currentURL)
    return ""
def main(argv=None):
    """Read a starting document URL from stdin, follow every "next" link,
    and write the collected URLs (one per line) to document_urls.txt.
    """
    if argv is None:
        argv = sys.argv

    # p tag style:
    # currentURL = 'https://developer.apple.com/library/content/documentation/NetworkingInternetWeb/Conceptual/SafariAppExtension_PG/index.html'
    page = stdin.readline().strip()
    collected = [page]

    # Walk the "next" chain until parse_next_page_url signals the end ("").
    page = parse_next_page_url(page)
    while page != "":
        collected.append(page)
        page = parse_next_page_url(page)

    with open('document_urls.txt', 'w') as out:
        for url in collected:
            out.write("%s\n" % url)


if __name__ == "__main__":
    sys.exit(main())
@MainasuK
Copy link
Author

MainasuK commented Aug 28, 2017

Automator script:

Save URLs as PDFs (via Safari):
https://www.dropbox.com/s/mrotz0ymmvwfi5q/Save%20URLs%20as%20PDFs.workflow.zip?dl=0

merge PDF
https://www.dropbox.com/s/1tsu2rt5jo02apk/Merge%20PDF.workflow.zip?dl=0

Before using that script, modify the literal string to your language first. :D

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment