@ulfaslak
Created January 10, 2024 14:37
Sphinx Documentation Scraper
import requests as rq
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Root of the Sphinx site to crawl; only links under this prefix are followed.
BASE_URL = "https://www.pymc.io/projects/docs/en/stable/"
# BASE_URL = "https://www.pymc.io/projects/examples/en/latest/"

PAGES_CRAWLED = 0

def find_links(article):
    """Collect internal .html links from an <article> element."""
    links = set()
    for a in article.find_all("a", href=True):
        if (
            a["href"]
            and not a["href"].startswith("#")
            and not (
                a["href"].startswith("http") and not a["href"].startswith(BASE_URL)
            )
        ):
            # Strip fragment identifiers and keep only .html resources.
            link = a["href"].split("#")[0]
            if link.endswith(".html"):
                links.add(link)
    return sorted(links)

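# Illustrative check of the filtering above (assumed hrefs, not taken from a real
# page): anchors with hrefs "#api", "https://github.com/pymc-devs/pymc", and
# "distributions/continuous.html#pymc.Normal" would yield just
# ["distributions/continuous.html"]; fragments and external hosts are dropped,
# and only .html resources are kept.
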
def save_and_return_links(url, skip_links=False):
    """Append the page's article text to the output file and return its internal links."""
    soup = BeautifulSoup(rq.get(url).text, "html.parser")
    article = soup.find("article")
    resource = url.replace(BASE_URL, "")
    # All pages are appended to a single text file named after the initial resource.
    with open(initial_resource.replace(".html", ".txt"), "a") as file:
        article_text = article.text
        if "Page not found" in article_text:
            raise Exception(f"Page not found: {url}")
        file.write(f"\n\n#page: {resource}\n--------\n\n{article_text}")
    global PAGES_CRAWLED
    PAGES_CRAWLED += 1
    if PAGES_CRAWLED % 10 == 0:
        print(f"Crawled {PAGES_CRAWLED} pages")
    return find_links(article) if not skip_links else []

def _get_trail(resource):
    return "/".join(resource.split("/")[:-1]) + "/" if "/" in resource else ""

def crawl_page(resource, base_url, already_visited):
    """Recursively crawl a page and every internal page it links to (depth-first)."""
    url = urljoin(base_url, resource)
    # Fetch and save the current page
    links = save_and_return_links(url, skip_links="../" in resource)
    trail = _get_trail(resource)
    new_base_url = urljoin(base_url, trail)
    # Crawl linked pages
    for link in links:
        full_url = urljoin(new_base_url, link)
        if full_url not in already_visited:
            already_visited.add(full_url)
            crawl_page(link, new_base_url, already_visited)

# Start from the API reference index; all scraped text is appended to api.txt.
initial_resource = "api.html"
already_visited = {urljoin(BASE_URL, initial_resource)}
crawl_page(initial_resource, BASE_URL, already_visited)
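
The scraper is not PyMC-specific: in principle, any Sphinx-built site that wraps page content in an <article> element can be crawled the same way. Below is a minimal sketch of re-pointing it at the PyMC examples site (the alternative BASE_URL already commented out above); "index.html" as the entry page is an assumption, chosen only because it is the conventional Sphinx landing page.

# Hypothetical configuration for a different Sphinx project (entry page not verified).
BASE_URL = "https://www.pymc.io/projects/examples/en/latest/"
initial_resource = "index.html"  # assumed landing page; output would go to index.txt
already_visited = {urljoin(BASE_URL, initial_resource)}
crawl_page(initial_resource, BASE_URL, already_visited)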