@ulfaslak
Created January 10, 2024 14:37
Sphinx Documentation Scraper
import requests as rq
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Root of the Sphinx site to crawl; only links under this prefix are followed.
BASE_URL = "https://www.pymc.io/projects/docs/en/stable/"
# BASE_URL = "https://www.pymc.io/projects/examples/en/latest/"

PAGES_CRAWLED = 0

def find_links(article):
    """Collect internal .html links from an <article> element."""
    links = set()
    for a in article.find_all("a", href=True):
        if (
            a["href"]
            and not a["href"].startswith("#")
            and not (
                a["href"].startswith("http") and not a["href"].startswith(BASE_URL)
            )
        ):
            # Strip fragment identifiers and keep only .html resources.
            link = a["href"].split("#")[0]
            if link.endswith(".html"):
                links.add(link)
    return sorted(links)

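# Illustrative check of the filtering above (assumed hrefs, not taken from a real
# page): anchors with hrefs "#api", "https://github.com/pymc-devs/pymc", and
# "distributions/continuous.html#pymc.Normal" would yield just
# ["distributions/continuous.html"]; fragments and external hosts are dropped,
# and only .html resources are kept.
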
def save_and_return_links(url, skip_links=False):
    """Append the page's article text to the output file and return its internal links."""
    soup = BeautifulSoup(rq.get(url).text, "html.parser")
    article = soup.find("article")
    resource = url.replace(BASE_URL, "")
    # All pages are appended to a single text file named after the initial resource.
    with open(initial_resource.replace(".html", ".txt"), "a") as file:
        article_text = article.text
        if "Page not found" in article_text:
            raise Exception(f"Page not found: {url}")
        file.write(f"\n\n#page: {resource}\n--------\n\n{article_text}")
    global PAGES_CRAWLED
    PAGES_CRAWLED += 1
    if PAGES_CRAWLED % 10 == 0:
        print(f"Crawled {PAGES_CRAWLED} pages")
    return find_links(article) if not skip_links else []

def _get_trail(resource):
    return "/".join(resource.split("/")[:-1]) + "/" if "/" in resource else ""

def crawl_page(resource, base_url, already_visited):
    """Recursively crawl a page and every internal page it links to (depth-first)."""
    url = urljoin(base_url, resource)
    # Fetch and save the current page
    links = save_and_return_links(url, skip_links="../" in resource)
    trail = _get_trail(resource)
    new_base_url = urljoin(base_url, trail)
    # Crawl linked pages
    for link in links:
        full_url = urljoin(new_base_url, link)
        if full_url not in already_visited:
            already_visited.add(full_url)
            crawl_page(link, new_base_url, already_visited)

# Start from the API reference index; all scraped text is appended to api.txt.
initial_resource = "api.html"
already_visited = {urljoin(BASE_URL, initial_resource)}
crawl_page(initial_resource, BASE_URL, already_visited)
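
The scraper is not PyMC-specific: in principle, any Sphinx-built site that wraps page content in an <article> element can be crawled the same way. Below is a minimal sketch of re-pointing it at the PyMC examples site (the alternative BASE_URL already commented out above); "index.html" as the entry page is an assumption, chosen only because it is the conventional Sphinx landing page.

# Hypothetical configuration for a different Sphinx project (entry page not verified).
BASE_URL = "https://www.pymc.io/projects/examples/en/latest/"
initial_resource = "index.html"  # assumed landing page; output would go to index.txt
already_visited = {urljoin(BASE_URL, initial_resource)}
crawl_page(initial_resource, BASE_URL, already_visited)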