Created
March 19, 2024 15:32
-
-
Save wjkennedy/dcf48c897294ab7f42376f0d5f53303b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from urllib.parse import urlparse, urljoin | |
import os | |
import html2text | |
from bs4 import BeautifulSoup | |
import xml.etree.ElementTree as ET | |
# --- Configuration ---------------------------------------------------------
# Root sitemap of the site to mirror, the host that counts as "internal",
# and the directory the Markdown output is written into.
sitemap_url = "https://support.atlassian.com/sitemap.xml"
domain = "support.atlassian.com"
output_dir = "downloaded_markdown"

# Create the output directory up front; exist_ok makes reruns harmless.
os.makedirs(output_dir, exist_ok=True)
def is_internal(url):
    """Return True when *url*'s host is the configured target domain."""
    host = urlparse(url).netloc
    return host == domain
def fetch_content(url):
    """Download *url* and return the raw response body as bytes.

    Returns None (after printing the error) on any request failure so the
    caller can treat a broken page as "skip" rather than aborting the crawl.
    """
    try:
        # Fix: requests.get() without a timeout can block forever on an
        # unresponsive server and stall the entire crawl.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.content
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None
def save_markdown(url, content):
    """Write *content* to a ``.md`` file under ``output_dir`` named after *url*.

    The URL path is flattened ("/a/b" -> "a_b"); the site root becomes
    "index". NOTE(review): distinct paths can still collide after
    flattening and will overwrite each other — acceptable for a mirror.
    """
    slug = urlparse(url).path.strip("/").replace("/", "_")
    if not slug:
        slug = "index"
    # Bug fix: the original threw away the computed name and wrote every
    # page to one hard-coded file, so only the last page crawled survived.
    filename = os.path.join(output_dir, f"{slug}.md")
    with open(filename, "w", encoding="utf-8") as file:
        file.write(content)
    print(f"Saved {filename}")
def crawl_page(url, visited):
    """Crawl *url* and every reachable internal page, saving each as Markdown.

    *visited* is a shared set of already-processed URLs; it is mutated in
    place so sitemap-driven and link-driven crawling never repeat a page.

    Fixes over the recursive original:
    - explicit work stack: deep link chains would otherwise overflow
      Python's default recursion limit (~1000 frames);
    - "#fragment" suffixes are stripped so one document is not fetched
      once per anchor;
    - tolerant decoding so one mislabelled non-UTF-8 page cannot abort
      the whole crawl.
    """
    stack = [url]
    while stack:
        current = stack.pop()
        if current in visited:
            continue
        visited.add(current)
        print(f"Crawling: {current}")
        html = fetch_content(current)
        if html is None:
            continue
        # Convert the HTML body to Markdown, keeping hyperlinks.
        h = html2text.HTML2Text()
        h.ignore_links = False
        markdown = h.handle(html.decode('utf-8', errors='replace'))
        # Save the markdown content.
        save_markdown(current, markdown)
        # Queue every not-yet-seen internal link for crawling.
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a', href=True):
            # Resolve relative hrefs against the current page, then drop
            # any fragment — fragments never change the fetched document.
            full_url = urljoin(current, link['href']).split('#', 1)[0]
            if is_internal(full_url) and full_url not in visited:
                stack.append(full_url)
# XML namespace used by standard sitemap / sitemap-index documents.
_SITEMAP_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}


def process_sitemap(url, visited_urls):
    """Fetch a sitemap (or sitemap index) at *url* and crawl its pages.

    Recurses into nested <sitemap> index entries, then hands every
    internal, unvisited <url><loc> to crawl_page. *visited_urls* is the
    shared set of already-crawled URLs.
    """
    content = fetch_content(url)
    if content is None:
        return
    # Fix: guard against malformed XML instead of crashing the whole run.
    try:
        root = ET.fromstring(content)
    except ET.ParseError as e:
        print(f"Invalid sitemap XML at {url}: {e}")
        return
    # Sitemap-index entries: recurse into each child sitemap.
    for sitemap in root.findall('.//sm:sitemap', _SITEMAP_NS):
        loc = sitemap.find('sm:loc', _SITEMAP_NS)
        # Fix: the original dereferenced .text unconditionally and raised
        # AttributeError on any entry missing its <loc> element.
        if loc is not None and loc.text:
            process_sitemap(loc.text, visited_urls)
    # Concrete page entries: crawl each internal, unvisited URL.
    for entry in root.findall('.//sm:url', _SITEMAP_NS):
        loc = entry.find('sm:loc', _SITEMAP_NS)
        if loc is not None and loc.text:
            page = loc.text
            if is_internal(page) and page not in visited_urls:
                crawl_page(page, visited_urls)
def main():
    """Entry point: walk the root sitemap and crawl every discovered page."""
    seen = set()
    process_sitemap(sitemap_url, seen)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.