@wjkennedy
Created March 19, 2024 15:32
import requests
from urllib.parse import urlparse, urljoin
import os
import html2text
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
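
# Third-party dependencies (assumed to be installed), e.g.:
#   pip install requests beautifulsoup4 html2text
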
# Configuration
sitemap_url = "https://support.atlassian.com/sitemap.xml"
domain = "support.atlassian.com"
output_dir = "downloaded_markdown"
os.makedirs(output_dir, exist_ok=True)

def is_internal(url):
    """Return True if the URL's host matches the target domain."""
    return urlparse(url).netloc == domain

def fetch_content(url):
    """Fetch a URL and return the raw response body, or None on failure."""
    try:
        # A timeout keeps the crawl from hanging on an unresponsive server
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.content
    except requests.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return None

def save_markdown(url, content):
    """Write Markdown content to a file named after the URL path."""
    filename = urlparse(url).path.strip("/").replace("/", "_")
    if not filename:
        filename = "index"
    filepath = os.path.join(output_dir, f"{filename}.md")
    with open(filepath, "w", encoding="utf-8") as file:
        file.write(content)
    print(f"Saved {filepath}")

def crawl_page(url, visited):
    """Download a page, save it as Markdown, and recurse into its internal links."""
    if url in visited:
        return
    visited.add(url)
    print(f"Crawling: {url}")
    html = fetch_content(url)
    if html is None:
        return
    # Convert HTML to Markdown
    h = html2text.HTML2Text()
    h.ignore_links = False
    markdown = h.handle(html.decode("utf-8", errors="replace"))
    # Save the Markdown content
    save_markdown(url, markdown)
    # Find all links and follow the internal ones
    soup = BeautifulSoup(html, "html.parser")
    for link in soup.find_all("a", href=True):
        # Resolve relative links against the current page
        full_url = urljoin(url, link["href"])
        if is_internal(full_url) and full_url not in visited:
            # Note: recursion can get deep on a large site; an explicit
            # queue would be safer for a full crawl
            crawl_page(full_url, visited)
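
# For reference, the conversion step above turns markup such as
#   "<h1>Hi</h1><p>A <a href='/x'>link</a></p>"
# into Markdown along the lines of
#   "# Hi\n\nA [link](/x)\n"
# (exact whitespace can vary between html2text versions).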

# Sitemap files use a single XML namespace for every element
SITEMAP_NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"

def process_sitemap(url, visited_urls):
    """Parse a sitemap (or sitemap index) and crawl every internal URL in it."""
    content = fetch_content(url)
    if content is None:
        return
    root = ET.fromstring(content)
    # A sitemap index nests further <sitemap> entries; recurse into each one
    for sitemap in root.findall(f".//{SITEMAP_NS}sitemap"):
        loc = sitemap.find(f"{SITEMAP_NS}loc").text
        process_sitemap(loc, visited_urls)
    # A plain sitemap lists <url> entries; crawl the internal ones
    for url_entry in root.findall(f".//{SITEMAP_NS}url"):
        loc = url_entry.find(f"{SITEMAP_NS}loc").text
        if is_internal(loc) and loc not in visited_urls:
            crawl_page(loc, visited_urls)

def main():
    visited_urls = set()
    process_sitemap(sitemap_url, visited_urls)

if __name__ == "__main__":
    main()
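
A quick way to sanity-check the namespace-qualified lookups in process_sitemap without hitting the network is to feed the same parsing logic a toy sitemap. This is a minimal standalone sketch; the sample XML and variable names are my own:

import xml.etree.ElementTree as ET

NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
sample = (
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
    '<url><loc>https://support.atlassian.com/jira/</loc></url>'
    '</urlset>'
)
root = ET.fromstring(sample)
for entry in root.findall(f".//{NS}url"):
    print(entry.find(f"{NS}loc").text)  # -> https://support.atlassian.com/jira/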