Created
March 19, 2024 15:32
-
-
Save wjkennedy/dcf48c897294ab7f42376f0d5f53303b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from urllib.parse import urlparse, urljoin | |
import os | |
import html2text | |
from bs4 import BeautifulSoup | |
import xml.etree.ElementTree as ET | |
# --- Configuration ---------------------------------------------------------
# Root sitemap of the site to mirror, the host that counts as "internal",
# and the directory the Markdown output is written into.
sitemap_url = "https://support.atlassian.com/sitemap.xml"
domain = "support.atlassian.com"
output_dir = "downloaded_markdown"

# Create the output directory up front; exist_ok makes reruns harmless.
os.makedirs(output_dir, exist_ok=True)
def is_internal(url):
    """Return True when *url*'s host is the configured target domain."""
    host = urlparse(url).netloc
    return host == domain
def fetch_content(url):
    """Download *url* and return the raw response body as bytes.

    Returns None (after printing the error) on any request failure so the
    caller can treat a broken page as "skip" rather than aborting the crawl.
    """
    try:
        # Fix: requests.get() without a timeout can block forever on an
        # unresponsive server and stall the entire crawl.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.content
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None
def save_markdown(url, content):
    """Write *content* to a ``.md`` file under ``output_dir`` named after *url*.

    The URL path is flattened ("/a/b" -> "a_b"); the site root becomes
    "index". NOTE(review): distinct paths can still collide after
    flattening and will overwrite each other — acceptable for a mirror.
    """
    slug = urlparse(url).path.strip("/").replace("/", "_")
    if not slug:
        slug = "index"
    # Bug fix: the original threw away the computed name and wrote every
    # page to one hard-coded file, so only the last page crawled survived.
    filename = os.path.join(output_dir, f"{slug}.md")
    with open(filename, "w", encoding="utf-8") as file:
        file.write(content)
    print(f"Saved {filename}")
def crawl_page(url, visited):
    """Crawl *url* and every reachable internal page, saving each as Markdown.

    *visited* is a shared set of already-processed URLs; it is mutated in
    place so sitemap-driven and link-driven crawling never repeat a page.

    Fixes over the recursive original:
    - explicit work stack: deep link chains would otherwise overflow
      Python's default recursion limit (~1000 frames);
    - "#fragment" suffixes are stripped so one document is not fetched
      once per anchor;
    - tolerant decoding so one mislabelled non-UTF-8 page cannot abort
      the whole crawl.
    """
    stack = [url]
    while stack:
        current = stack.pop()
        if current in visited:
            continue
        visited.add(current)
        print(f"Crawling: {current}")
        html = fetch_content(current)
        if html is None:
            continue
        # Convert the HTML body to Markdown, keeping hyperlinks.
        h = html2text.HTML2Text()
        h.ignore_links = False
        markdown = h.handle(html.decode('utf-8', errors='replace'))
        # Save the markdown content.
        save_markdown(current, markdown)
        # Queue every not-yet-seen internal link for crawling.
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a', href=True):
            # Resolve relative hrefs against the current page, then drop
            # any fragment — fragments never change the fetched document.
            full_url = urljoin(current, link['href']).split('#', 1)[0]
            if is_internal(full_url) and full_url not in visited:
                stack.append(full_url)
# XML namespace used by standard sitemap / sitemap-index documents.
_SITEMAP_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}


def process_sitemap(url, visited_urls):
    """Fetch a sitemap (or sitemap index) at *url* and crawl its pages.

    Recurses into nested <sitemap> index entries, then hands every
    internal, unvisited <url><loc> to crawl_page. *visited_urls* is the
    shared set of already-crawled URLs.
    """
    content = fetch_content(url)
    if content is None:
        return
    # Fix: guard against malformed XML instead of crashing the whole run.
    try:
        root = ET.fromstring(content)
    except ET.ParseError as e:
        print(f"Invalid sitemap XML at {url}: {e}")
        return
    # Sitemap-index entries: recurse into each child sitemap.
    for sitemap in root.findall('.//sm:sitemap', _SITEMAP_NS):
        loc = sitemap.find('sm:loc', _SITEMAP_NS)
        # Fix: the original dereferenced .text unconditionally and raised
        # AttributeError on any entry missing its <loc> element.
        if loc is not None and loc.text:
            process_sitemap(loc.text, visited_urls)
    # Concrete page entries: crawl each internal, unvisited URL.
    for entry in root.findall('.//sm:url', _SITEMAP_NS):
        loc = entry.find('sm:loc', _SITEMAP_NS)
        if loc is not None and loc.text:
            page = loc.text
            if is_internal(page) and page not in visited_urls:
                crawl_page(page, visited_urls)
def main():
    """Entry point: walk the root sitemap and crawl every discovered page."""
    seen = set()
    process_sitemap(sitemap_url, seen)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.