Check if links are broken; for the second run, replace `TARGET_URL` with "https://gnusha.org/url".
import re
import requests
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# Target URL to search for
TARGET_URL = "https://lists.linuxfoundation.org/pipermail"


def find_links_in_file(file_path):
    """Extract specific links from a markdown file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Find all markdown links [text](url)
    links = re.findall(fr'\[([^\]]+)\]\(({TARGET_URL}[^)]+)\)', content)

    # Find all bare links; exclude ')' so URLs inside markdown links
    # aren't captured with a trailing parenthesis
    bare_links = re.findall(fr'({TARGET_URL}[^\s)]+)', content)

    # Combine all found links (take URL from markdown links)
    all_links = [link[1] for link in links] + bare_links
    return list(set(all_links))  # Remove duplicates


def check_url(url):
    """Check if a URL is accessible."""
    try:
        response = requests.head(url, timeout=10, allow_redirects=True)
        if response.status_code == 405:  # If HEAD method not allowed, try GET
            response = requests.get(url, timeout=10)
        return url, response.status_code < 400, response.status_code
    except requests.RequestException as e:
        return url, False, str(e)


def main():
    # Get repository path from command line
    import argparse
    parser = argparse.ArgumentParser(description=f'Check links starting with {TARGET_URL}')
    parser.add_argument('repo_path', help='Path to the repository')
    args = parser.parse_args()
    repo_path = Path(args.repo_path)

    # Find all markdown files and collect links
    print(f"Finding links starting with {TARGET_URL}...")
    all_links = []
    for md_file in repo_path.rglob('*.md'):
        links = find_links_in_file(md_file)
        if links:
            print(f"\nFound in {md_file.relative_to(repo_path)}:")
            for link in links:
                print(f"  {link}")
            all_links.extend(links)

    if not all_links:
        print(f"\nNo links starting with {TARGET_URL} found.")
        return

    unique_links = list(set(all_links))
    print(f"\nChecking {len(unique_links)} unique links...")

    # Check all links in parallel
    with ThreadPoolExecutor(max_workers=5) as executor:
        results = list(tqdm(
            executor.map(check_url, unique_links),
            total=len(unique_links)
        ))

    # Print results
    print("\nResults:")
    print("========")
    print("\nBroken Links:")
    broken = False
    for url, is_working, status in results:
        if not is_working:
            broken = True
            print(f"❌ {url}")
            print(f"   Error: {status}")

    if not broken:
        print("No broken links found!")


if __name__ == '__main__':
    main()
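
To see what `find_links_in_file` picks up, here is a minimal sketch; the `check_links` module name and the sample markdown file are assumptions, not part of the gist:

```python
# Minimal sketch: run find_links_in_file on a throwaway markdown file,
# assuming the script above is saved as check_links.py in the current directory.
from pathlib import Path
from check_links import find_links_in_file

sample = Path("sample.md")
sample.write_text(
    "[post](https://lists.linuxfoundation.org/pipermail/bitcoin-dev/2015-August/010238.html) "
    "and a bare link https://lists.linuxfoundation.org/pipermail/bitcoin-dev/\n"
)
print(find_links_in_file(sample))
# Expect both URLs, deduplicated, with no trailing ')' on the markdown one.
```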
Cool, thanks for double checking. Running `check_url` for most lists.linuxfoundation.org URLs is unnecessary, because you can just manually check that the list index is gone: https://lists.linuxfoundation.org/pipermail/bitcoin-dev returning a 404 would indicate that everything under pipermail/bitcoin-dev/* will be 404 as well.
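
As a rough illustration of that shortcut, here is a minimal sketch; it only assumes requests and the pipermail index URL quoted above, and is not part of the gist itself:

```python
# Minimal sketch of the shortcut described above: if the pipermail index for the
# bitcoin-dev list is gone, treat every archived message under it as gone too,
# instead of HEAD-requesting each URL individually.
import requests

INDEX_URL = "https://lists.linuxfoundation.org/pipermail/bitcoin-dev"

try:
    response = requests.head(INDEX_URL, timeout=10, allow_redirects=True)
    index_gone = response.status_code == 404
except requests.RequestException:
    index_gone = True  # treat network errors as "cannot confirm it exists"

if index_gone:
    print(f"{INDEX_URL} is gone; treat all pipermail/bitcoin-dev/* links as broken.")
else:
    print(f"{INDEX_URL} still responds; check individual links as usual.")
```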