Skip to content

Instantly share code, notes, and snippets.

@rbatista191
Last active June 24, 2024 21:58
Show Gist options
  • Save rbatista191/a577063b743c32b89b2e51f9ec93b4ea to your computer and use it in GitHub Desktop.
Save rbatista191/a577063b743c32b89b2e51f9ec93b4ea to your computer and use it in GitHub Desktop.
Markdown URL Redirect Updater: This script updates URLs in Markdown files that lead to 301 redirects. It traverses through a specified directory (and its subdirectories), finds all .md files, and updates any URLs that result in a 301 redirect to their final destination.
"""
Markdown URL Redirect Updater
This script updates URLs in Markdown files that lead to 301 redirects. It traverses
through a specified directory (and its subdirectories), finds all .md files, and
updates any URLs that result in a 301 redirect to their final destination.
Usage:
1. Ensure you have the 'requests' library installed: pip install requests
2. Place this script in a sibling directory to the 'content' folder containing your blog posts
3. Run the script: python update_md_urls.py
The script will automatically process all .md files in the "../content/docs/blog"
directory and its subdirectories.
Note: This script makes HTTP requests to each URL found in your Markdown files.
Depending on the number of files and links, this process might take some time.
Be mindful of rate limiting if you're making many requests to the same domain.
Author: rbatista191
Date: 24/06/2024
"""
import os
import re
import requests
from urllib.parse import urlparse, urljoin
# HTTP status codes that mark a redirect as permanent. Only these are safe to
# persist into the Markdown source; temporary redirects (302/303/307) may point
# at login pages, geo-specific mirrors, etc. and must not be written back.
_PERMANENT_REDIRECT_CODES = (301, 308)


def _resolve_permanent_redirect(url, timeout=5):
    """
    Return the final destination of a permanently redirected URL.

    Args:
        url (str): Absolute http(s) URL to probe with a HEAD request.
        timeout (float): Per-request timeout in seconds.

    Returns:
        str or None: The final URL when the request ended with status 200 after
        a chain made up exclusively of permanent redirects; otherwise None.
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)
    except requests.RequestException:
        # Best effort: unreachable or malformed URLs are simply left untouched.
        return None

    hops = response.history
    # An empty history means no redirect happened; response.url may still
    # differ from the input purely because requests normalised it.
    if response.status_code != 200 or not hops:
        return None
    if any(hop.status_code not in _PERMANENT_REDIRECT_CODES for hop in hops):
        return None
    if response.url == url:
        return None
    return response.url


def update_urls_in_file(file_path):
    """
    Update URLs in a single Markdown file.

    This function reads a Markdown file, finds all inline links of the form
    ``[text](target)``, checks whether each absolute http(s) URL is permanently
    redirected, and rewrites it to the final destination if so. Relative links,
    non-HTTP schemes and unparsable targets are left exactly as they are. An
    optional link title (``[text](url "Title")``) is preserved.

    Args:
        file_path (str): The path to the Markdown file to be processed.

    Returns:
        None
    """
    # newline='' disables newline translation so that rewriting the file does
    # not silently convert CRLF line endings.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        content = file.read()

    # Regular expression to find URLs in Markdown
    url_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
    # Per-file cache so a URL that appears several times is requested once.
    resolved = {}

    def replace_url(match):
        """
        Check a single URL for redirects and replace if necessary.

        Args:
            match (re.Match): A regex match object containing the Markdown link.

        Returns:
            str: The original Markdown link or an updated one if a permanent
            redirect was found.
        """
        link_text, target = match.groups()

        # Separate the URL from surrounding whitespace and an optional title.
        parts = re.match(r'(\s*)(\S+)(.*)', target, re.DOTALL)
        if parts is None:
            return match.group()
        lead, url, trail = parts.groups()

        try:
            parsed_url = urlparse(url)
        except ValueError:
            # e.g. "http://[bad" (unbalanced IPv6 bracket) - not a usable URL.
            return match.group()

        # Only absolute web URLs can be probed over HTTP.
        if parsed_url.scheme not in ('http', 'https') or not parsed_url.netloc:
            return match.group()

        if url not in resolved:
            resolved[url] = _resolve_permanent_redirect(url)
        new_url = resolved[url]
        if new_url is None:
            return match.group()
        return f'[{link_text}]({lead}{new_url}{trail})'

    updated_content = re.sub(url_pattern, replace_url, content)

    if updated_content != content:
        with open(file_path, 'w', encoding='utf-8', newline='') as file:
            file.write(updated_content)
        print(f"Updated: {file_path}")
    else:
        print(f"No changes: {file_path}")
def process_directory(directory):
    """
    Recursively update the URLs of every Markdown file below a directory.

    The directory tree is traversed with ``os.walk``; each file whose name
    ends in ``.md`` is handed to ``update_urls_in_file``. Other files are
    ignored.

    Args:
        directory (str): The path to the directory to be processed.

    Returns:
        None
    """
    for current_dir, _subdirs, filenames in os.walk(directory):
        # Keep only Markdown files, in the order os.walk reported them.
        markdown_names = (name for name in filenames if name.endswith('.md'))
        for name in markdown_names:
            update_urls_in_file(os.path.join(current_dir, name))
if __name__ == "__main__":
    # Resolve the blog folder relative to this script's own location (not the
    # current working directory), so the script can be launched from anywhere.
    script_location = os.path.abspath(__file__)
    parent_of_script_dir = os.path.join(os.path.dirname(script_location), "..")
    blog_dir = os.path.abspath(
        os.path.join(parent_of_script_dir, "content", "docs", "blog")
    )

    print(f"Processing Markdown files in: {blog_dir}")
    process_directory(blog_dir)
    print("Processing complete.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment