Last active
June 24, 2024 21:58
-
-
Save rbatista191/a577063b743c32b89b2e51f9ec93b4ea to your computer and use it in GitHub Desktop.
Markdown URL Redirect Updater: This script updates URLs in Markdown files that lead to 301 redirects. It traverses through a specified directory (and its subdirectories), finds all .md files, and updates any URLs that result in a 301 redirect to their final destination.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Markdown URL Redirect Updater

This script updates URLs in Markdown files that lead to 301 redirects. It traverses
through a specified directory (and its subdirectories), finds all .md files, and
updates any URLs that result in a 301 redirect to their final destination.

Usage:
1. Ensure you have the 'requests' library installed: pip install requests
2. Place this script in a sibling directory to the 'content' folder containing your blog posts
3. Run the script: python update_md_urls.py

The script will automatically process all .md files in the "../content/docs/blog"
directory and its subdirectories.

Note: This script makes HTTP requests to each URL found in your Markdown files.
Depending on the number of files and links, this process might take some time.
Be mindful of rate limiting if you're making many requests to the same domain.

Author: rbatista191
Date: 24/06/2024
"""
import os | |
import re | |
import requests | |
from urllib.parse import urlparse, urljoin | |
def _resolve_permanent_redirect(url, timeout=5):
    """Return the final URL when *url* redirects permanently, else ``None``.

    A HEAD request is issued with redirects followed. The final URL is
    returned only if at least one redirect happened, every hop in the chain
    was permanent (301 or 308), and the final response was a 200 at a URL
    different from *url*.

    Args:
        url (str): Absolute http(s) URL to probe.
        timeout (float): Timeout in seconds passed to ``requests.head``.

    Returns:
        str | None: The redirect destination, or ``None`` if the link should
        be left unchanged.
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)
    except requests.RequestException:
        # Best effort: unreachable or malformed URLs are left untouched.
        return None

    hops = response.history
    # An empty history means no redirect took place; comparing URLs alone is
    # not enough because the requested URL may be normalised in response.url.
    if not hops or response.status_code != 200:
        return None
    # Temporary redirects (302/303/307) may change later, so a chain that
    # contains any of them is not baked into the document.
    if any(hop.status_code not in (301, 308) for hop in hops):
        return None
    if response.url == url:
        return None
    return response.url


def update_urls_in_file(file_path):
    """
    Update permanently redirected URLs in a single Markdown file.

    This function reads a Markdown file, finds all inline links
    (``[text](target)``), checks whether each absolute http(s) URL leads to a
    permanent redirect, and rewrites it to its final destination if it does.
    An optional link title after the URL is preserved. The file is rewritten
    only when at least one link changed, and a status line is printed either
    way.

    Args:
        file_path (str): The path to the Markdown file to be processed.

    Returns:
        None
    """
    # newline='' disables newline translation so that a rewrite keeps the
    # file's existing line endings.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        content = file.read()

    # Inline Markdown link; the target may contain one level of balanced
    # parentheses, e.g. ".../Foo_(bar)".
    url_pattern = r'\[([^\]]+)\]\(((?:[^()]|\([^()]*\))+)\)'
    # Splits a link target into leading whitespace, the URL itself and
    # whatever follows it (typically an optional "title").
    target_pattern = re.compile(r'(\s*)(\S+)(.*)', re.DOTALL)
    # Each distinct URL is probed at most once per file.
    resolved = {}

    def replace_url(match):
        """
        Check a single Markdown link for a permanent redirect.

        Args:
            match (re.Match): A regex match object containing the Markdown link.

        Returns:
            str: The original Markdown link or an updated one if a permanent
            redirect was found.
        """
        link_text, target = match.groups()
        parts = target_pattern.fullmatch(target)
        if parts is None:
            # Target consists of whitespace only; nothing to check.
            return match.group()
        lead, url, trail = parts.groups()

        try:
            parsed_url = urlparse(url)
        except ValueError:
            # urlparse rejects some malformed hosts (e.g. an unbalanced
            # "[" in the netloc); leave such links as they are.
            return match.group()
        if parsed_url.scheme not in ('http', 'https') or not parsed_url.netloc:
            return match.group()

        if url not in resolved:
            resolved[url] = _resolve_permanent_redirect(url)
        final_url = resolved[url]
        if final_url is None:
            return match.group()
        return f'[{link_text}]({lead}{final_url}{trail})'

    updated_content = re.sub(url_pattern, replace_url, content)
    if updated_content != content:
        with open(file_path, 'w', encoding='utf-8', newline='') as file:
            file.write(updated_content)
        print(f"Updated: {file_path}")
    else:
        print(f"No changes: {file_path}")
def process_directory(directory):
    """
    Process all Markdown files in a directory and its subdirectories.

    This function walks through the specified directory and all its
    subdirectories in sorted order, and calls ``update_urls_in_file`` on
    every file whose name ends in ``.md`` (case-insensitive). A file that
    cannot be read, decoded or written is reported and skipped so that one
    bad file does not abort the rest of the run.

    Args:
        directory (str): The path to the directory to be processed.

    Returns:
        None
    """
    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for name in sorted(files):
            if not name.lower().endswith('.md'):
                continue
            file_path = os.path.join(root, name)
            try:
                update_urls_in_file(file_path)
            except (OSError, UnicodeDecodeError) as error:
                print(f"Skipped: {file_path} ({error})")
if __name__ == "__main__":
    # Resolve the blog directory relative to this script, not the current
    # working directory, so the script can be launched from anywhere.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    blog_dir = os.path.abspath(os.path.join(script_dir, "..", "content", "docs", "blog"))

    # os.walk yields nothing for a missing directory, so without this check
    # the script would report success without having processed anything.
    if not os.path.isdir(blog_dir):
        raise SystemExit(f"Blog directory not found: {blog_dir}")

    print(f"Processing Markdown files in: {blog_dir}")
    process_directory(blog_dir)
    print("Processing complete.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment