rbatista191/update_markdown_redirect_urls.py

## update_markdown_redirect_urls.py
"""
Markdown URL Redirect Updater

This script updates URLs in Markdown files that lead to 301 redirects. It traverses
through a specified directory (and its subdirectories), finds all .md files, and
updates any URLs that result in a 301 redirect to their final destination.

Usage:
1. Ensure you have the 'requests' library installed: pip install requests
2. Place this script in a sibling directory to the 'content' folder containing your blog posts
3. Run the script: python update_markdown_redirect_urls.py

The script will automatically process all .md files in the "../content/docs/blog"
directory and its subdirectories.

Note: This script makes HTTP requests to each URL found in your Markdown files.
Depending on the number of files and links, this process might take some time.
Be mindful of rate limiting if you're making many requests to the same domain.

Author: rbatista191
Date: 24/06/2024
"""

import os
import re
import requests
from urllib.parse import urlparse, urljoin

def update_urls_in_file(file_path):
    """
    Update permanently redirected URLs in a single Markdown file.

    Every inline Markdown link or image of the form ``[text](destination)``
    whose destination is an absolute URL (it has both a scheme and a host) is
    checked with an HTTP HEAD request. When the URL was actually redirected,
    every hop of the redirect chain was permanent (301 or 308) and the final
    response is 200, the destination is replaced with the final URL. An
    optional link title (``[text](url "Title")``) is kept as is. Relative
    links, links that cannot be resolved and temporarily redirected links are
    left untouched. The file is rewritten only when at least one link changed.

    Args:
    file_path (str): The path to the Markdown file to be processed.

    Returns:
    None
    """
    # newline='' disables newline translation so that a rewritten file keeps
    # its original line endings (e.g. CRLF) instead of being normalised.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        content = file.read()

    # Inline Markdown links. The destination may contain one level of
    # balanced parentheses, e.g. https://en.wikipedia.org/wiki/Foo_(bar).
    url_pattern = r'\[([^\]]+)\]\(((?:[^()]|\([^()]*\))+)\)'

    # Only permanent redirects justify rewriting a link; temporary ones
    # (302, 303, 307) may point to login pages, locale variants, etc.
    permanent_redirects = (301, 308)

    # URL -> resolved URL, so a link repeated in the file is requested once.
    resolved = {}

    def resolve(url):
        """
        Return the final destination of a permanently redirected URL.

        Args:
        url (str): The absolute URL to check.

        Returns:
        str: The final URL, or the unchanged input when there is no
        permanent redirect or the request failed.
        """
        if url not in resolved:
            final_url = url
            try:
                response = requests.head(url, allow_redirects=True, timeout=5)
            except requests.RequestException:
                # Best effort: an unreachable or invalid URL is left as is.
                pass
            else:
                hops = response.history
                # response.url may differ from url through normalisation
                # alone, so a redirect is recognised by a non-empty history.
                if (response.status_code == 200 and hops
                        and all(hop.status_code in permanent_redirects
                                for hop in hops)):
                    final_url = response.url
            resolved[url] = final_url
        return resolved[url]

    def replace_url(match):
        """
        Check a single URL for redirects and replace if necessary.

        Args:
        match (re.Match): A regex match object containing the Markdown link.

        Returns:
        str: The original Markdown link or an updated one if a redirect was found.
        """
        link_text, target = match.groups()

        # Split "<url> <optional title>" so the title is never sent as part
        # of the request and is preserved verbatim in the output.
        parts = re.match(r'(\s*)(\S+)(.*)', target, re.DOTALL)
        if parts is None:
            return match.group()
        leading, url, trailing = parts.groups()

        try:
            parsed_url = urlparse(url)
        except ValueError:
            # Malformed destination (e.g. a broken IPv6 literal): skip it.
            return match.group()

        if not (parsed_url.scheme and parsed_url.netloc):
            return match.group()

        final_url = resolve(url)
        if final_url == url:
            return match.group()
        return f'[{link_text}]({leading}{final_url}{trailing})'

    updated_content = re.sub(url_pattern, replace_url, content)

    if updated_content != content:
        with open(file_path, 'w', encoding='utf-8', newline='') as file:
            file.write(updated_content)
        print(f"Updated: {file_path}")
    else:
        print(f"No changes: {file_path}")

def process_directory(directory):
    """
    Update the URLs of every Markdown file below a directory.

    The directory tree is traversed with os.walk; each file whose name ends
    in '.md' is handed to update_urls_in_file.

    Args:
    directory (str): The path to the directory to be processed.

    Returns:
    None
    """
    for current_dir, _subdirs, names in os.walk(directory):
        # Keep os.walk's own ordering; just drop everything that is not .md.
        markdown_names = (name for name in names if name.endswith('.md'))
        for name in markdown_names:
            update_urls_in_file(os.path.join(current_dir, name))

if __name__ == "__main__":
    # The blog posts live in ../content/docs/blog relative to this script,
    # independent of the directory the script is started from.
    here = os.path.dirname(os.path.abspath(__file__))
    blog_segments = ("..", "content", "docs", "blog")
    blog_dir = os.path.abspath(os.path.join(here, *blog_segments))

    print(f"Processing Markdown files in: {blog_dir}")
    process_directory(blog_dir)
    print("Processing complete.")
	"""
	Markdown URL Redirect Updater

	This script updates URLs in Markdown files that lead to 301 redirects. It traverses
	through a specified directory (and its subdirectories), finds all .md files, and
	updates any URLs that result in a 301 redirect to their final destination.

	Usage:
	1. Ensure you have the 'requests' library installed: pip install requests
	2. Place this script in a sibling directory to the 'content' folder containing your blog posts
	3. Run the script: python update_markdown_redirect_urls.py

	The script will automatically process all .md files in the "../content/docs/blog"
	directory and its subdirectories.

	Note: This script makes HTTP requests to each URL found in your Markdown files.
	Depending on the number of files and links, this process might take some time.
	Be mindful of rate limiting if you're making many requests to the same domain.

	Author: rbatista191
	Date: 24/06/2024
	"""

	import os
	import re
	import requests
	from urllib.parse import urlparse, urljoin

	def update_urls_in_file(file_path):
		"""
		Update permanently redirected URLs in a single Markdown file.

		Every inline Markdown link or image of the form ``[text](destination)``
		whose destination is an absolute URL (it has both a scheme and a host) is
		checked with an HTTP HEAD request. When the URL was actually redirected,
		every hop of the redirect chain was permanent (301 or 308) and the final
		response is 200, the destination is replaced with the final URL. An
		optional link title (``[text](url "Title")``) is kept as is. Relative
		links, links that cannot be resolved and temporarily redirected links are
		left untouched. The file is rewritten only when at least one link changed.

		Args:
		file_path (str): The path to the Markdown file to be processed.

		Returns:
		None
		"""
		# newline='' disables newline translation so that a rewritten file keeps
		# its original line endings (e.g. CRLF) instead of being normalised.
		with open(file_path, 'r', encoding='utf-8', newline='') as file:
			content = file.read()

		# Inline Markdown links. The destination may contain one level of
		# balanced parentheses, e.g. https://en.wikipedia.org/wiki/Foo_(bar).
		url_pattern = r'\[([^\]]+)\]\(((?:[^()]|\([^()]*\))+)\)'

		# Only permanent redirects justify rewriting a link; temporary ones
		# (302, 303, 307) may point to login pages, locale variants, etc.
		permanent_redirects = (301, 308)

		# URL -> resolved URL, so a link repeated in the file is requested once.
		resolved = {}

		def resolve(url):
			"""
			Return the final destination of a permanently redirected URL.

			Args:
			url (str): The absolute URL to check.

			Returns:
			str: The final URL, or the unchanged input when there is no
			permanent redirect or the request failed.
			"""
			if url not in resolved:
				final_url = url
				try:
					response = requests.head(url, allow_redirects=True, timeout=5)
				except requests.RequestException:
					# Best effort: an unreachable or invalid URL is left as is.
					pass
				else:
					hops = response.history
					# response.url may differ from url through normalisation
					# alone, so a redirect is recognised by a non-empty history.
					if (response.status_code == 200 and hops
							and all(hop.status_code in permanent_redirects
									for hop in hops)):
						final_url = response.url
				resolved[url] = final_url
			return resolved[url]

		def replace_url(match):
			"""
			Check a single URL for redirects and replace if necessary.

			Args:
			match (re.Match): A regex match object containing the Markdown link.

			Returns:
			str: The original Markdown link or an updated one if a redirect was found.
			"""
			link_text, target = match.groups()

			# Split "<url> <optional title>" so the title is never sent as part
			# of the request and is preserved verbatim in the output.
			parts = re.match(r'(\s*)(\S+)(.*)', target, re.DOTALL)
			if parts is None:
				return match.group()
			leading, url, trailing = parts.groups()

			try:
				parsed_url = urlparse(url)
			except ValueError:
				# Malformed destination (e.g. a broken IPv6 literal): skip it.
				return match.group()

			if not (parsed_url.scheme and parsed_url.netloc):
				return match.group()

			final_url = resolve(url)
			if final_url == url:
				return match.group()
			return f'[{link_text}]({leading}{final_url}{trailing})'

		updated_content = re.sub(url_pattern, replace_url, content)

		if updated_content != content:
			with open(file_path, 'w', encoding='utf-8', newline='') as file:
				file.write(updated_content)
			print(f"Updated: {file_path}")
		else:
			print(f"No changes: {file_path}")

	def process_directory(directory):
		"""
		Process all Markdown files in a directory and its subdirectories.

		This function walks through the specified directory and all its
		subdirectories with os.walk and passes every file whose name ends in
		'.md' to update_urls_in_file.

		Args:
		directory (str): The path to the directory to be processed.

		Returns:
		None
		"""
		for root, _, files in os.walk(directory):
			for file in files:
				if file.endswith('.md'):
					file_path = os.path.join(root, file)
					update_urls_in_file(file_path)

	if __name__ == "__main__":
		# Get the current script's directory so the blog path does not depend
		# on the directory the script is started from.
		script_dir = os.path.dirname(os.path.abspath(__file__))

		# Construct the path to the blog directory
		blog_dir = os.path.abspath(os.path.join(script_dir, "..", "content", "docs", "blog"))

		print(f"Processing Markdown files in: {blog_dir}")
		process_directory(blog_dir)
		print("Processing complete.")