# andreagrandi/pelican2hugo.py

## pelican2hugo.py
# Migration script from Pelican to Hugo
import os, re, shutil
from pathlib import Path

# Root of the Pelican content tree: it is walked recursively to find the
# articles, and image paths referenced by articles are resolved against it.
INPUT_FOLDER = "content"
# Root of the generated Hugo content; articles are written below it in one
# sub-folder per year.
OUTPUT_FOLDER = "content-hugo"

# Custom sort key function
def sort_key(path):
    """Return the numeric prefix of an article file name as an ``int``.

    Sorting on the integer value keeps ``10-...`` after ``9-...`` instead of
    ordering the names as text.

    Only the leading run of ASCII digits is used, so digit-prefixed names
    without a dash right after the number (``404.md``, ``2fa-guide.md``)
    no longer make ``int()`` fail.

    Args:
        path: path of the article file; only its base name is inspected.

    Raises:
        ValueError: if the base name does not start with a digit.
    """
    basename = os.path.basename(path)
    leading_digits = re.match(r"[0-9]+", basename)
    if leading_digits is None:
        raise ValueError(f"no numeric prefix in file name: {basename!r}")
    return int(leading_digits.group())

def get_posts_filenames(folder):
    """Return the article paths found under *folder*, ordered by ``sort_key``.

    The tree is walked recursively; a file is kept when its path ends in
    ``.md`` and ``is_article`` accepts it.
    """
    candidates = (
        os.path.join(dirpath, name)
        for dirpath, _, names in os.walk(folder)
        for name in names
    )
    posts = [
        candidate
        for candidate in candidates
        if candidate.endswith(".md") and is_article(candidate)
    ]
    posts.sort(key=sort_key)
    return posts

def read_file_content(file_name):
    """Return the whole text of *file_name*, opened in text mode."""
    with open(file_name, 'r') as handle:
        return handle.read()

def metadata_to_yaml(metadata):
    """Convert a Pelican metadata header into Hugo YAML front matter.

    Args:
        metadata: the ``Key: value`` lines from the top of a Pelican
            article, separated by newlines.

    Returns:
        The front matter as a string delimited by ``---`` lines, without a
        trailing newline.

    Key handling (keys are lower-cased first):
        * ``date``     -> only the part before the first space is kept.
        * ``status``   -> ``draft: false`` for ``published``, else
          ``draft: true``.
        * ``tags``     -> YAML list.
        * ``category`` -> YAML list stored under ``categories``.
        * ``summary``  -> stored under ``description``.
        * ``author``   -> dropped.
        * any other key is copied as a double-quoted string.

    Lines without a colon or without a key are ignored.
    """

    def quoted(text):
        # Escape backslashes and double quotes so that a value such as
        # `Say "hi"` still yields a valid YAML double-quoted scalar.
        escaped = text.replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    def as_list(name, text):
        # Comma-separated values become one YAML list item each; empty
        # entries are ignored. Returns None when nothing is left.
        items = [item.strip() for item in text.split(",") if item.strip()]
        if not items:
            return None
        return f"{name}: \n- " + "\n- ".join(items)

    yaml_lines = ["---"]
    for line in metadata.split("\n"):
        # partition() never raises, whereas unpacking split(": ", 1) failed
        # on lines such as "Tags:" or "Title:value".
        key, separator, value = line.partition(":")
        key = key.strip().lower()
        if not separator or not key:
            continue
        value = value.strip()

        if key == "date":
            # Drop the time part: "2017-10-01 10:30" -> "2017-10-01".
            day = value.split(" ")[0]
            yaml_lines.append(f"{key}: {day}")
        elif key == "status":
            draft = "false" if value.lower() == "published" else "true"
            yaml_lines.append(f"draft: {draft}")
        elif key in ("tags", "category"):
            entry = as_list("tags" if key == "tags" else "categories", value)
            if entry is not None:
                yaml_lines.append(entry)
        elif key == "summary":
            yaml_lines.append(f"description: {quoted(value)}")
        elif key == "author":
            continue
        else:
            yaml_lines.append(f"{key}: {quoted(value)}")

    yaml_lines.append("---")
    return "\n".join(yaml_lines)

def replace_indented_blocks(text):
    """Turn four-space indented code blocks into fenced code blocks.

    A block may start with a ``    :::lang`` line, which becomes the opening
    fence followed by ``lang``. A block that starts with an ordinary
    indented line gets a bare opening fence. Code lines lose their first
    four spaces. The block ends at the first non-empty line that does not
    start with a space; one blank line directly before that line is
    replaced by the closing fence, which is followed by a blank line.

    Compared with the previous implementation this also:
      * opens a fence for blocks that have no ``:::`` header (they used to
        get a closing fence only);
      * closes a block that runs to the end of *text*;
      * keeps the closing fence on its own line when the code is followed
        directly by text, with no blank line in between.

    Args:
        text: Markdown body of an article.

    Returns:
        The converted text; every line, including the last, is terminated
        by a newline.
    """
    fence = "```"
    output = []
    in_code_block = False

    for line in text.split("\n"):
        # A non-empty line that does not start with a space ends the block.
        if in_code_block and line != "" and not line.startswith(" "):
            in_code_block = False
            # Swallow one blank separator line so the fence follows the code.
            if output and output[-1] == "":
                output.pop()
            output.extend([fence, ""])

        if line.startswith("    :::"):
            in_code_block = True
            line = line.replace("    :::", fence, 1)
        elif line.startswith("    ") and ":::" not in line:
            if in_code_block:
                line = line[4:]
            elif line.strip():
                # First line of a header-less block: emit the opening fence.
                # Whitespace-only lines never start a block.
                output.append(fence)
                in_code_block = True
                line = line[4:]

        output.append(line)

    if in_code_block:
        # The text ended inside a block: drop trailing blanks and close it.
        while output and output[-1] == "":
            output.pop()
        output.append(fence)

    return "\n".join(output) + "\n"

def parse_year_from_metadata(metadata):
    """Return the year of the first ``date:`` line in *metadata*.

    The year is whatever precedes the first ``-`` of the date value. An
    empty string is returned when no line starts with ``date:``.
    """
    prefix = "date:"
    for entry in metadata.split('\n'):
        if not entry.startswith(prefix):
            continue
        date_value = entry.split(prefix)[1].strip()
        return date_value.split("-")[0]
    return ""

def parse_slug_from_metadata(metadata):
    """Return the value of the first ``slug:`` line, without double quotes.

    An empty string is returned when no line starts with ``slug:``.
    """
    marker = "slug:"
    slug_lines = (
        entry for entry in metadata.split('\n') if entry.startswith(marker)
    )
    first = next(slug_lines, None)
    if first is None:
        return ""
    return first.split(marker)[1].strip().replace('"', '')

def is_article(path):
    """Tell whether the file name of *path* starts with an ASCII digit."""
    stem = os.path.splitext(os.path.basename(path))[0]
    # Slicing (rather than indexing) keeps an empty name from raising.
    return stem[:1] in set("0123456789")

def get_article_id(path):
    """Return the number of entries in directory *path* plus one.

    Returns ``None`` when *path* does not exist or is not a directory.
    """
    if not os.path.isdir(path):
        return None
    return len(os.listdir(path)) + 1

def find_images(text):
    """Collect the ``{static}/images/<yyyy>/<mm>/<file>`` references in *text*.

    Returns the referenced paths without the ``{static}`` prefix, in order
    of appearance, e.g. ``/images/2017/10/keybase_identity.png``.
    """
    image_ref = re.compile(r"\{static\}(/images/\d{4}/\d{2}/[^)\s\"]+\.\w+)")
    return [found.group(1) for found in image_ref.finditer(text)]

def get_filename(image_path):
    """Return the last path component of *image_path*."""
    return os.path.basename(image_path)

def copy_images_to_article_folder(images, article_path, article_id, slug):
    """Copy the referenced images into the ``<article_id>-<slug>`` folder.

    Each entry of *images* is appended to ``INPUT_FOLDER`` to locate the
    source file. Sources that do not exist are skipped silently.
    """
    for relative_path in images:
        source = f"{INPUT_FOLDER}{relative_path}"
        if not os.path.exists(source):
            continue
        destination = f"{article_path}/{article_id}-{slug}/{get_filename(source)}"
        shutil.copy(source, destination)

def replace_paths_with_filenames(text):
    """Rewrite ``{static}/images/<yyyy>/<mm>/<file>`` references to ``<file>``.

    The path part stops at the first ``)``, whitespace or double quote, the
    same rule ``find_images`` uses. The previous pattern only stopped at
    ``)``, so two HTML ``<img>`` references without a ``)`` between them
    were matched as one span and the text in between was lost.

    Args:
        text: Markdown body of an article.

    Returns:
        *text* with every matching reference replaced by its file name.
    """
    pattern = r"\{static\}(/images/[0-9]{4}/[0-9]{2}/[^)\s\"]+\.\w+)"
    return re.sub(pattern, lambda match: os.path.basename(match.group(1)), text)

def process_files(files):
    """Convert each Pelican article in *files* into a Hugo page folder.

    For every path accepted by ``is_article`` the file is split into its
    metadata header and body, the header is converted to YAML, and the
    result is written to
    ``OUTPUT_FOLDER/<year>/<article_id>-<slug>/index.md``. Images
    referenced by the body are copied next to ``index.md``.

    ``article_id`` comes from ``get_article_id`` on the year folder and is
    read before the article folder is created.

    Args:
        files: iterable of article file paths.
    """
    for file in files:
        if not is_article(file):
            continue

        file_content = read_file_content(file)
        # partition() tolerates a header-only file with no blank line;
        # unpacking split("\n\n", 1) raised ValueError and aborted the run.
        metadata, _, content = file_content.partition("\n\n")

        # Convert the header first: year and slug are read from the YAML.
        yaml_metadata = metadata_to_yaml(metadata)
        year = parse_year_from_metadata(yaml_metadata)
        slug = parse_slug_from_metadata(yaml_metadata)

        # The year folder must exist before its entries can be counted.
        article_path = f"{OUTPUT_FOLDER}/{year}"
        Path(article_path).mkdir(parents=True, exist_ok=True)

        article_id = get_article_id(article_path)
        article_folder = f"{article_path}/{article_id}-{slug}"
        Path(article_folder).mkdir(parents=True, exist_ok=True)

        # Convert code blocks, then copy the images while the body still
        # holds their full paths, and only then shorten those paths.
        content = replace_indented_blocks(content)
        images = find_images(content)
        copy_images_to_article_folder(images, article_path, article_id, slug)
        content = replace_paths_with_filenames(content)

        final_content = f"{yaml_metadata}\n\n{content}"
        with open(f"{article_folder}/index.md", 'w') as output:
            output.write(final_content)

if __name__ == "__main__":
    # Script entry point: convert every article found under INPUT_FOLDER.
    process_files(get_posts_filenames(INPUT_FOLDER))
	# Migration script from Pelican to Hugo
	import os, re, shutil
	from pathlib import Path

	# Root of the Pelican content tree that is scanned for articles.
	INPUT_FOLDER = "content"
	# Root of the generated Hugo content, with one sub-folder per year.
	OUTPUT_FOLDER = "content-hugo"

	# Custom sort key function
	def sort_key(path):
	    """Return the numeric prefix of an article file name as an ``int``.

	    Only the leading run of ASCII digits is used, so digit-prefixed
	    names without a dash right after the number (``404.md``) are
	    accepted.

	    Raises:
	        ValueError: if the base name does not start with a digit.
	    """
	    basename = os.path.basename(path)
	    leading_digits = re.match(r"[0-9]+", basename)
	    if leading_digits is None:
	        raise ValueError(f"no numeric prefix in file name: {basename!r}")
	    return int(leading_digits.group())

	def get_posts_filenames(folder):
	    """Return the article paths found under *folder*, ordered by ``sort_key``.

	    The tree is walked recursively; a file is kept when its path ends in
	    ``.md`` and ``is_article`` accepts it.
	    """
	    posts = [
	        os.path.join(dirpath, name)
	        for dirpath, _, names in os.walk(folder)
	        for name in names
	        if name.endswith(".md") and is_article(name)
	    ]
	    return sorted(posts, key=sort_key)

	def read_file_content(file_name):
	    """Return the whole text of *file_name*, opened in text mode."""
	    with open(file_name, 'r') as handle:
	        return handle.read()

	def metadata_to_yaml(metadata):
	    """Convert a Pelican metadata header into Hugo YAML front matter.

	    Args:
	        metadata: newline-separated ``Key: value`` lines.

	    Returns:
	        The front matter delimited by ``---`` lines, without a trailing
	        newline.

	    Keys are lower-cased. ``date`` keeps only the part before the first
	    space, ``status`` becomes ``draft``, ``tags`` and ``category``
	    (renamed ``categories``) become YAML lists, ``summary`` is renamed
	    ``description``, ``author`` is dropped, and any other key is copied
	    as a double-quoted string. Lines without a colon or key are ignored.
	    """

	    def quoted(text):
	        # Escape backslashes and double quotes to keep the YAML valid.
	        escaped = text.replace("\\", "\\\\").replace('"', '\\"')
	        return f'"{escaped}"'

	    def as_list(name, text):
	        # One YAML list item per comma-separated value; None when empty.
	        items = [item.strip() for item in text.split(",") if item.strip()]
	        if not items:
	            return None
	        return f"{name}: \n- " + "\n- ".join(items)

	    yaml_lines = ["---"]
	    for line in metadata.split("\n"):
	        # partition() never raises, unlike unpacking split(": ", 1).
	        key, separator, value = line.partition(":")
	        key = key.strip().lower()
	        if not separator or not key:
	            continue
	        value = value.strip()

	        if key == "date":
	            # Drop the time part: "2017-10-01 10:30" -> "2017-10-01".
	            day = value.split(" ")[0]
	            yaml_lines.append(f"{key}: {day}")
	        elif key == "status":
	            draft = "false" if value.lower() == "published" else "true"
	            yaml_lines.append(f"draft: {draft}")
	        elif key in ("tags", "category"):
	            entry = as_list("tags" if key == "tags" else "categories", value)
	            if entry is not None:
	                yaml_lines.append(entry)
	        elif key == "summary":
	            yaml_lines.append(f"description: {quoted(value)}")
	        elif key == "author":
	            continue
	        else:
	            yaml_lines.append(f"{key}: {quoted(value)}")

	    yaml_lines.append("---")
	    return "\n".join(yaml_lines)

	def replace_indented_blocks(text):
	    """Turn four-space indented code blocks into fenced code blocks.

	    A ``    :::lang`` line becomes the opening fence followed by
	    ``lang``; a block starting with an ordinary indented line gets a
	    bare opening fence. Code lines lose their first four spaces. The
	    block ends at the first non-empty line that does not start with a
	    space, or at the end of *text*.

	    Returns:
	        The converted text; every line is terminated by a newline.
	    """
	    fence = "```"
	    output = []
	    in_code_block = False

	    for line in text.split("\n"):
	        # A non-empty line that does not start with a space ends the block.
	        if in_code_block and line != "" and not line.startswith(" "):
	            in_code_block = False
	            # Swallow one blank separator so the fence follows the code.
	            if output and output[-1] == "":
	                output.pop()
	            output.extend([fence, ""])

	        if line.startswith("    :::"):
	            in_code_block = True
	            line = line.replace("    :::", fence, 1)
	        elif line.startswith("    ") and ":::" not in line:
	            if in_code_block:
	                line = line[4:]
	            elif line.strip():
	                # First line of a header-less block: open the fence.
	                output.append(fence)
	                in_code_block = True
	                line = line[4:]

	        output.append(line)

	    if in_code_block:
	        # The text ended inside a block: drop trailing blanks, close it.
	        while output and output[-1] == "":
	            output.pop()
	        output.append(fence)

	    return "\n".join(output) + "\n"

	def parse_year_from_metadata(metadata):
	    """Return the year of the first ``date:`` line in *metadata*.

	    The year is whatever precedes the first ``-`` of the date value. An
	    empty string is returned when no line starts with ``date:``.
	    """
	    prefix = "date:"
	    for entry in metadata.split('\n'):
	        if entry.startswith(prefix):
	            date_value = entry.split(prefix)[1].strip()
	            return date_value.split("-")[0]
	    return ""

	def parse_slug_from_metadata(metadata):
	    """Return the value of the first ``slug:`` line, without double quotes.

	    An empty string is returned when no line starts with ``slug:``.
	    """
	    marker = "slug:"
	    for entry in metadata.split('\n'):
	        if entry.startswith(marker):
	            return entry.split(marker)[1].strip().replace('"', '')
	    return ""

	def is_article(path):
	    """Tell whether the file name of *path* starts with an ASCII digit."""
	    stem = os.path.splitext(os.path.basename(path))[0]
	    # Slicing (rather than indexing) keeps an empty name from raising.
	    return stem[:1] in set("0123456789")

	def get_article_id(path):
	    """Return the number of entries in directory *path* plus one.

	    Returns ``None`` when *path* does not exist or is not a directory.
	    """
	    if not os.path.isdir(path):
	        return None
	    return len(os.listdir(path)) + 1

	def find_images(text):
	    """Collect the ``{static}/images/<yyyy>/<mm>/<file>`` references in *text*.

	    Returns the referenced paths without the ``{static}`` prefix, in
	    order of appearance.
	    """
	    pattern = r"\{static\}(/images/\d{4}/\d{2}/[^)\s\"]+\.\w+)"
	    return re.findall(pattern, text)

	def get_filename(image_path):
	    """Return the last path component of *image_path*."""
	    return os.path.basename(image_path)

	def copy_images_to_article_folder(images, article_path, article_id, slug):
	    """Copy the referenced images into the ``<article_id>-<slug>`` folder.

	    Each entry of *images* is appended to ``INPUT_FOLDER`` to locate the
	    source file. Sources that do not exist are skipped silently.
	    """
	    for relative_path in images:
	        source = f"{INPUT_FOLDER}{relative_path}"
	        if not os.path.exists(source):
	            continue
	        destination = f"{article_path}/{article_id}-{slug}/{get_filename(source)}"
	        shutil.copy(source, destination)

	def replace_paths_with_filenames(text):
	    """Rewrite ``{static}/images/<yyyy>/<mm>/<file>`` references to ``<file>``.

	    The path part stops at the first ``)``, whitespace or double quote,
	    the same rule ``find_images`` uses, so neighbouring references are
	    never merged into one match.
	    """
	    pattern = r"\{static\}(/images/[0-9]{4}/[0-9]{2}/[^)\s\"]+\.\w+)"
	    return re.sub(pattern, lambda match: os.path.basename(match.group(1)), text)

	def process_files(files):
	    """Convert each Pelican article in *files* into a Hugo page folder.

	    For every path accepted by ``is_article`` the file is split into its
	    metadata header and body, the header is converted to YAML, and the
	    result is written to
	    ``OUTPUT_FOLDER/<year>/<article_id>-<slug>/index.md``. Images
	    referenced by the body are copied next to ``index.md``.
	    """
	    for file in files:
	        if not is_article(file):
	            continue

	        file_content = read_file_content(file)
	        # partition() tolerates a header-only file with no blank line.
	        metadata, _, content = file_content.partition("\n\n")

	        # Convert the header first: year and slug are read from the YAML.
	        yaml_metadata = metadata_to_yaml(metadata)
	        year = parse_year_from_metadata(yaml_metadata)
	        slug = parse_slug_from_metadata(yaml_metadata)

	        # The year folder must exist before its entries can be counted.
	        article_path = f"{OUTPUT_FOLDER}/{year}"
	        Path(article_path).mkdir(parents=True, exist_ok=True)

	        article_id = get_article_id(article_path)
	        article_folder = f"{article_path}/{article_id}-{slug}"
	        Path(article_folder).mkdir(parents=True, exist_ok=True)

	        # Copy the images while the body still holds their full paths.
	        content = replace_indented_blocks(content)
	        images = find_images(content)
	        copy_images_to_article_folder(images, article_path, article_id, slug)
	        content = replace_paths_with_filenames(content)

	        final_content = f"{yaml_metadata}\n\n{content}"
	        with open(f"{article_folder}/index.md", 'w') as output:
	            output.write(final_content)

	if __name__ == "__main__":
	    # Script entry point: convert every article found under INPUT_FOLDER.
	    process_files(get_posts_filenames(INPUT_FOLDER))