Skip to content

Instantly share code, notes, and snippets.

@andreagrandi
Last active February 11, 2024 09:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andreagrandi/0a7bf6e217d6561b00b6a5de6211ddaa to your computer and use it in GitHub Desktop.
Save andreagrandi/0a7bf6e217d6561b00b6a5de6211ddaa to your computer and use it in GitHub Desktop.
Python script to migrate posts from Pelican to Hugo
# Migration script from Pelican to Hugo
import os, re, shutil
from pathlib import Path
# Folder that is walked for Pelican posts; it is also the prefix prepended to
# image paths when images are copied.
INPUT_FOLDER = "content"
# Root folder under which the per-year article folders are created.
OUTPUT_FOLDER = "content-hugo"
def sort_key(path):
    """Return the leading number of a post's file name, for numeric sorting.

    Only the base name is inspected, so ``content/10-foo.md`` yields ``10``.
    The number is the run of ASCII digits at the start of the name; it does
    not have to be followed by a dash, so ``2020things.md`` yields ``2020``.

    Args:
        path: Path (or bare file name) of a post.

    Returns:
        The numeric prefix as an ``int``.

    Raises:
        ValueError: If the file name does not start with a digit.
    """
    basename = os.path.basename(path)
    # Match the digits directly rather than splitting on "-": a name that
    # starts with a digit but has no dash right after the number would
    # otherwise make int() fail.
    leading_number = re.match(r"[0-9]+", basename)
    if leading_number is None:
        raise ValueError(f"no leading number in file name: {basename!r}")
    return int(leading_number.group())
def get_posts_filenames(folder):
    """Return the article Markdown files found under *folder*, sorted numerically.

    The folder is walked recursively. A file is kept when its path ends in
    ``.md`` and ``is_article`` accepts it; the result is ordered with
    ``sort_key``.
    """
    posts = []
    for directory, _subdirs, names in os.walk(folder):
        candidates = (os.path.join(directory, name) for name in names)
        posts.extend(
            candidate
            for candidate in candidates
            if candidate.endswith(".md") and is_article(candidate)
        )
    posts.sort(key=sort_key)
    return posts
def read_file_content(file_name):
    """Return the whole text of *file_name*, read in text mode."""
    with open(file_name, 'r') as handle:
        return handle.read()
def metadata_to_yaml(metadata):
    """Convert a Pelican metadata header into a YAML front matter block.

    Each ``Key: value`` line of *metadata* is translated as follows (keys are
    lower-cased first):

    * ``date``     -> ``date`` with only the part before the first space kept
    * ``status``   -> ``draft: false`` when the value is ``published``,
      otherwise ``draft: true``
    * ``tags``     -> ``tags`` as a YAML list (comma-separated input)
    * ``category`` -> ``categories`` as a YAML list (comma-separated input)
    * ``summary``  -> ``description`` as a double-quoted string
    * ``author``   -> dropped
    * anything else is copied as a double-quoted string

    Lines without a colon, or with nothing before the colon, are ignored.

    Args:
        metadata: The metadata header, one ``Key: value`` pair per line.

    Returns:
        The front matter, starting with ``---`` and ending with ``---``
        (no trailing newline).
    """

    def quoted(text):
        # Backslashes and double quotes must be escaped inside a YAML
        # double-quoted scalar, otherwise the front matter does not parse.
        escaped = text.replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    def yaml_list(name, text):
        # Split on bare commas and trim, so "a,b" and "a, b" both work.
        items = [item.strip() for item in text.split(",") if item.strip()]
        if not items:
            return f"{name}: []\n"
        joined = "\n- ".join(items)
        return f"{name}: \n- {joined}\n"

    parts = ["---\n"]
    for line in metadata.split("\n"):
        if ":" not in line:
            continue
        # Prefer the "Key: value" form; fall back to a bare colon so lines
        # such as "Summary:" (empty value) or "Slug:foo" do not raise.
        key, separator, value = line.partition(": ")
        if not separator:
            key, _, value = line.partition(":")
        key = key.strip().lower()
        value = value.strip()
        if not key:
            continue

        if key == "date":
            # Keep only the date, dropping the time part.
            date_only = value.split(" ")[0]
            parts.append(f"date: {date_only}\n")
        elif key == "status":
            draft = "false" if value == "published" else "true"
            parts.append(f"draft: {draft}\n")
        elif key == "tags":
            parts.append(yaml_list("tags", value))
        elif key == "category":
            parts.append(yaml_list("categories", value))
        elif key == "summary":
            parts.append(f"description: {quoted(value)}\n")
        elif key == "author":
            continue
        else:
            parts.append(f"{key}: {quoted(value)}\n")
    parts.append("---")
    return "".join(parts)
def replace_indented_blocks(text, indent="    "):
    """Convert indented code blocks into fenced code blocks.

    A line starting with ``indent`` followed by ``:::`` opens a fence, and
    whatever follows ``:::`` becomes the fence's language tag. Other lines
    starting with ``indent`` are code lines and lose that one level of
    indentation. The first non-blank, non-indented line after a code block
    closes the fence.

    Compared with a plain line-by-line rewrite, this also:

    * closes a fence that is still open when the text ends;
    * opens a fence for an indented block that has no ``:::`` header, so
      fences always come in pairs;
    * closes the previous block when a new ``:::`` header starts while a
      block is still open;
    * puts the closing fence on its own line even when no blank line
      separates the code from the following paragraph.

    Args:
        text: Markdown body of a post.
        indent: Prefix that marks a code line. Defaults to four spaces, the
            indentation Markdown requires for an indented code block.

    Returns:
        The converted text. Lines are joined with newlines and a final
        newline is always appended.
    """
    header_prefix = indent + ":::"
    output = []
    in_code_block = False

    def close_fence():
        # Move one blank line from inside the block to after the closing
        # fence, so the fence sits right below the last code line.
        if output and output[-1] == "":
            output.pop()
        output.extend(["```", ""])

    for line in text.split("\n"):
        if not line.strip():
            # Blank (or whitespace-only) lines never open or close a block.
            output.append("")
        elif line.startswith(header_prefix):
            if in_code_block:
                close_fence()
            in_code_block = True
            output.append("```" + line[len(header_prefix):])
        elif line.startswith(indent):
            if not in_code_block:
                output.append("```")
                in_code_block = True
            output.append(line[len(indent):])
        else:
            if in_code_block:
                close_fence()
                in_code_block = False
            output.append(line)

    if in_code_block:
        # The text ended inside a code block: drop trailing blank lines and
        # close the fence so the output stays balanced.
        while output and output[-1] == "":
            output.pop()
        output.append("```")

    return "\n".join(output) + "\n"
def parse_year_from_metadata(metadata):
    """Return the year of the first ``date:`` line in *metadata*.

    The year is the text before the first ``-`` of the date value. An empty
    string is returned when no line starts with ``date:``.
    """
    date_values = (
        entry.split("date:")[1].strip()
        for entry in metadata.split('\n')
        if entry.startswith("date:")
    )
    first_date = next(date_values, None)
    if first_date is None:
        return ""
    return first_date.split("-")[0]
def parse_slug_from_metadata(metadata):
    """Return the value of the first ``slug:`` line in *metadata*.

    Surrounding whitespace and every double quote are removed from the
    value. An empty string is returned when no line starts with ``slug:``.
    """
    for entry in metadata.split('\n'):
        if not entry.startswith("slug:"):
            continue
        raw_value = entry.split("slug:")[1]
        return raw_value.strip().replace('"', '')
    return ""
def is_article(path):
    """Tell whether *path* names an article.

    A file counts as an article when its base name, without the extension,
    starts with one of the digits 0-9.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    return any(stem.startswith(digit) for digit in "0123456789")
def get_article_id(path):
    """Return the next article number for the folder *path*.

    The number is the count of entries currently in the folder plus one.
    ``None`` is returned when *path* is not an existing directory.
    """
    if not os.path.isdir(path):
        return None
    return len(os.listdir(path)) + 1
def find_images(text):
    """Return the image paths referenced through ``{static}`` in *text*.

    Only references of the form ``{static}/images/YYYY/MM/name.ext`` are
    recognised, e.g. ``({static}/images/2017/10/keybase_identity.png)``.
    Each returned path starts at ``/images`` (the ``{static}`` marker is
    not included).
    """
    image_reference = re.compile(r"\{static\}(/images/\d{4}/\d{2}/[^)\s\"]+\.\w+)")
    return [hit.group(1) for hit in image_reference.finditer(text)]
def get_filename(image_path):
    """Return the last component (the file name) of *image_path*."""
    _folder, file_name = os.path.split(image_path)
    return file_name
def copy_images_to_article_folder(images, article_path, article_id, slug):
    """Copy each referenced image into the article's own folder.

    Every entry of *images* is looked up under ``INPUT_FOLDER`` and copied
    to ``{article_path}/{article_id}-{slug}/`` under its bare file name.
    Images that do not exist on disk are skipped.
    """
    destination_folder = f"{article_path}/{article_id}-{slug}"
    for relative_path in images:
        source = f"{INPUT_FOLDER}{relative_path}"
        if not os.path.exists(source):
            continue
        target = f"{destination_folder}/{get_filename(source)}"
        shutil.copy(source, target)
def replace_paths_with_filenames(text):
    """Replace every ``{static}/images/YYYY/MM/name.ext`` reference by ``name.ext``.

    The file-name part of the pattern stops at a closing parenthesis,
    whitespace or a double quote, the same delimiters ``find_images`` uses.
    Without the quote/whitespace stops, two references in the same text with
    no ``)`` between them (e.g. in HTML attributes) would be swallowed into
    a single match and the first one lost.

    Args:
        text: Markdown body of a post.

    Returns:
        The text with each matching reference reduced to its file name.
    """
    pattern = r"\{static\}(/images/[0-9]{4}/[0-9]{2}/[^)\s\"]+\.\w+)"
    return re.sub(pattern, lambda match: os.path.basename(match.group(1)), text)
def process_files(files):
    """Convert each Pelican article in *files* into a Hugo page bundle.

    For every path accepted by ``is_article`` the file is read and split at
    the first blank line into metadata and content. The result is written to
    ``{OUTPUT_FOLDER}/{year}/{article_id}-{slug}/index.md``, with the images
    referenced by the post copied alongside it.

    A file that has no blank line separating metadata from content is
    reported on standard output and skipped, so one malformed post does not
    abort the whole migration.

    Args:
        files: Iterable of paths to Pelican Markdown files.
    """
    for file in files:
        if not is_article(file):
            continue

        file_content = read_file_content(file)
        # partition() never raises; an empty separator means the blank line
        # between metadata and content is missing.
        metadata, separator, content = file_content.partition("\n\n")
        if not separator:
            print(f"Skipping {file}: no blank line between metadata and content")
            continue

        # Convert the metadata to YAML, then read year and slug back from it.
        yaml_metadata = metadata_to_yaml(metadata)
        year = parse_year_from_metadata(yaml_metadata)
        slug = parse_slug_from_metadata(yaml_metadata)

        # The year folder must exist before the article ID is computed,
        # because the ID is derived from the folder's current entries.
        article_path = f"{OUTPUT_FOLDER}/{year}"
        Path(article_path).mkdir(parents=True, exist_ok=True)
        article_id = get_article_id(article_path)

        article_folder = f"{article_path}/{article_id}-{slug}"
        Path(article_folder).mkdir(parents=True, exist_ok=True)

        # Images are looked up in the converted content, before their paths
        # are shortened to bare file names.
        content = replace_indented_blocks(content)
        images = find_images(content)
        copy_images_to_article_folder(images, article_path, article_id, slug)
        content = replace_paths_with_filenames(content)

        final_content = f"{yaml_metadata}\n\n{content}"
        with open(f"{article_folder}/index.md", 'w') as output:
            output.write(final_content)
if __name__ == "__main__":
    # Collect the posts under INPUT_FOLDER, then convert them.
    process_files(get_posts_filenames(INPUT_FOLDER))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment