Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save frankmeeuwsen/50caa33e5b73b237fd5117d80fe8d148 to your computer and use it in GitHub Desktop.
Save frankmeeuwsen/50caa33e5b73b237fd5117d80fe8d148 to your computer and use it in GitHub Desktop.
Gist from Drafts

Converting Old CMS Files to Hugo Blog Posts

human (Oct 26, 2024, 07:41 AM)

Take a look at these 4 files. They are written in an old CMS. I have thousands of these files. How would we convert those to Hugo files we can use in a blog? Let's give a start with this

00100.php

<?php /* pivot */ die(); ?>a:12:{s:4:"code";i:100;s:4:"date";s:16:"2000-07-30-10-09";s:4:"user";s:6:"Punkey";s:5:"title";s:21:"Up in the mornin' and";s:6:"status";s:7:"publish";s:14:"allow_comments";s:1:"1";s:10:"convert_lb";i:1;s:12:"publish_date";s:16:"2000-07-30-10-09";s:9:"edit_date";s:16:"2000-07-30-10-09";s:8:"category";a:1:{i:0;s:11:"Old Blogger";}s:12:"introduction";s:170:"<a href="http://www.toshistation.com/funk/funk.htm">Up in the mornin' and out to school, the teacher is teachin' the Golden Rule</a>

woohoo....the periodic table of FUNK";s:4:"body";s:0:"";}

00101.php

<?php /* pivot */ die(); ?>a:12:{s:4:"code";i:101;s:4:"date";s:16:"2000-07-30-11-23";s:4:"user";s:6:"Punkey";s:5:"title";s:38:"getthehelloutside.com HA! Wie heeft er";s:6:"status";s:7:"publish";s:14:"allow_comments";s:1:"1";s:10:"convert_lb";i:1;s:12:"publish_date";s:16:"2000-07-30-11-23";s:9:"edit_date";s:16:"2000-07-30-11-23";s:8:"category";a:1:{i:0;s:11:"Old Blogger";}s:12:"introduction";s:198:"<a href="http://www.getthehelloutside.com/">getthehelloutside.com </a>

HA! Wie heeft er nu nog een VVV nodig als je dit ook al online hebt?
Dus nu die PC uitzetten en naar buiten gaan...doe maar...";s:4:"body";s:0:"";}

00103.php

<?php /* pivot */ die(); ?>a:12:{s:4:"code";i:103;s:4:"date";s:16:"2000-08-01-09-21";s:4:"user";s:6:"Punkey";s:5:"title";s:38:"Super Greg hehehe...the BBoy-Mahir van";s:6:"status";s:7:"publish";s:14:"allow_comments";s:1:"1";s:10:"convert_lb";i:1;s:12:"publish_date";s:16:"2000-08-01-09-21";s:9:"edit_date";s:16:"2000-08-01-09-21";s:8:"category";a:1:{i:0;s:11:"Old Blogger";}s:12:"introduction";s:121:"<a href="http://www.supergreg.com/">Super Greg</a>

hehehe...the BBoy-Mahir van de nieuwe eeuw. SuperGreg does his thing!";s:4:"body";s:0:"";}

00104.php

<?php /* pivot */ die(); ?>a:12:{s:4:"code";i:104;s:4:"date";s:16:"2000-08-01-17-12";s:4:"user";s:6:"Punkey";s:5:"title";s:25:"Tja...dat vraag ik me nou";s:6:"status";s:7:"publish";s:14:"allow_comments";s:1:"1";s:10:"convert_lb";i:1;s:12:"publish_date";s:16:"2000-08-01-17-12";s:9:"edit_date";s:16:"2000-08-01-17-12";s:8:"category";a:1:{i:0;s:11:"Old Blogger";}s:12:"introduction";s:195:"Tja...dat vraag ik me nou al een hele poos af....

<a href="http://www.watdoenwemetwap.nl/">WatdoenwemetWAP.nl</a>

Als iemand een idee heeft...<a href="mailto:punkey@casema.net">mail</a> me maar";s:4:"body";s:0:"";}

00105.php

<?php /* pivot */ die(); ?>a:13:{s:4:"code";i:105;s:4:"date";s:16:"2000-08-02-11-37";s:4:"user";s:6:"Punkey";s:5:"title";s:24:"How to Make Napster Work";s:6:"status";s:7:"publish";s:14:"allow_comments";s:1:"1";s:10:"convert_lb";i:1;s:12:"publish_date";s:16:"2000-08-02-11-37";s:9:"edit_date";s:16:"2000-08-02-11-37";s:8:"category";a:1:{i:0;s:11:"Old Blogger";}s:12:"introduction";s:105:"<a href="http://c.moreover.com/click/here.pl?x8730457">How to Make Napster Work for the Record Groups</a>";s:4:"body";s:0:"";s:8:"comments";a:1:{i:0;a:6:{s:4:"name";s:10:" sex-fiend";s:5:"email";s:26:" gangastrotagati@yahoo.com";s:2:"ip";s:14:" 209.208.9.254";s:4:"date";s:16:"2003-11-07-08-29";s:3:"url";s:26:" http://www.sex-lover.org/";s:7:"comment";s:9:"sex-fiend";}}}

assistant (Oct 26, 2024, 07:41 AM)

I'll help you convert these old CMS files to Hugo-compatible markdown files. Let's create a Python script to handle this conversion.

CMS to Hugo Converter

import os
import re
from datetime import datetime
import json
import phpserialize

def parse_php_content(content):
    """Extract and deserialize the PHP-serialized payload of an entry file.

    The sample entry files start with a ``<?php /* pivot */ die(); ?>``
    guard; everything after the first ``?>`` is treated as the serialized
    payload.

    Args:
        content: Full text of the entry file.

    Returns:
        Whatever ``phpserialize.loads`` produces for the payload (called
        with ``decode_strings=True``), or ``None`` when nothing follows the
        guard or the payload cannot be deserialized.
    """
    # DOTALL lets the payload span the line breaks that occur inside posts.
    match = re.search(r'\?>(.+)$', content, re.DOTALL)
    if not match:
        return None

    serialized_data = match.group(1)
    try:
        return phpserialize.loads(serialized_data.encode(), decode_strings=True)
    except Exception as e:
        # Deliberately not a bare ``except``: Ctrl-C / SystemExit must still
        # be able to stop a long batch run. The reason is printed because
        # the caller only ever sees ``None``.
        print(f"Error parsing PHP data: {e}")
        return None

def create_hugo_content(data):
    """Render one parsed entry as a Hugo Markdown document.

    Args:
        data: Mapping describing one entry. Keys and string values may be
            ``str`` or UTF-8 ``bytes``; ``category`` may be a dict (the way
            phpserialize represents PHP arrays), a list/tuple, or a single
            string.

    Returns:
        A string holding the front matter (title, date, draft, categories,
        author) followed by the introduction and body text.
    """
    def field(name):
        # The parser in this file decodes strings, so keys normally arrive
        # as ``str``; bytes keys are still accepted for undecoded payloads.
        for key in (name, name.encode('utf-8')):
            if key in data:
                value = data[key]
                return value.decode('utf-8') if isinstance(value, bytes) else value
        return None

    # Convert the "YYYY-MM-DD-HH-MM" stamp; a missing or malformed date
    # leaves the front-matter value empty instead of aborting.
    try:
        date_obj = datetime.strptime(field('date'), '%Y-%m-%d-%H-%M')
    except (TypeError, ValueError):
        hugo_date = ""
    else:
        hugo_date = date_obj.strftime('%Y-%m-%d %H:%M:00')

    # Iterating a dict yields its keys (0, 1, ...), so take the values when
    # the categories arrive as a PHP-array-style dict.
    raw_categories = field('category')
    if isinstance(raw_categories, dict):
        raw_categories = list(raw_categories.values())
    elif isinstance(raw_categories, (str, bytes)):
        raw_categories = [raw_categories]
    elif not isinstance(raw_categories, (list, tuple)):
        raw_categories = []
    categories = [
        cat.decode('utf-8') if isinstance(cat, bytes) else str(cat)
        for cat in raw_categories
    ]

    # Combine introduction and body; the introduction, when present, is
    # followed by a blank line.
    content = ""
    intro = field('introduction')
    if intro is not None:
        content += f"{intro}\n\n"
    body = field('body')
    if body is not None:
        content += f"{body}"

    title = str(field('title') or "Untitled")
    author = str(field('user') or "Unknown")

    # json.dumps emits a double-quoted, escaped scalar, so quotes or
    # backslashes in a title cannot break the YAML front matter.
    hugo_content = f"""---
title: {json.dumps(title, ensure_ascii=False)}
date: {hugo_date}
draft: false
categories: {json.dumps(categories, ensure_ascii=False)}
author: {json.dumps(author, ensure_ascii=False)}
---

{content}
"""
    return hugo_content

def convert_file(filepath, output_dir):
    """Convert one entry file into a Hugo Markdown file.

    The output is written to ``<output_dir>/<YYYY-MM-DD>-<slug>.md``, where
    the slug is derived from the entry title. An existing file with that
    name is overwritten. Entries that cannot be parsed, or that lack a
    usable date, are reported on stdout and skipped.

    Args:
        filepath: Path of the entry file to read (opened as UTF-8 text).
        output_dir: Directory that receives the ``.md`` file; it must
            already exist.
    """
    # Read the file
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Parse the PHP content
    data = parse_php_content(content)
    if not data:
        print(f"Failed to parse: {filepath}")
        return

    def field(name):
        # The parser decodes strings, so keys normally arrive as ``str``;
        # bytes keys are still accepted for undecoded payloads.
        for key in (name, name.encode('utf-8')):
            if key in data:
                value = data[key]
                return value.decode('utf-8') if isinstance(value, bytes) else value
        return None

    # Create Hugo content
    hugo_content = create_hugo_content(data)

    # The filename needs a real date; skip the entry rather than abort the
    # whole batch when it is missing or malformed.
    try:
        date_obj = datetime.strptime(field('date'), '%Y-%m-%d-%H-%M')
    except (TypeError, ValueError) as e:
        print(f"Error converting {filepath}: {e}")
        return

    # Slug: drop punctuation, collapse whitespace/hyphen runs, trim stray
    # hyphens, and never leave it empty (e.g. a title made only of "...").
    title = str(field('title') or "untitled")
    slug = re.sub(r'[^\w\s-]', '', title.lower())
    slug = re.sub(r'[-\s]+', '-', slug).strip('-') or "untitled"
    filename = f"{date_obj.strftime('%Y-%m-%d')}-{slug}.md"

    # Save the file
    output_path = os.path.join(output_dir, filename)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(hugo_content)

    print(f"Converted: {filename}")

def main():
    """Convert the five sample entry files into the ``hugo_content`` folder."""
    output_dir = "hugo_content"
    # exist_ok=True: an already existing folder is not an error.
    os.makedirs(output_dir, exist_ok=True)

    # Sample entries, resolved relative to the current working directory.
    sample_entries = (
        "00100.php",
        "00101.php",
        "00103.php",
        "00104.php",
        "00105.php",
    )
    for entry_path in sample_entries:
        convert_file(entry_path, output_dir)


if __name__ == "__main__":
    main()

This script will convert your old CMS files to Hugo-compatible markdown files. Here's what it does:

  1. Parses the PHP serialized data using the phpserialize library
  2. Converts the dates to Hugo's preferred format
  3. Creates proper front matter with:
    • Title
    • Date
    • Categories
    • Author
    • Draft status
  4. Combines the introduction and body content
  5. Creates clean filenames using the date and slugified title
  6. Saves the converted files in a hugo_content directory

To use this script, you'll need to:

  1. Install the required library:
pip install phpserialize
  2. Put your PHP files in the same directory as the script
  3. Run the script

The output will be markdown files that Hugo can use, with names like 2000-07-30-up-in-the-mornin-and.md

Would you like me to:

  1. Add more features (like handling comments)?
  2. Modify how the content is formatted?
  3. Add support for processing entire directories recursively?

human (Oct 26, 2024, 07:45 AM)

I get an error Traceback (most recent call last): File "/Users/fmeeuwsen/Documents/Hobbies/Gitprojecten/p2h/convert.py", line 95, in main() File "/Users/fmeeuwsen/Documents/Hobbies/Gitprojecten/p2h/convert.py", line 92, in main convert_file(file, output_dir) File "/Users/fmeeuwsen/Documents/Hobbies/Gitprojecten/p2h/convert.py", line 68, in convert_file hugo_content = create_hugo_content(data) ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/Users/fmeeuwsen/Documents/Hobbies/Gitprojecten/p2h/convert.py", line 45, in create_hugo_content title: "{data[b'title'].decode('utf-8')}" ~~~~^^^^^^^^^^ KeyError: b'title'

assistant (Oct 26, 2024, 07:46 AM)

Here's a new version that only handles individual files with the improved error handling.

Simple CMS to Hugo Converter

import os
import re
from datetime import datetime
import json
import phpserialize

def get_value(data, key):
    """Look up ``key`` in ``data``, trying the str form first, then bytes.

    A ``str`` key found as-is returns the stored value untouched. Otherwise
    the UTF-8 encoded key (or the key itself, when it is not a ``str``) is
    tried, and a ``bytes`` value found that way is decoded as UTF-8.
    Returns ``None`` when neither lookup succeeds.
    """
    if isinstance(key, str):
        if key in data:
            return data[key]
        fallback_key = key.encode('utf-8')
    else:
        fallback_key = key

    if fallback_key not in data:
        return None

    found = data[fallback_key]
    if isinstance(found, bytes):
        return found.decode('utf-8')
    return found

def parse_php_content(content):
    """Deserialize the PHP-serialized payload that follows the PHP guard.

    Everything after the first ``?>`` in ``content`` is passed to
    ``phpserialize.loads`` (with ``decode_strings=True``). Returns the
    result, or ``None`` when there is no ``?>``, nothing follows it, or
    deserialization fails (the error is printed in that case).
    """
    # Split at the first "?>"; an absent tag or an empty remainder means
    # there is no payload to parse.
    _, closing_tag, payload = content.partition('?>')
    if not closing_tag or not payload:
        return None

    try:
        return phpserialize.loads(payload.encode(), decode_strings=True)
    except Exception as e:
        print(f"Error parsing PHP data: {e}")
        return None

def create_hugo_content(data):
    """Render one parsed entry as a Hugo Markdown document.

    Args:
        data: Mapping describing one entry, read through ``get_value`` so
            both str and bytes keys work. ``category`` may be a dict (the
            way phpserialize represents PHP arrays), a list/tuple, or a
            single string.

    Returns:
        A string holding the front matter (title, date, draft, categories,
        author, original_code) followed by the introduction and body text.
    """
    # Convert the "YYYY-MM-DD-HH-MM" stamp; a missing or malformed date is
    # reported and leaves the front-matter value empty.
    try:
        date_obj = datetime.strptime(get_value(data, 'date'), '%Y-%m-%d-%H-%M')
    except (TypeError, ValueError) as e:
        print(f"Error parsing date: {e}")
        hugo_date = ""
    else:
        hugo_date = date_obj.strftime('%Y-%m-%d %H:%M:00')

    # Iterating a dict yields its keys (0, 1, ...), which would turn every
    # category into "0"; take the values when a dict comes in.
    raw_categories = get_value(data, 'category')
    if isinstance(raw_categories, dict):
        raw_categories = list(raw_categories.values())
    elif isinstance(raw_categories, (str, bytes)):
        raw_categories = [raw_categories]
    elif not isinstance(raw_categories, (list, tuple)):
        raw_categories = []
    categories = [
        cat.decode('utf-8') if isinstance(cat, bytes) else str(cat)
        for cat in raw_categories
    ]

    # Combine introduction and body
    content = ""
    intro = get_value(data, 'introduction')
    if intro:
        content += intro + "\n\n"

    body = get_value(data, 'body')
    if body:
        content += body

    # Clean up any PHP-style HTML entities
    content = content.replace('&quot;', '"').replace('&amp;', '&')

    # Get the title and other metadata
    title = str(get_value(data, 'title') or "Untitled")
    code = get_value(data, 'code')
    user = str(get_value(data, 'user') or "Unknown")

    # json.dumps emits a double-quoted, escaped scalar, so quotes or
    # backslashes in a title cannot break the YAML front matter.
    hugo_content = f"""---
title: {json.dumps(title, ensure_ascii=False)}
date: {hugo_date}
draft: false
categories: {json.dumps(categories, ensure_ascii=False)}
author: {json.dumps(user, ensure_ascii=False)}
original_code: "{code}"
---

{content}"""
    return hugo_content

def convert_file(filepath, output_dir):
    """Convert one entry file into a Hugo Markdown file.

    The output is written to ``<output_dir>/<YYYY-MM-DD>-<slug>.md``, where
    the slug is derived from the entry title (at most 50 characters). An
    existing file with that name is overwritten. Any failure is reported on
    stdout and the file is skipped, so one bad entry does not stop a batch.

    Args:
        filepath: Path of the entry file to read (opened as UTF-8 text).
        output_dir: Directory that receives the ``.md`` file; it must
            already exist.
    """
    try:
        # Read the file
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        # Parse the PHP content
        data = parse_php_content(content)
        if not data:
            print(f"Failed to parse: {filepath}")
            return

        # Create Hugo content
        hugo_content = create_hugo_content(data)

        # Create filename from date and title
        date_str = get_value(data, 'date')
        date_obj = datetime.strptime(date_str, '%Y-%m-%d-%H-%M')

        # Slug: drop punctuation, collapse whitespace/hyphen runs, cap the
        # length, then trim stray hyphens (truncation can leave one at the
        # end). A title made only of punctuation would otherwise produce
        # an empty slug and a name like "2000-07-30-.md".
        title = str(get_value(data, 'title') or "untitled")
        slug = re.sub(r'[^\w\s-]', '', title.lower())
        slug = re.sub(r'[-\s]+', '-', slug)
        slug = slug[:50].strip('-') or "untitled"

        filename = f"{date_obj.strftime('%Y-%m-%d')}-{slug}.md"

        # Save the file
        output_path = os.path.join(output_dir, filename)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(hugo_content)

        print(f"Converted: {filename}")
    except Exception as e:
        # Batch boundary: report and move on to the next file.
        print(f"Error converting {filepath}: {e}")

def main():
    """Run the converter over the sample entries, writing to ``hugo_content``."""
    output_dir = "hugo_content"
    # Re-running is fine: exist_ok=True tolerates an existing directory.
    os.makedirs(output_dir, exist_ok=True)

    # Sample entry files, resolved relative to the current working directory.
    sample_names = ("00100.php", "00101.php", "00103.php", "00104.php", "00105.php")
    for sample_name in sample_names:
        convert_file(sample_name, output_dir)


if __name__ == "__main__":
    main()

To use this script:

  1. Install the required library:
pip install phpserialize
  2. Save the script and place it in the same directory as your PHP files
  3. Run it:
python convert.py

This version:

  • Has the same improved error handling as the directory version
  • Handles both string and byte keys/values
  • Creates clean filenames based on the post date and title
  • Preserves all metadata in the Hugo front matter
  • Is much simpler and focused on converting individual files

The output will be markdown files in the hugo_content directory with names like 2000-07-30-up-in-the-mornin-and.md.

Would you like me to add any specific features to this simpler version?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment