human (Oct 26, 2024, 07:41 AM)
Take a look at these 5 files. They were written by an old CMS. I have thousands of these files. How would we convert them to Hugo files we can use in a blog? Let's make a start with these:
00100.php
<?php /* pivot */ die(); ?>a:12:{s:4:"code";i:100;s:4:"date";s:16:"2000-07-30-10-09";s:4:"user";s:6:"Punkey";s:5:"title";s:21:"Up in the mornin' and";s:6:"status";s:7:"publish";s:14:"allow_comments";s:1:"1";s:10:"convert_lb";i:1;s:12:"publish_date";s:16:"2000-07-30-10-09";s:9:"edit_date";s:16:"2000-07-30-10-09";s:8:"category";a:1:{i:0;s:11:"Old Blogger";}s:12:"introduction";s:170:"<a href="http://www.toshistation.com/funk/funk.htm">Up in the mornin' and out to school, the teacher is teachin' the Golden Rule</a>
woohoo....the periodic table of FUNK";s:4:"body";s:0:"";}
00101.php
<?php /* pivot */ die(); ?>a:12:{s:4:"code";i:101;s:4:"date";s:16:"2000-07-30-11-23";s:4:"user";s:6:"Punkey";s:5:"title";s:38:"getthehelloutside.com HA! Wie heeft er";s:6:"status";s:7:"publish";s:14:"allow_comments";s:1:"1";s:10:"convert_lb";i:1;s:12:"publish_date";s:16:"2000-07-30-11-23";s:9:"edit_date";s:16:"2000-07-30-11-23";s:8:"category";a:1:{i:0;s:11:"Old Blogger";}s:12:"introduction";s:198:"<a href="http://www.getthehelloutside.com/">getthehelloutside.com </a>
HA! Wie heeft er nu nog een VVV nodig als je dit ook al online hebt?
Dus nu die PC uitzetten en naar buiten gaan...doe maar...";s:4:"body";s:0:"";}
00103.php
<?php /* pivot */ die(); ?>a:12:{s:4:"code";i:103;s:4:"date";s:16:"2000-08-01-09-21";s:4:"user";s:6:"Punkey";s:5:"title";s:38:"Super Greg hehehe...the BBoy-Mahir van";s:6:"status";s:7:"publish";s:14:"allow_comments";s:1:"1";s:10:"convert_lb";i:1;s:12:"publish_date";s:16:"2000-08-01-09-21";s:9:"edit_date";s:16:"2000-08-01-09-21";s:8:"category";a:1:{i:0;s:11:"Old Blogger";}s:12:"introduction";s:121:"<a href="http://www.supergreg.com/">Super Greg</a>
hehehe...the BBoy-Mahir van de nieuwe eeuw. SuperGreg does his thing!";s:4:"body";s:0:"";}
00104.php
<?php /* pivot */ die(); ?>a:12:{s:4:"code";i:104;s:4:"date";s:16:"2000-08-01-17-12";s:4:"user";s:6:"Punkey";s:5:"title";s:25:"Tja...dat vraag ik me nou";s:6:"status";s:7:"publish";s:14:"allow_comments";s:1:"1";s:10:"convert_lb";i:1;s:12:"publish_date";s:16:"2000-08-01-17-12";s:9:"edit_date";s:16:"2000-08-01-17-12";s:8:"category";a:1:{i:0;s:11:"Old Blogger";}s:12:"introduction";s:195:"Tja...dat vraag ik me nou al een hele poos af....
<a href="http://www.watdoenwemetwap.nl/">WatdoenwemetWAP.nl</a>
Als iemand een idee heeft...<a href="mailto:punkey@casema.net">mail</a> me maar";s:4:"body";s:0:"";}
00105.php
<?php /* pivot */ die(); ?>a:13:{s:4:"code";i:105;s:4:"date";s:16:"2000-08-02-11-37";s:4:"user";s:6:"Punkey";s:5:"title";s:24:"How to Make Napster Work";s:6:"status";s:7:"publish";s:14:"allow_comments";s:1:"1";s:10:"convert_lb";i:1;s:12:"publish_date";s:16:"2000-08-02-11-37";s:9:"edit_date";s:16:"2000-08-02-11-37";s:8:"category";a:1:{i:0;s:11:"Old Blogger";}s:12:"introduction";s:105:"<a href="http://c.moreover.com/click/here.pl?x8730457">How to Make Napster Work for the Record Groups</a>";s:4:"body";s:0:"";s:8:"comments";a:1:{i:0;a:6:{s:4:"name";s:10:" sex-fiend";s:5:"email";s:26:" gangastrotagati@yahoo.com";s:2:"ip";s:14:" 209.208.9.254";s:4:"date";s:16:"2003-11-07-08-29";s:3:"url";s:26:" http://www.sex-lover.org/";s:7:"comment";s:9:"sex-fiend";}}}
assistant (Oct 26, 2024, 07:41 AM)
I'll help you convert these old CMS files to Hugo-compatible markdown files. Let's create a Python script to handle this conversion.
import os
import re
from datetime import datetime
import json
import phpserialize
def parse_php_content(content):
    """Unserialize the PHP payload that follows the ``?>`` guard in an entry file.

    Args:
        content: Full text of the entry file, including the leading
            ``<?php ... ?>`` guard.

    Returns:
        The value produced by ``phpserialize.loads`` for the payload, or
        ``None`` when there is no payload after ``?>`` or it cannot be
        unserialized.
    """
    # Everything after the first "?>" is the serialized entry. DOTALL is
    # required because string values in the payload can contain line breaks.
    match = re.search(r'\?>(.+)$', content, re.DOTALL)
    if not match:
        return None
    serialized_data = match.group(1)
    try:
        # decode_strings=True makes the loader hand back text instead of bytes.
        return phpserialize.loads(serialized_data.encode('utf-8'), decode_strings=True)
    except Exception as exc:
        # Report why the payload was rejected instead of failing silently;
        # unlike a bare except this lets Ctrl-C stop a long batch run.
        print(f"Error parsing PHP data: {exc}")
        return None
def _entry_text(data, key, default=None):
    """Return ``data[key]`` as text, accepting str or bytes keys and values."""
    for candidate in (key, key.encode('utf-8')):
        if candidate in data:
            value = data[candidate]
            return value.decode('utf-8') if isinstance(value, bytes) else value
    return default


def create_hugo_content(data):
    """Render one parsed entry as a Hugo markdown document.

    Args:
        data: Mapping of entry fields (``date``, ``title``, ``user``,
            ``category``, ``introduction``, ``body``). Keys and values may
            be ``str`` or ``bytes``.

    Returns:
        YAML front matter followed by the introduction and body text.
    """
    # Entry dates look like "2000-07-30-10-09"; an unparsable or missing
    # date yields an empty front-matter value rather than aborting.
    try:
        date_obj = datetime.strptime(_entry_text(data, 'date'), '%Y-%m-%d-%H-%M')
        hugo_date = date_obj.strftime('%Y-%m-%d %H:%M:00')
    except (TypeError, ValueError):
        hugo_date = ""

    # A PHP array unserializes to a dict keyed by index, so the category
    # names are the dict's values, not the items yielded by iterating it.
    raw_categories = _entry_text(data, 'category') or []
    if isinstance(raw_categories, dict):
        raw_categories = list(raw_categories.values())
    elif not isinstance(raw_categories, (list, tuple)):
        raw_categories = [raw_categories]
    categories = [
        cat.decode('utf-8') if isinstance(cat, bytes) else str(cat)
        for cat in raw_categories
    ]

    # Introduction first, then body, separated by a blank line.
    content = ""
    intro = _entry_text(data, 'introduction')
    if intro is not None:
        content += intro + "\n\n"
    body = _entry_text(data, 'body')
    if body is not None:
        content += body

    title = _entry_text(data, 'title', "Untitled")
    user = _entry_text(data, 'user', "Unknown")

    # json.dumps produces a double-quoted scalar with quotes and backslashes
    # escaped, so titles containing '"' cannot break the front matter.
    front_matter = [
        "---",
        f"title: {json.dumps(str(title), ensure_ascii=False)}",
        f"date: {hugo_date}",
        "draft: false",
        f"categories: {json.dumps(categories)}",
        f"author: {json.dumps(str(user), ensure_ascii=False)}",
        "---",
    ]
    return "\n".join(front_matter) + "\n" + content + "\n"
def convert_file(filepath, output_dir):
    """Convert one entry file to a markdown file inside ``output_dir``.

    Args:
        filepath: Path of the ``.php`` entry file to read.
        output_dir: Existing directory that receives the ``.md`` file.

    Prints a message and returns without writing when the entry cannot be
    parsed.
    """
    # newline='' stops Python from rewriting CRLF as LF. The s:N length
    # prefixes in the serialized payload count each CRLF as two bytes, so a
    # translated payload no longer matches its declared lengths.
    with open(filepath, 'r', encoding='utf-8', newline='') as f:
        content = f.read()

    data = parse_php_content(content)
    if not data:
        print(f"Failed to parse: {filepath}")
        return

    def text_field(key):
        """Look up a field under its str or bytes key and return it as text."""
        for candidate in (key, key.encode('utf-8')):
            if candidate in data:
                value = data[candidate]
                return value.decode('utf-8') if isinstance(value, bytes) else value
        return None

    # The payload was read untranslated, so normalise line endings here.
    hugo_content = create_hugo_content(data).replace('\r\n', '\n')

    # Filename is "<date>-<slug>.md"; fall back to a fixed slug when the
    # title is missing or contains no word characters.
    date_obj = datetime.strptime(text_field('date'), '%Y-%m-%d-%H-%M')
    slug = re.sub(r'[^\w\s-]', '', (text_field('title') or '').lower())
    slug = re.sub(r'[-\s]+', '-', slug) or "untitled"
    filename = f"{date_obj.strftime('%Y-%m-%d')}-{slug}.md"

    output_path = os.path.join(output_dir, filename)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(hugo_content)
    print(f"Converted: {filename}")
def main():
    """Convert the five sample entry files into the ``hugo_content`` folder."""
    target_dir = "hugo_content"
    # exist_ok keeps repeat runs from failing when the folder already exists.
    os.makedirs(target_dir, exist_ok=True)

    # Sample entries, looked up relative to the current working directory.
    sample_entries = ["00100.php", "00101.php", "00103.php", "00104.php", "00105.php"]
    for entry_path in sample_entries:
        convert_file(entry_path, target_dir)


if __name__ == "__main__":
    main()
This script will convert your old CMS files to Hugo-compatible markdown files. Here's what it does:
- Parses the PHP serialized data using the `phpserialize` library
- Converts the dates to Hugo's preferred format
- Creates proper front matter with:
- Title
- Date
- Categories
- Author
- Draft status
- Combines the introduction and body content
- Creates clean filenames using the date and slugified title
- Saves the converted files in a `hugo_content` directory
To use this script, you'll need to:
- Install the required library:
pip install phpserialize
- Put your PHP files in the same directory as the script
- Run the script
The output will be markdown files that Hugo can use, with names like 2000-07-30-up-in-the-mornin-and.md
Would you like me to:
- Add more features (like handling comments)?
- Modify how the content is formatted?
- Add support for processing entire directories recursively?
human (Oct 26, 2024, 07:45 AM)
I get an error Traceback (most recent call last): File "/Users/fmeeuwsen/Documents/Hobbies/Gitprojecten/p2h/convert.py", line 95, in main() File "/Users/fmeeuwsen/Documents/Hobbies/Gitprojecten/p2h/convert.py", line 92, in main convert_file(file, output_dir) File "/Users/fmeeuwsen/Documents/Hobbies/Gitprojecten/p2h/convert.py", line 68, in convert_file hugo_content = create_hugo_content(data) ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/Users/fmeeuwsen/Documents/Hobbies/Gitprojecten/p2h/convert.py", line 45, in create_hugo_content title: "{data[b'title'].decode('utf-8')}" ~~~~^^^^^^^^^^ KeyError: b'title'
assistant (Oct 26, 2024, 07:46 AM)
Here's a new version that only handles individual files with the improved error handling.
import os
import re
from datetime import datetime
import json
import phpserialize
def get_value(data, key):
    """Fetch a field from a parsed entry regardless of str/bytes keying.

    Args:
        data: Mapping of entry fields whose keys may be ``str`` or ``bytes``.
        key: Field name, as ``str`` or ``bytes``.

    Returns:
        The stored value, with ``bytes`` values decoded as UTF-8, or
        ``None`` when neither form of the key is present.
    """
    # Try the key as given first, then its counterpart in the other type.
    if isinstance(key, str):
        candidates = (key, key.encode('utf-8'))
    elif isinstance(key, bytes):
        candidates = (key, key.decode('utf-8', errors='replace'))
    else:
        candidates = (key,)

    for candidate in candidates:
        if candidate in data:
            value = data[candidate]
            # Decode on every path so callers always receive text.
            return value.decode('utf-8') if isinstance(value, bytes) else value
    return None
def parse_php_content(content):
    """Unserialize the PHP payload found after the ``?>`` guard.

    Returns whatever ``phpserialize.loads`` produces for the payload, or
    ``None`` when no payload follows ``?>`` or loading raises; in the
    latter case the error is printed.
    """
    # DOTALL lets the captured payload run across line breaks.
    found = re.search(r'\?>(.+)$', content, re.DOTALL)
    if found is None:
        return None

    payload = found.group(1)
    try:
        return phpserialize.loads(payload.encode(), decode_strings=True)
    except Exception as e:
        print(f"Error parsing PHP data: {e}")
    return None
def create_hugo_content(data):
    """Render one parsed entry as a Hugo markdown document.

    Args:
        data: Mapping of entry fields as returned by ``parse_php_content``.

    Returns:
        YAML front matter (title, date, draft, categories, author,
        original_code) followed by the introduction and body text.
    """
    # Entry dates look like "2000-07-30-10-09"; a missing or malformed date
    # is reported and left empty so the rest of the entry still converts.
    try:
        date_str = get_value(data, 'date')
        date_obj = datetime.strptime(date_str, '%Y-%m-%d-%H-%M')
        hugo_date = date_obj.strftime('%Y-%m-%d %H:%M:00')
    except (TypeError, ValueError) as e:
        print(f"Error parsing date: {e}")
        hugo_date = ""

    # A PHP array unserializes to a dict keyed by index, so the category
    # names are the dict's values; iterating the dict itself yields 0, 1, ...
    raw_categories = get_value(data, 'category')
    if isinstance(raw_categories, dict):
        raw_categories = list(raw_categories.values())
    categories = []
    if isinstance(raw_categories, (list, tuple)):
        categories = [
            cat.decode('utf-8') if isinstance(cat, bytes) else str(cat)
            for cat in raw_categories
        ]

    # Introduction first, then body, separated by a blank line.
    content = ""
    intro = get_value(data, 'introduction')
    if intro:
        content += intro + "\n\n"
    body = get_value(data, 'body')
    if body:
        content += body

    # Decode the two HTML entities named here. &quot; goes first so that an
    # escaped "&amp;quot;" ends up as the literal text "&quot;".
    content = content.replace('&quot;', '"').replace('&amp;', '&')

    title = get_value(data, 'title') or "Untitled"
    code = get_value(data, 'code')
    user = get_value(data, 'user') or "Unknown"
    code_text = "" if code is None else str(code)

    # json.dumps produces a double-quoted scalar with quotes and backslashes
    # escaped, so a title containing '"' cannot break the front matter.
    front_matter = [
        "---",
        f"title: {json.dumps(str(title), ensure_ascii=False)}",
        f"date: {hugo_date}",
        "draft: false",
        f"categories: {json.dumps(categories)}",
        f"author: {json.dumps(str(user), ensure_ascii=False)}",
        f"original_code: {json.dumps(code_text)}",
        "---",
    ]
    return "\n".join(front_matter) + "\n" + content
def convert_file(filepath, output_dir):
    """Convert one entry file to a markdown file inside ``output_dir``.

    Args:
        filepath: Path of the ``.php`` entry file to read.
        output_dir: Existing directory that receives the ``.md`` file.

    Any failure is printed with the offending path and the function returns
    normally, so one bad entry does not stop a batch.
    """
    try:
        # newline='' stops Python from rewriting CRLF as LF. The s:N length
        # prefixes in the serialized payload count each CRLF as two bytes,
        # so a translated payload no longer matches its declared lengths.
        with open(filepath, 'r', encoding='utf-8', newline='') as f:
            content = f.read()

        data = parse_php_content(content)
        if not data:
            print(f"Failed to parse: {filepath}")
            return

        # The payload was read untranslated, so normalise line endings here.
        hugo_content = create_hugo_content(data).replace('\r\n', '\n')

        # Filename is "<date>-<slug>.md".
        date_str = get_value(data, 'date')
        date_obj = datetime.strptime(date_str, '%Y-%m-%d-%H-%M')

        title = get_value(data, 'title') or "untitled"
        slug = re.sub(r'[^\w\s-]', '', title.lower())
        slug = re.sub(r'[-\s]+', '-', slug)
        # Cap the slug length, and fall back when the title had no word
        # characters at all (otherwise the name would be "<date>-.md").
        slug = slug[:50] or "untitled"
        filename = f"{date_obj.strftime('%Y-%m-%d')}-{slug}.md"

        output_path = os.path.join(output_dir, filename)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(hugo_content)
        print(f"Converted: {filename}")
    except Exception as e:
        # Per-file boundary: report and move on to the next entry.
        print(f"Error converting {filepath}: {e}")
def main():
    """Run the converter over the sample entries, writing to ``hugo_content``."""
    destination = "hugo_content"
    # Safe to call on every run: an existing folder is not an error.
    os.makedirs(destination, exist_ok=True)

    # The sample files from the conversation, resolved against the current
    # working directory.
    samples = ["00100.php", "00101.php", "00103.php", "00104.php", "00105.php"]
    for php_path in samples:
        convert_file(php_path, destination)


if __name__ == "__main__":
    main()
To use this script:
- Install the required library:
pip install phpserialize
- Save the script and place it in the same directory as your PHP files
- Run it:
python convert.py
This version:
- Has the same improved error handling as the directory version
- Handles both string and byte keys/values
- Creates clean filenames based on the post date and title
- Preserves all metadata in the Hugo front matter
- Is much simpler and focused on converting individual files
The output will be markdown files in the `hugo_content` directory with names like `2000-07-30-up-in-the-mornin-and.md`.
Would you like me to add any specific features to this simpler version?