# A hacky script to migrate my 2014 Wordpress content to Hugo.
# coding=utf-8
"""
I wrote this hacky script to migrate my WordPress content from 2014
to Hugo posts. It requires:

* Python 2.7.
* A trivial Django 1.8 project with the 'the-real-django-wordpress'
  app/package installed (it's on PyPI). It's assumed that the project
  has been configured to allow this app to access your WordPress
  database.
* The 'toml' package, for writing TOML front-matter.

It should be noted that I used some of these
slightly-outdated-as-of-2017 technologies because my most recent
SQL dump of my WordPress site was from 2014. So I was specifically
looking for tools that would understand a WordPress database
schema from that era.
"""
import os
import toml
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "recover.settings")
import django
from wordpress.models import Post
# Root of the Hugo site, given relative to the current working directory.
HUGO_DIR = 'hugo'

# Directory in which the per-post Markdown files are created.
HUGO_POSTS_DIR = os.path.join(HUGO_DIR, 'content', 'post')

# Accumulator that is reported as "Final categories" once the run is over.
categories = set()
import codecs
def charref(s):
    """
    Given a unicode character, return an XML character entity
    reference representing it.

    ASCII characters are returned unchanged; anything else becomes a
    numeric reference such as ``&#231;``. The result is always a unicode
    string containing only ASCII characters.
    """
    # Decode explicitly instead of going through the `unicode()` builtin:
    # the result is identical on Python 2.7, but it neither depends on the
    # interpreter's default encoding nor on a Python 2-only name.
    return s.encode('ascii', 'xmlcharrefreplace').decode('ascii')
# Something weird was going on with my Wordpress SQL dump, where
# some posts had non-ASCII characters that were encoded as UTF-8
# but mis-interpreted as a different character set. It was confusing
# to figure out how to fix them purely through character set
# conversions; once I collected some data and found out that there
# were only about a dozen unique garbled sequences, though, it was
# easy to just create a conversion map.
# Maps each garbled (mis-decoded UTF-8) sequence to its replacement text.
#
# These replacements are handed back by the 'myfixerupper' error handler in
# the middle of an ASCII encode, and the codec re-encodes whatever the handler
# returns. Every value must therefore be pure ASCII; characters outside ASCII
# are expressed as XML character references via charref().
conversion_map = {
    u"\u00e2\u20ac\u2122": u"'",
    u"\u00e2\u20ac\u02dc": u"'",
    u"\u00e2\u20ac\u2122\u00e2\u20ac\u009d": u" ",
    u"\u00e2\u20ac\u0153": u'"',
    u"\u00e2\u20ac\u201c": charref(u"—"),
    u"\u00e2\u20ac\u201d": charref(u"—"),
    u"\u00e2\u20ac\u009d": u'"',
    u"\u00e2\u20ac\u0161\u00c3\u201e\u00c3\u00ac": u" ",
    u"\u00c3\u00a7": charref(u"ç"),
    u"\u00c4\u2021": u"c",
    u"\u00c3\u00af": charref(u"ï"),
}
# Unrecognized character sequences seen so far. Each key is one such
# sequence; each value is a list meant to hold
# (slug, occurrence w/ surrounding text) tuples logging information about
# the occurrences of that particular sequence.
bad_chars = dict()

# Slug of the post currently being processed. It lives in a global so that
# code outside the main loop can tell which post is being worked on.
curr_slug = str()
def myfixerupper(error):
    """
    Python error codec that attempts to fix-up non-ASCII character
    sequences through the conversion map.

    `error` is the UnicodeEncodeError raised by the encoder. The offending
    run of characters is `error.object[error.start:error.end]`.

    If the run has an entry in `conversion_map`, that entry is used as the
    replacement. Otherwise a `(curr_slug, surrounding text)` tuple is
    appended to the list kept for that run in `bad_chars`, and the run is
    replaced with 'LOL'.

    Returns a `(replacement, resume_position)` tuple, as required of codec
    error handlers.
    """
    invalid_chars = error.object[error.start:error.end]
    if invalid_chars in conversion_map:
        return (conversion_map[invalid_chars], error.end)

    # Clamp the left edge of the context window: a negative slice start would
    # be counted from the end of the string and produce the wrong (usually
    # empty) excerpt for problems near the beginning of a post.
    context_start = max(0, error.start - 10)
    context = error.object[context_start:error.end + 10]

    # Actually record the occurrence; merely creating the empty list would
    # leave the log without any information about where the run appeared.
    bad_chars.setdefault(invalid_chars, []).append((curr_slug, context))
    return (u'LOL', error.end)


codecs.register_error('myfixerupper', myfixerupper)
# The fields on `post` were found by perusing the model definitions in
# the `wordpress.models` module.
# Main export loop: visits every published post and opens a Markdown file
# named after the post's slug inside HUGO_POSTS_DIR.
for post in Post.objects.published():
print("Processing {}...".format(post.slug))
filename = os.path.join(HUGO_POSTS_DIR, post.slug + '.md')
with open(filename, 'w') as outfile:
# Output helper: unicode input is converted to UTF-8 bytes first.
# NOTE(review): presumably the result is then written to `outfile`; that
# step is not visible in this revision -- confirm.
def write(content):
if isinstance(content, unicode):
content = content.encode('utf-8')
# NOTE(review): this assignment has no right-hand side in this revision.
# Whatever is assigned must provide `.isoformat()`, which is used for the
# 'date' entry below -- confirm which attribute of `post` supplies it.
pubdate =
# Per-post metadata (date, title, categories, url).
# NOTE(review): presumably serialised as TOML front matter with the `toml`
# package; the literal below is incomplete in this revision -- confirm.
metadata = {
'date': pubdate.isoformat(),
'title': post.title,
'categories': ['&', '&') for cat in post.categories()
'url': '{}/{}'.format(
# Publish the slug through the module-level `curr_slug` so that the
# 'myfixerupper' error handler can attribute any unrecognized character
# sequence it meets while this post's content is being encoded.
curr_slug = post.slug
# I made a simple Hugo shortcode that just inserts the
# location of the directory where I put all my old wordpress
# uploads, so that the location of those uploads stays decoupled
# from the blog content.
# The following replacement just swaps out absolute URLs to
# my uploads with that shortcode.
#
# The trailing ASCII encode uses the 'myfixerupper' error handler, so every
# run of non-ASCII characters in the content is passed to that handler.
# NOTE(review): the first argument to replace() (the text being swapped out)
# is not present in this revision -- confirm.
content = post.content.replace(
u'{{< wordpress-upload-base-url >}}',
).encode('ascii', 'myfixerupper')
# End-of-run report: show the collected categories, then save the log of
# unrecognized character sequences as pretty-printed JSON for inspection.
print("Final categories: {}".format(categories))

import json

bad_char_count = len(bad_chars)
print("writing {} bad characters to 'bad_chars.json'.".format(bad_char_count))

with open('bad_chars.json', 'w') as report_file:
    json.dump(bad_chars, report_file, indent=4, separators=(',', ': '))
