Skip to content

Instantly share code, notes, and snippets.

@toolness
Last active May 14, 2017 12:10
Show Gist options
  • Save toolness/f97989f7c4738bf5869f3c02aa2aa6ac to your computer and use it in GitHub Desktop.
Save toolness/f97989f7c4738bf5869f3c02aa2aa6ac to your computer and use it in GitHub Desktop.
A hacky script to migrate my 2014 Wordpress content to Hugo.
# coding=utf-8
'''
I wrote this hacky script to migrate my Wordpress content from 2014
to Hugo posts. It requires:
* Python 2.7.
* A trivial Django 1.8 project with the 'the-real-django-wordpress'
app/package installed (it's on PyPI). It's assumed that the project
has been configured to allow this app to access your Wordpress
content.
* The 'toml' package, for writing toml front-matter.
It should be noted that I used some of these
slightly-outdated-as-of-2017 technologies because my most recent
SQL dump of my WordPress site was from 2014. So I was specifically
looking for tools that would understand a WordPress database
schema from that era.
'''
import os
import toml
# Point Django at the project's settings module unless the environment
# already names one (setdefault leaves an existing value untouched).
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "recover.settings")
import django
django.setup()
# NOTE(review): the model import deliberately follows django.setup();
# presumably the app registry must be initialised first -- confirm against
# Django's standalone-usage documentation before reordering these lines.
from wordpress.models import Post
# Root of the Hugo site, given as a path relative to the working directory.
HUGO_DIR = os.path.join('hugo')
# Directory that receives one '<slug>.md' file per published post.
HUGO_POSTS_DIR = os.path.join(HUGO_DIR, 'content', 'post')
# Accumulates every category name seen across all processed posts.
categories = set()
# Needed further down to register the custom 'myfixerupper' error handler.
import codecs
def charref(s):
    '''
    Given a unicode string, return a copy in which every non-ASCII
    character is replaced by an XML numeric character reference
    (e.g. u"\\u00e7" becomes u"&#231;").

    ASCII characters pass through unchanged, so the result is a unicode
    string made up solely of ASCII characters.
    '''
    # Decode explicitly rather than calling the Python-2-only `unicode()`
    # builtin: the result is identical, and the function keeps working on
    # interpreters where `unicode` does not exist.
    return s.encode('ascii', 'xmlcharrefreplace').decode('ascii')
# Something weird was going on with my Wordpress SQL dump, where
# some posts had non-ASCII characters that were encoded as UTF-8
# but mis-interpreted as a different character set. It was confusing
# to figure out how to fix them purely through character set
# conversions; once I collected some data and found out that there
# were only about a dozen unique garbled sequences, though, it was
# easy to just create a conversion map.
#
# Keys are the garbled sequences; values are what they get replaced with.
# The values are returned by the `myfixerupper` error handler while text
# is being encoded to ASCII, so every replacement must itself be pure
# ASCII. Anything non-ASCII is therefore spelled as an HTML entity or a
# numeric character reference (via `charref`); a raw non-ASCII replacement
# would make the encode raise UnicodeEncodeError.
conversion_map = {
    u"\u00e2\u20ac\u2122": u"'",
    u"\u00e2\u20ac\u02dc": u"'",
    u"\u00e2\u20ac\u2122\u00e2\u20ac\u009d": u" ",
    u"\u00e2\u20ac\u0153": u'"',
    # Garbled dashes: emitted as an ASCII entity rather than a literal
    # em dash so the ASCII encoder accepts the replacement.
    u"\u00e2\u20ac\u201c": u"&mdash;",
    u"\u00e2\u20ac\u201d": u"&mdash;",
    u"\u00e2\u20ac\u009d": u'"',
    u"\u00e2\u20ac\u0161\u00c3\u201e\u00c3\u00ac": u" ",
    u"\u00c3\u00a7": charref(u"ç"),
    u"\u00c4\u2021": u"c",
    u"\u00c3\u00af": charref(u"ï"),
}
# A mapping from unrecognized character sequences to lists of
# (slug, occurrence w/ surrounding text) tuples logging information about
# the occurrences of a particular sequence.
bad_chars = {}

# Global variable used to keep track of what the current slug being
# processed is. It is read by `myfixerupper` when logging a bad sequence.
curr_slug = ''


def myfixerupper(error):
    '''
    Python error codec that attempts to fix-up non-ASCII character
    sequences through the conversion map; if no entry is found in the
    map, an entry is logged in `bad_chars` and the unrecognized
    sequence is replaced with 'LOL'.

    `error` is the exception object passed in by the codec machinery;
    its `object`, `start` and `end` attributes identify the offending
    slice of the input. Returns a (replacement, resume_position) tuple,
    always resuming right after the offending slice.
    '''
    invalid_chars = error.object[error.start:error.end]
    if invalid_chars in conversion_map:
        return (conversion_map[invalid_chars], error.end)

    # Clamp the left edge at 0: a negative slice start would count from
    # the END of the string, so a bad sequence within the first 10
    # characters would be logged with the wrong (usually empty) context.
    context_start = max(0, error.start - 10)
    bad_chars.setdefault(invalid_chars, []).append(
        (curr_slug, error.object[context_start:error.end + 10])
    )
    return (u'LOL', error.end)


codecs.register_error('myfixerupper', myfixerupper)
# Write one Hugo Markdown file (TOML front matter + body) per published
# WordPress post, collecting category names along the way.
#
# The fields on `post` were found by perusing:
# https://github.com/istrategylabs/django-wordpress/blob/master/wordpress/models.py
for post in Post.objects.published():
    print("Processing {}...".format(post.slug))
    filename = os.path.join(HUGO_POSTS_DIR, post.slug + '.md')
    with open(filename, 'w') as outfile:
        def write(content):
            # The file is written as bytes: unicode text is encoded as
            # UTF-8, byte strings are passed through untouched.
            if isinstance(content, unicode):
                content = content.encode('utf-8')
            outfile.write(content)

        pubdate = post.post_date.date()
        metadata = {
            'date': pubdate.isoformat(),
            'title': post.title,
            'categories': [
                # Un-escape ampersands in category names. The previous
                # replace('&', '&') was a no-op.
                # NOTE(review): assumes names are stored HTML-escaped
                # ('&amp;') in the WordPress database -- confirm.
                cat.name.replace('&amp;', '&') for cat in post.categories()
            ],
            # Path of the form YYYY/MM/<slug>.
            'url': '{}/{}'.format(
                pubdate.strftime('%Y/%m'),
                post.slug,
            )
        }
        categories.update(metadata['categories'])

        # TOML front matter is delimited by '+++' lines.
        write(u'+++\n')
        write(toml.dumps(metadata))
        write(u'+++\n\n')

        # Must be set before encoding below: the 'myfixerupper' error
        # handler reads this global when it logs an unknown sequence.
        curr_slug = post.slug

        # I made a simple Hugo shortcode that just inserts the
        # location of the directory where I put all my old wordpress
        # uploads, so that the location of those uploads stays decoupled
        # from the blog content.
        #
        # The following replacement just swaps out absolute URLs to
        # my uploads with that shortcode.
        content = post.content.replace(
            u'http://www.toolness.com/wp/wp-content/uploads',
            u'{{< wordpress-upload-base-url >}}',
        ).encode('ascii', 'myfixerupper')
        write(content)
# Wrap-up: report the categories that were collected and dump every
# unrecognized character sequence (with the slug and surrounding text of
# each occurrence) to a JSON file for later inspection.
print("Done.")
print("Final categories: {}".format(categories))
import json
bad_char_count = len(bad_chars)
print("writing {} bad characters to 'bad_chars.json'.".format(bad_char_count))
# Serialize first, then write the finished document in a single call.
report = json.dumps(bad_chars, indent=4, separators=(',', ': '))
with open('bad_chars.json', 'w') as report_file:
    report_file.write(report)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment