A hacky script to migrate my 2014 Wordpress content to Hugo.
# coding=utf-8 | |
'''
I wrote this hacky script to migrate my WordPress content from 2014
to Hugo posts. It requires:

* Python 2.7.
* A trivial Django 1.8 project with the 'the-real-django-wordpress'
  app/package installed (it's on PyPI). It's assumed that the project
  has been configured to allow this app to access your WordPress
  content.
* The 'toml' package, for writing TOML front-matter.

It should be noted that I used some of these
slightly-outdated-as-of-2017 technologies because my most recent
SQL dump of my WordPress site was from 2014. So I was specifically
looking for tools that would understand a WordPress database
schema from that era.
'''
import os | |
import toml | |
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "recover.settings") | |
import django | |
django.setup() | |
from wordpress.models import Post | |
# Root directory of the Hugo site (a relative path).
HUGO_DIR = 'hugo'

# Directory that receives one Markdown file per migrated post.
HUGO_POSTS_DIR = os.path.join(HUGO_DIR, 'content', 'post')

# Every category name seen while migrating; reported at the end of the run.
categories = set()
import codecs | |
def charref(s):
    '''
    Given a unicode character, return an XML character entity
    reference representing it.

    Pure-ASCII input is returned unchanged; every non-ASCII character
    is replaced by a decimal reference (e.g. u'&#231;'). The result is
    always a unicode string containing only ASCII characters.
    '''
    # Decode explicitly instead of going through the Python 2-only
    # `unicode()` builtin and its implicit default-codec conversion.
    # The encoded bytes are guaranteed to be ASCII, so this cannot fail.
    return s.encode('ascii', 'xmlcharrefreplace').decode('ascii')
# Something weird was going on with my Wordpress SQL dump, where
# some posts had non-ASCII characters that were encoded as UTF-8
# but mis-interpreted as a different character set. It was confusing
# to figure out how to fix them purely through character set
# conversions; once I collected some data and found out that there
# were only about a dozen unique garbled sequences, though, it was
# easy to just create a conversion map.
#
# Every replacement value MUST be pure ASCII: the values are handed back
# by the 'myfixerupper' codec error handler while encoding to ASCII, and
# a non-ASCII replacement makes the encoder raise UnicodeEncodeError.
# Characters with no ASCII equivalent are therefore written as HTML
# entities / character references.
conversion_map = {
    u"\u00e2\u20ac\u2122": u"'",
    u"\u00e2\u20ac\u02dc": u"'",
    u"\u00e2\u20ac\u2122\u00e2\u20ac\u009d": u" ",
    u"\u00e2\u20ac\u0153": u'"',
    # Both garbled dash sequences are rendered as an em dash.
    u"\u00e2\u20ac\u201c": u"&mdash;",
    u"\u00e2\u20ac\u201d": u"&mdash;",
    u"\u00e2\u20ac\u009d": u'"',
    u"\u00e2\u20ac\u0161\u00c3\u201e\u00c3\u00ac": u" ",
    u"\u00c3\u00a7": charref(u"\u00e7"),
    u"\u00c4\u2021": u"c",
    u"\u00c3\u00af": charref(u"\u00ef"),
}
# A mapping from unrecognized character sequences to lists of
# (slug, occurrence w/ surrounding text) tuples logging information about
# the occurrences of a particular sequence.
bad_chars = {}

# Global variable used to keep track of what the current slug being
# processed is. The error handler below reads it when logging.
curr_slug = ''


def myfixerupper(error):
    '''
    Codec error handler that attempts to fix up non-ASCII character
    sequences through the conversion map; if no entry is found in the
    map, an entry is logged in `bad_chars` and the unrecognized
    sequence is replaced with 'LOL'.

    `error` is the UnicodeEncodeError raised by the encoder;
    `error.object[error.start:error.end]` is the offending sequence.

    Returns a (replacement, resume_position) tuple, as required of
    handlers registered through `codecs.register_error`.
    '''
    invalid_chars = error.object[error.start:error.end]
    if invalid_chars in conversion_map:
        return (conversion_map[invalid_chars], error.end)

    # Keep up to 10 characters of context on either side. The start is
    # clamped at 0: a negative start would make the slice count from the
    # END of the string and log empty or unrelated context whenever the
    # bad sequence sits within the first 10 characters.
    context_start = max(0, error.start - 10)
    context_end = error.end + 10
    bad_chars.setdefault(invalid_chars, []).append(
        (curr_slug, error.object[context_start:context_end])
    )
    return (u'LOL', error.end)


codecs.register_error('myfixerupper', myfixerupper)
# The fields on `post` were found by perusing:
# https://github.com/istrategylabs/django-wordpress/blob/master/wordpress/models.py
#
# Each published post becomes `<slug>.md` in HUGO_POSTS_DIR: TOML
# front-matter delimited by '+++' lines, followed by the post body.
for post in Post.objects.published():
    print("Processing {}...".format(post.slug))
    filename = os.path.join(HUGO_POSTS_DIR, post.slug + '.md')
    with open(filename, 'w') as outfile:
        def write(content):
            # The file is opened in byte mode (Python 2), so unicode
            # text is encoded as UTF-8 before being written.
            if isinstance(content, unicode):
                content = content.encode('utf-8')
            outfile.write(content)

        pubdate = post.post_date.date()
        metadata = {
            'date': pubdate.isoformat(),
            'title': post.title,
            'categories': [
                # Un-escape ampersands in category names (presumably
                # stored HTML-escaped in the dump -- confirm against
                # your data). Replacing '&' with '&' was a no-op.
                cat.name.replace('&amp;', '&') for cat in post.categories()
            ],
            'url': '{}/{}'.format(
                pubdate.strftime('%Y/%m'),
                post.slug,
            )
        }
        categories.update(metadata['categories'])

        write(u'+++\n')
        write(toml.dumps(metadata))
        write(u'+++\n\n')

        # Must be set before the encode() call below so that the
        # 'myfixerupper' error handler logs bad sequences under the
        # right slug.
        curr_slug = post.slug

        # I made a simple Hugo shortcode that just inserts the
        # location of the directory where I put all my old wordpress
        # uploads, so that the location of those uploads stays decoupled
        # from the blog content.
        #
        # The following replacement just swaps out absolute URLs to
        # my uploads with that shortcode.
        content = post.content.replace(
            u'http://www.toolness.com/wp/wp-content/uploads',
            u'{{< wordpress-upload-base-url >}}',
        ).encode('ascii', 'myfixerupper')

        write(content)
import json

# Summarize the run on stdout.
print("Done.")
print("Final categories: {}".format(categories))

# Persist every unrecognized character sequence, along with where it was
# seen, so that new entries can be added to the conversion map.
bad_char_count = len(bad_chars)
print("writing {} bad characters to 'bad_chars.json'.".format(bad_char_count))
with open('bad_chars.json', 'w') as report:
    json.dump(bad_chars, report, indent=4, separators=(',', ': '))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment