Last active
May 14, 2017 12:10
-
-
Save toolness/f97989f7c4738bf5869f3c02aa2aa6ac to your computer and use it in GitHub Desktop.
A hacky script to migrate my 2014 Wordpress content to Hugo.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8 | |
''' | |
I wrote this hacky script to migrate my Wordpress content from 2014 | |
to Hugo posts. It requires: | |
* Python 2.7. | |
* A trivial Django 1.8 project with the 'the-real-django-wordpress' | |
app/package installed (it's on PyPI). It's assumed that the project | |
has been configured to allow this app to access your Wordpress | |
content. | |
* The 'toml' package, for writing toml front-matter. | |
It should be noted that I used some of these | |
slightly-outdated-as-of-2017 technologies because my most recent | |
SQL dump of my WordPress site was from 2014. So I was specifically | |
looking for tools that would understand a WordPress database | |
schema from that era. | |
''' | |
import os | |
import toml | |
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "recover.settings") | |
import django | |
django.setup() | |
from wordpress.models import Post | |
# Root of the Hugo site, relative to the current working directory.
HUGO_DIR = 'hugo'

# Directory that receives one Markdown file per migrated post.
HUGO_POSTS_DIR = os.path.join(HUGO_DIR, 'content', 'post')

# Every category name seen while migrating; printed once at the end.
categories = set()
import codecs | |
def charref(s):
    '''
    Given a unicode string, return a unicode string in which every
    non-ASCII character has been replaced by its numeric XML character
    reference (e.g. u"\\u00e7" becomes u"&#231;"). ASCII characters
    are passed through unchanged.
    '''

    # 'xmlcharrefreplace' yields pure-ASCII bytes, so decoding them
    # explicitly as ASCII can never fail and does not depend on the
    # interpreter's default encoding.
    return s.encode('ascii', 'xmlcharrefreplace').decode('ascii')
# Something weird was going on with my Wordpress SQL dump, where
# some posts had non-ASCII characters that were encoded as UTF-8
# but mis-interpreted as a different character set. It was confusing
# to figure out how to fix them purely through character set
# conversions; once I collected some data and found out that there
# were only about a dozen unique garbled sequences, though, it was
# easy to just create a conversion map.
#
# Keys are the garbled non-ASCII sequences exactly as they appear in
# the post content; values are what to substitute for them. Every
# value must be pure ASCII, because the substitution happens inside an
# ASCII encode: a non-ASCII replacement makes the encoder raise
# UnicodeEncodeError. Non-ASCII replacements therefore go through
# charref().
conversion_map = {
    u"\u00e2\u20ac\u2122": u"'",
    u"\u00e2\u20ac\u02dc": u"'",
    u"\u00e2\u20ac\u2122\u00e2\u20ac\u009d": u" ",
    u"\u00e2\u20ac\u0153": u'"',
    # U+2014 is an em dash.
    u"\u00e2\u20ac\u201c": charref(u"\u2014"),
    u"\u00e2\u20ac\u201d": charref(u"\u2014"),
    u"\u00e2\u20ac\u009d": u'"',
    u"\u00e2\u20ac\u0161\u00c3\u201e\u00c3\u00ac": u" ",
    u"\u00c3\u00a7": charref(u"\u00e7"),
    u"\u00c4\u2021": u"c",
    u"\u00c3\u00af": charref(u"\u00ef"),
}
# A mapping from unrecognized character sequences to lists of
# (slug, occurrence w/ surrounding text) tuples logging information about
# the occurrences of a particular sequence.
bad_chars = {}

# Global variable used to keep track of what the current slug being
# processed is.
curr_slug = ''


def myfixerupper(error):
    '''
    Python codec error handler that attempts to fix up non-ASCII
    character sequences through the conversion map.

    `error` is the UnicodeEncodeError describing the unencodable span
    (`error.object[error.start:error.end]`).

    Returns a `(replacement, resume_position)` tuple. If the span is a
    key of `conversion_map`, the replacement is the mapped value;
    otherwise the occurrence is logged in `bad_chars` (together with
    the current slug and up to 10 characters of context on each side)
    and the unrecognized character sequence is replaced with 'LOL'.
    '''

    invalid_chars = error.object[error.start:error.end]
    if invalid_chars in conversion_map:
        return (conversion_map[invalid_chars], error.end)

    # Clamp the left edge at 0: a negative slice start would wrap
    # around to the end of the text and record empty or wrong context
    # for sequences within the first 10 characters.
    context_start = max(0, error.start - 10)
    bad_chars.setdefault(invalid_chars, []).append(
        (curr_slug, error.object[context_start:error.end + 10])
    )
    return (u'LOL', error.end)


codecs.register_error('myfixerupper', myfixerupper)
# Write one Hugo Markdown file (TOML front matter + body) per published
# Wordpress post.
#
# The fields on `post` were found by perusing:
# https://github.com/istrategylabs/django-wordpress/blob/master/wordpress/models.py
for post in Post.objects.published():
    print("Processing {}...".format(post.slug))
    filename = os.path.join(HUGO_POSTS_DIR, post.slug + '.md')
    with open(filename, 'w') as outfile:
        def write(content):
            '''
            Write `content` to the current post's file, encoding it as
            UTF-8 first if it is a unicode string.
            '''
            if isinstance(content, unicode):
                content = content.encode('utf-8')
            outfile.write(content)

        pubdate = post.post_date.date()
        metadata = {
            'date': pubdate.isoformat(),
            'title': post.title,
            'categories': [
                # Undo HTML escaping of ampersands in category names
                # (assumes Wordpress stores '&' as '&amp;' -- verify
                # against your own dump).
                cat.name.replace('&amp;', '&') for cat in post.categories()
            ],
            # Hugo URL of the form YYYY/MM/slug.
            'url': '{}/{}'.format(
                pubdate.strftime('%Y/%m'),
                post.slug,
            )
        }
        categories.update(metadata['categories'])

        # TOML front matter is delimited by '+++' lines.
        write(u'+++\n')
        write(toml.dumps(metadata))
        write(u'+++\n\n')

        # The error handler reads this module-level name to record
        # which post an unrecognized sequence came from.
        curr_slug = post.slug

        # I made a simple Hugo shortcode that just inserts the
        # location of the directory where I put all my old wordpress
        # uploads, so that the location of those uploads stays decoupled
        # from the blog content.
        #
        # The following replacement just swaps out absolute URLs to
        # my uploads with that shortcode.
        content = post.content.replace(
            u'http://www.toolness.com/wp/wp-content/uploads',
            u'{{< wordpress-upload-base-url >}}',
        ).encode('ascii', 'myfixerupper')
        write(content)

print("Done.")
print("Final categories: {}".format(categories))
import json

# Save every unrecognized sequence, with the slug and surrounding text of
# each occurrence, so that it can be reviewed by hand afterwards.
print("writing {} bad characters to 'bad_chars.json'.".format(len(bad_chars)))
with open('bad_chars.json', 'w') as report:
    json.dump(
        bad_chars,
        report,
        indent=4,
        separators=(',', ': '),
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment