toolness/migrate_to_hugo.py

## migrate_to_hugo.py
# coding=utf-8

'''
I wrote this hacky script to migrate my Wordpress content from 2014
to Hugo posts. It requires:

* Python 2.7.

* A trivial Django 1.8 project with the 'the-real-django-wordpress'
  app/package installed (it's on PyPI). It's assumed that the project
  has been configured to allow this app to access your Wordpress
  content.

* The 'toml' package, for writing toml front-matter.

It should be noted that I used some of these
slightly-outdated-as-of-2017 technologies because my most recent
SQL dump of my WordPress site was from 2014. So I was specifically
looking for tools that would understand a WordPress database
schema from that era.
'''

import os
import toml

# Tell Django which settings module to load. `setdefault` only fills the
# variable in when the environment does not already provide a value.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "recover.settings")

import django

# Initialize Django; this has to happen before the model import below.
django.setup()

# This import comes after `django.setup()` on purpose of ordering: Django
# models generally need the app registry to be loaded first.
from wordpress.models import Post

# Output locations. Both are relative paths, so they resolve against the
# directory the script is run from.
HUGO_DIR = os.path.join('hugo')
HUGO_POSTS_DIR = os.path.join(HUGO_DIR, 'content', 'post')

# Every category name seen on any processed post; printed when the
# script finishes.
categories = set()

import codecs

def charref(s):
    '''
    Given a unicode string, return a unicode string in which every
    non-ASCII character has been replaced by its XML numeric character
    reference (for example, c-cedilla becomes "&#231;"). ASCII
    characters are passed through unchanged.
    '''

    # Encoding with 'xmlcharrefreplace' yields pure-ASCII bytes. Decode
    # them explicitly instead of relying on `unicode()`'s implicit
    # default-encoding conversion; the result is the same on Python 2.7
    # and the expression also works on interpreters without `unicode`.
    return s.encode('ascii', 'xmlcharrefreplace').decode('ascii')

# Something weird was going on with my Wordpress SQL dump, where
# some posts had non-ASCII characters that were encoded as UTF-8
# but mis-interpreted as a different character set. It was confusing
# to figure out how to fix them purely through character set
# conversions; once I collected some data and found out that there
# were only about a dozen unique garbled sequences, though, it was
# easy to just create a conversion map.
conversion_map = {
    # Keys are garbled sequences exactly as they appear in the post
    # text; values are the replacements written to the output (all of
    # them plain ASCII). `myfixerupper` looks up the whole run of
    # unencodable characters at once, so a key only matches when the run
    # is exactly that sequence.
    #
    # NOTE(review): the keys look like UTF-8 bytes that were decoded as
    # Windows-1252/latin1. The per-entry notes below name the character
    # each key would correspond to under that assumption -- confirm
    # against the original data before relying on them.

    # U+2019 right single quotation mark.
    u"\u00e2\u20ac\u2122": u"'",
    # U+2018 left single quotation mark.
    u"\u00e2\u20ac\u02dc": u"'",
    # U+2019 immediately followed by U+201D; replaced by a single space.
    u"\u00e2\u20ac\u2122\u00e2\u20ac\u009d": u" ",
    # U+201C left double quotation mark.
    u"\u00e2\u20ac\u0153": u'"',
    # U+2013 en dash (note: written out as an em dash entity).
    u"\u00e2\u20ac\u201c": u"&mdash;",
    # U+2014 em dash.
    u"\u00e2\u20ac\u201d": u"&mdash;",
    # U+201D right double quotation mark.
    u"\u00e2\u20ac\u009d": u'"',
    # Longer garbled run (possibly a doubly mis-decoded dash -- TODO
    # confirm); replaced by a single space.
    u"\u00e2\u20ac\u0161\u00c3\u201e\u00c3\u00ac": u" ",
    # U+00E7 c-cedilla, emitted as a numeric character reference.
    u"\u00c3\u00a7": charref(u"ç"),
    # U+0107 c-acute, flattened to a plain "c".
    u"\u00c4\u2021": u"c",
    # U+00EF i-diaeresis, emitted as a numeric character reference.
    u"\u00c3\u00af": charref(u"ï"),
}

# A mapping from each unrecognized character sequence to a list of
# (slug, occurrence w/ surrounding text) tuples, one tuple per
# occurrence of that sequence. It is filled in by `myfixerupper` and
# dumped to 'bad_chars.json' at the end of the script.
bad_chars = {}

# Global variable used to keep track of what the current slug being
# processed is. The main loop assigns it for each post and
# `myfixerupper` reads it when logging an unrecognized sequence.
curr_slug = ''

def myfixerupper(error):
    '''
    Codec error handler that attempts to fix up non-ASCII character
    sequences through the conversion map.

    `error` is the UnicodeEncodeError raised while encoding; the slice
    `error.object[error.start:error.end]` is the run of characters that
    could not be encoded.

    Returns a `(replacement, position_to_resume_at)` tuple, which is
    what `codecs.register_error` handlers are expected to return. If
    the run has an entry in `conversion_map`, that entry is the
    replacement. Otherwise the occurrence is logged in `bad_chars`
    (with the current slug and up to 10 characters of context on each
    side) and the unrecognized run is replaced with 'LOL'.
    '''

    invalid_chars = error.object[error.start:error.end]
    if invalid_chars in conversion_map:
        return (conversion_map[invalid_chars], error.end)

    # Clamp the start of the context window at 0. A negative slice start
    # counts from the END of the text, so a sequence found within the
    # first 10 characters of a post would otherwise be logged with the
    # wrong (usually empty) surrounding text.
    context_start = max(0, error.start - 10)
    bad_chars.setdefault(invalid_chars, []).append(
        (curr_slug, error.object[context_start:error.end + 10])
    )
    return (u'LOL', error.end)

# Make the handler available as `errors='myfixerupper'` to `.encode()`.
codecs.register_error('myfixerupper', myfixerupper)

# The fields on `post` were found by perusing:
# https://github.com/istrategylabs/django-wordpress/blob/master/wordpress/models.py
for post in Post.objects.published():
    print("Processing {}...".format(post.slug))
    post_path = os.path.join(HUGO_POSTS_DIR, post.slug + '.md')
    with open(post_path, 'w') as out:
        def write(content):
            '''
            Write `content` to the current post's file. Unicode text is
            encoded as UTF-8 first; anything else is written as-is.
            '''

            if not isinstance(content, unicode):
                out.write(content)
                return
            out.write(content.encode('utf-8'))

        # Gather the front-matter values one at a time, then assemble
        # them into the metadata dict.
        pubdate = post.post_date.date()
        date_text = pubdate.isoformat()
        title = post.title
        category_names = []
        for cat in post.categories():
            # Turn any '&amp;' in the category name back into a bare '&'.
            category_names.append(cat.name.replace('&amp;', '&'))
        url = '{}/{}'.format(pubdate.strftime('%Y/%m'), post.slug)
        metadata = {
            'date': date_text,
            'title': title,
            'categories': category_names,
            'url': url,
        }

        # Remember every category seen so it can be reported at the end.
        categories.update(category_names)

        # The TOML front matter sits between two '+++' lines, followed
        # by a blank line before the post body.
        write(u'+++\n')
        write(toml.dumps(metadata))
        write(u'+++\n\n')

        # `myfixerupper` reads this global when it logs an unrecognized
        # sequence, so set it before the body is encoded.
        curr_slug = post.slug

        # Absolute URLs to the old WordPress uploads are swapped for a
        # small Hugo shortcode that inserts the location of the uploads
        # directory, keeping that location decoupled from the content.
        body = post.content.replace(
            u'http://www.toolness.com/wp/wp-content/uploads',
            u'{{< wordpress-upload-base-url >}}',
        )

        # Encode to ASCII, repairing or logging anything non-ASCII.
        write(body.encode('ascii', 'myfixerupper'))

# Summarize the run on stdout.
print("Done.")
print("Final categories: {}".format(categories))

import json

# Report how many distinct unrecognized sequences were logged, then save
# them (with their occurrences) for later inspection.
bad_char_count = len(bad_chars)
print("writing {} bad characters to 'bad_chars.json'.".format(bad_char_count))

with open('bad_chars.json', 'w') as report:
    json.dump(
        bad_chars,
        report,
        indent=4,
        separators=(',', ': '),
    )
	# coding=utf-8

	'''
	I wrote this hacky script to migrate my Wordpress content from 2014
	to Hugo posts. It requires:

	* Python 2.7.

	* A trivial Django 1.8 project with the 'the-real-django-wordpress'
	  app/package installed (it's on PyPI). It's assumed that the project
	  has been configured to allow this app to access your Wordpress
	  content.

	* The 'toml' package, for writing toml front-matter.

	It should be noted that I used some of these
	slightly-outdated-as-of-2017 technologies because my most recent
	SQL dump of my WordPress site was from 2014. So I was specifically
	looking for tools that would understand a WordPress database
	schema from that era.
	'''

	import os
	import toml

	# Tell Django which settings module to load. `setdefault` only fills the
	# variable in when the environment does not already provide a value.
	os.environ.setdefault("DJANGO_SETTINGS_MODULE", "recover.settings")

	import django

	# Initialize Django; this has to happen before the model import below.
	django.setup()

	# This import comes after `django.setup()`: Django models generally
	# need the app registry to be loaded first.
	from wordpress.models import Post

	# Output locations. Both are relative paths, so they resolve against the
	# directory the script is run from.
	HUGO_DIR = os.path.join('hugo')
	HUGO_POSTS_DIR = os.path.join(HUGO_DIR, 'content', 'post')

	# Every category name seen on any processed post; printed when the
	# script finishes.
	categories = set()

	import codecs

	def charref(s):
	    '''
	    Given a unicode string, return a unicode string in which every
	    non-ASCII character has been replaced by its XML numeric character
	    reference (for example, c-cedilla becomes "&#231;"). ASCII
	    characters are passed through unchanged.
	    '''

	    # Encoding with 'xmlcharrefreplace' yields pure-ASCII bytes. Decode
	    # them explicitly instead of relying on `unicode()`'s implicit
	    # default-encoding conversion; the result is the same on Python 2.7
	    # and the expression also works on interpreters without `unicode`.
	    return s.encode('ascii', 'xmlcharrefreplace').decode('ascii')

	# Something weird was going on with my Wordpress SQL dump, where
	# some posts had non-ASCII characters that were encoded as UTF-8
	# but mis-interpreted as a different character set. It was confusing
	# to figure out how to fix them purely through character set
	# conversions; once I collected some data and found out that there
	# were only about a dozen unique garbled sequences, though, it was
	# easy to just create a conversion map.
	conversion_map = {
	    # Keys are garbled sequences exactly as they appear in the post
	    # text; values are the replacements written to the output.
	    # `myfixerupper` returns these values from inside an ASCII encode,
	    # so every value must itself be plain ASCII -- hence the dashes are
	    # spelled as the `&mdash;` entity rather than a literal dash.
	    #
	    # NOTE(review): the keys look like UTF-8 bytes that were decoded as
	    # Windows-1252/latin1 (quotes, dashes and a few accented letters)
	    # -- confirm against the original data.
	    u"\u00e2\u20ac\u2122": u"'",
	    u"\u00e2\u20ac\u02dc": u"'",
	    u"\u00e2\u20ac\u2122\u00e2\u20ac\u009d": u" ",
	    u"\u00e2\u20ac\u0153": u'"',
	    u"\u00e2\u20ac\u201c": u"&mdash;",
	    u"\u00e2\u20ac\u201d": u"&mdash;",
	    u"\u00e2\u20ac\u009d": u'"',
	    u"\u00e2\u20ac\u0161\u00c3\u201e\u00c3\u00ac": u" ",
	    u"\u00c3\u00a7": charref(u"ç"),
	    u"\u00c4\u2021": u"c",
	    u"\u00c3\u00af": charref(u"ï"),
	}

	# A mapping from each unrecognized character sequence to a list of
	# (slug, occurrence w/ surrounding text) tuples, one tuple per
	# occurrence of that sequence. It is filled in by `myfixerupper` and
	# dumped to 'bad_chars.json' at the end of the script.
	bad_chars = {}

	# Global variable used to keep track of what the current slug being
	# processed is. The main loop assigns it for each post and
	# `myfixerupper` reads it when logging an unrecognized sequence.
	curr_slug = ''

	def myfixerupper(error):
	    '''
	    Codec error handler that attempts to fix up non-ASCII character
	    sequences through the conversion map.

	    `error` is the UnicodeEncodeError raised while encoding; the slice
	    `error.object[error.start:error.end]` is the run of characters that
	    could not be encoded.

	    Returns a `(replacement, position_to_resume_at)` tuple, which is
	    what `codecs.register_error` handlers are expected to return. If
	    the run has an entry in `conversion_map`, that entry is the
	    replacement. Otherwise the occurrence is logged in `bad_chars`
	    (with the current slug and up to 10 characters of context on each
	    side) and the unrecognized run is replaced with 'LOL'.
	    '''

	    invalid_chars = error.object[error.start:error.end]
	    if invalid_chars in conversion_map:
	        return (conversion_map[invalid_chars], error.end)

	    # Clamp the start of the context window at 0. A negative slice start
	    # counts from the END of the text, so a sequence found within the
	    # first 10 characters of a post would otherwise be logged with the
	    # wrong (usually empty) surrounding text.
	    context_start = max(0, error.start - 10)
	    bad_chars.setdefault(invalid_chars, []).append(
	        (curr_slug, error.object[context_start:error.end + 10])
	    )
	    return (u'LOL', error.end)

	# Make the handler available as `errors='myfixerupper'` to `.encode()`.
	codecs.register_error('myfixerupper', myfixerupper)

	# The fields on `post` were found by perusing:
	# https://github.com/istrategylabs/django-wordpress/blob/master/wordpress/models.py
	for post in Post.objects.published():
	    print("Processing {}...".format(post.slug))
	    filename = os.path.join(HUGO_POSTS_DIR, post.slug + '.md')
	    with open(filename, 'w') as outfile:
	        def write(content):
	            '''
	            Write `content` to the current post's file, encoding
	            unicode text as UTF-8 first.
	            '''

	            if isinstance(content, unicode):
	                content = content.encode('utf-8')
	            outfile.write(content)

	        pubdate = post.post_date.date()
	        metadata = {
	            'date': pubdate.isoformat(),
	            'title': post.title,
	            # Turn any '&amp;' in a category name back into a bare '&'.
	            'categories': [
	                cat.name.replace('&amp;', '&') for cat in post.categories()
	            ],
	            'url': '{}/{}'.format(
	                pubdate.strftime('%Y/%m'),
	                post.slug,
	            )
	        }

	        # Remember every category seen so it can be reported at the end.
	        categories.update(metadata['categories'])

	        # The TOML front matter sits between two '+++' lines.
	        write(u'+++\n')
	        write(toml.dumps(metadata))
	        write(u'+++\n\n')

	        # `myfixerupper` reads this global when it logs an unrecognized
	        # sequence, so set it before the body is encoded.
	        curr_slug = post.slug

	        # I made a simple Hugo shortcode that just inserts the
	        # location of the directory where I put all my old wordpress
	        # uploads, so that the location of those uploads stays decoupled
	        # from the blog content.
	        #
	        # The following replacement just swaps out absolute URLs to
	        # my uploads with that shortcode.
	        content = post.content.replace(
	            u'http://www.toolness.com/wp/wp-content/uploads',
	            u'{{< wordpress-upload-base-url >}}',
	        ).encode('ascii', 'myfixerupper')

	        write(content)

	# Summarize the run on stdout.
	print("Done.")
	print("Final categories: {}".format(categories))

	import json

	# Report how many distinct unrecognized sequences were logged, then save
	# them (with their occurrences) for later inspection.
	print("writing {} bad characters to 'bad_chars.json'.".format(len(bad_chars)))

	with open('bad_chars.json', 'w') as f:
	    json.dump(bad_chars, f, indent=4, separators=(',', ': '))