Skip to content

Instantly share code, notes, and snippets.

@mtik00
Created July 24, 2015 18:45
Show Gist options
  • Save mtik00/75c8f555b49365395e32 to your computer and use it in GitHub Desktop.
Save mtik00/75c8f555b49365395e32 to your computer and use it in GitHub Desktop.
This script is used to convert a WordPress XML dump to Hugo-formatted posts.
#!/usr/bin/env python2.7
"""
This script is used to convert a WordPress XML dump to Hugo-formatted posts.
NOTE: The WP post data is kept as-is (probably HTML). It is not converted to
Markdown. This is to reduce the amount of "fixing" one has to do after the
data is converted (e.g. line endings, links, etc). This is generally not an
issue since Markdown allows HTML.
The post Metadata is converted to TOML.
The posts are written as: <year>/<title>.md
where <year> is the year the post was written, and <title> is the WP title
with all non-word characters replaced with "-", and converted to lower case.
"""
# Imports ######################################################################
import os
import re
import time
import calendar
import xml.etree.ElementTree as ET
from distutils.version import LooseVersion
# Metadata #####################################################################
__author__ = "Timothy McFadden"
__creationDate__ = "07/24/2015"
__license__ = "MIT"
__version__ = "1.0.0dev"
# Globals ######################################################################
DEBUG = False
KNOWN_WP_VERSION = LooseVersion("4.2")
def wp_to_hugo_date(wp_date, tz_direction=-1):
    """Convert a UTC time string from the WordPress XML to a Hugo time string.

    :param wp_date: publication date as written in ``<pubDate>``, e.g.
        ``"Fri, 24 Jul 2015 18:45:00 +0000"``.  The literal ``+0000`` suffix
        is required.
    :param tz_direction: multiplier applied to the "seconds west of UTC"
        offset before it is formatted.  The default of ``-1`` yields the
        usual east-positive suffix (``-06:00`` for a zone six hours behind
        UTC); ``1`` flips the sign.
    :returns: ``(ltime, date)`` where ``ltime`` is the ``time.struct_time``
        of the post in the machine's local time zone and ``date`` is a string
        such as ``"2015-07-24T12:45:00-06:00"``.
    :raises ValueError: if ``wp_date`` does not match the expected format.
    """
    parsed = time.strptime(wp_date, "%a, %d %b %Y %H:%M:%S +0000")
    epoch = calendar.timegm(parsed)
    ltime = time.localtime(epoch)

    # Re-reading the local wall-clock fields as if they were UTC gives the
    # offset that was really in effect at that instant.  Unlike
    # ``time.timezone / 3600`` this keeps the minutes of zones such as +05:30
    # and does not assume that DST is always exactly one hour.
    seconds_west = epoch - calendar.timegm(ltime)
    offset = int(seconds_west * tz_direction)
    sign = "-" if offset < 0 else "+"
    hours, remainder = divmod(abs(offset), 3600)

    date = time.strftime("%Y-%m-%dT%H:%M:%S", ltime)
    date += "%s%02d:%02d" % (sign, hours, remainder // 60)
    return ltime, date
def _toml_string(value):
    """Return *value* as a double-quoted, escaped TOML basic string."""
    text = "" if value is None else "%s" % (value,)
    # Backslashes must be escaped first so the escapes added below survive.
    text = text.replace("\\", "\\\\").replace('"', '\\"')
    text = text.replace("\n", "\\n").replace("\r", "\\r").replace("\t", "\\t")
    return '"%s"' % text


def hugo_format(data):
    """Render one post as Hugo content: TOML front matter, then the body.

    :param data: mapping with the keys ``title``, ``date``, ``type`` (scalar
        values), ``tags`` (iterable of strings) and ``body`` (the post
        content, or ``None`` for an empty post).
    :returns: the complete file content as a single string.
    :raises KeyError: if one of the keys listed above is missing.
    """
    lines = ["+++"]
    for heading in ("title", "date", "type"):
        # Escape the value: a raw '"' or '\' would produce invalid TOML.
        lines.append("%s = %s" % (heading, _toml_string(data[heading])))

    # Build the array by hand instead of using str(list), which emits
    # Python-only syntax such as [u'tag'] for unicode strings on Python 2.
    tags = ", ".join(_toml_string(tag) for tag in data["tags"])
    lines.append("tags = [%s]" % tags)

    lines.append("+++")
    lines.append("")
    # A post without content has a body of None, which join() cannot handle.
    lines.append(data["body"] or "")
    return "\n".join(lines)
def _warn_and_pause(message):
    """Print a warning and block until the user presses Enter."""
    print("WARNING: %s" % message)
    print("...This script may not work")
    raw_input("...press Enter to continue: ")


def wp_version_check(channel):
    """Warn when the export's WordPress version is unknown or too old.

    The version is read from the ``?v=<version>`` suffix of the text of the
    ``<generator>`` child of ``channel``.  A warning is printed, and the
    script waits for the user to press Enter, when no version can be found
    or when it is lower than ``KNOWN_WP_VERSION``.

    :param channel: the ``<channel>`` element of the WordPress XML dump.
    """
    # findtext() returns None when <generator> is absent and "" when it is
    # empty; both are treated as "no version found" instead of crashing.
    generator = channel.findtext("generator") or ""
    match = re.search(r"\?v=([\d\.]+)", generator)
    if not match:
        _warn_and_pause("Could not find WP version in your XML.")
        return

    wp_version = LooseVersion(match.group(1))
    if wp_version < KNOWN_WP_VERSION:
        _warn_and_pause(
            "WP version in your XML (%s) is less than known good version (%s)!"
            % (wp_version, KNOWN_WP_VERSION)
        )
def convert_wp_xml(xml_path):
    """Convert every ``<item>`` of a WordPress XML dump to a Hugo post file.

    Each post is written, UTF-8 encoded, to ``./<year>/<slug>.md`` relative
    to the current working directory.  ``<year>`` is the local-time year of
    the post's ``pubDate``; ``<slug>`` is the lower-cased title with every
    run of non-word characters replaced by "-" and trailing dashes removed.

    Items whose ``pubDate`` is missing or cannot be parsed are reported and
    skipped rather than aborting the whole conversion.  Items with an empty
    title are written as ``untitled``.  A numeric suffix is appended when two
    posts of the same run would otherwise share a file name.

    :param xml_path: path of the WordPress export (WXR) file.
    """
    tree = ET.parse(xml_path)

    # FYI: xml.etree doesn't support reading the namespaces, and I don't feel
    # like requiring lxml.
    nsmap = {
        "excerpt": "http://wordpress.org/export/1.2/excerpt/",
        "content": "http://purl.org/rss/1.0/modules/content/",
        "wfw": "http://wellformedweb.org/CommentAPI/",
        "dc": "http://purl.org/dc/elements/1.1/",
        "wp": "http://wordpress.org/export/1.2/",
    }

    channel = tree.find("channel")
    wp_version_check(channel)

    written = set()  # paths created during this run, to avoid overwriting
    for item in channel.findall("item"):
        # An empty <title></title> has text None; use "" so the slug code
        # below and the front matter both get a real string.
        title = item.findtext("title") or ""

        pub_date = item.findtext("pubDate")
        try:
            ltime, date = wp_to_hugo_date(pub_date)
        except (TypeError, ValueError):
            # A missing <pubDate> gives None (TypeError); a date that
            # strptime cannot parse, e.g. year -0001, raises ValueError.
            print("Skipped: %r (unusable pubDate %r)" % (title, pub_date))
            continue

        data = {
            "title": title,
            "date": date,
            "type": "post",
            "tags": [
                category.attrib["nicename"]
                for category in item.findall("category")
                if "nicename" in category.attrib
            ],
            "body": item.findtext("content:encoded", "", nsmap) or "",
        }

        slug = re.sub(r"\W+", "-", title)
        # A title made only of punctuation collapses to "", hence "untitled".
        slug = re.sub(r"(-+)$", "", slug).lower() or "untitled"

        year = str(ltime.tm_year)
        fdir = os.path.abspath(os.path.join(".", year))
        fname = "%s.md" % slug
        fpath = os.path.join(fdir, fname)

        # Two posts with the same title in the same year would silently
        # overwrite each other; give later ones a numeric suffix instead.
        duplicate = 1
        while fpath in written:
            duplicate += 1
            fname = "%s-%d.md" % (slug, duplicate)
            fpath = os.path.join(fdir, fname)
        written.add(fpath)

        hugo_text = hugo_format(data)

        if not os.path.isdir(fdir):
            os.makedirs(fdir)

        with open(fpath, "wb") as fh:
            fh.write(hugo_text.encode('UTF-8'))

        print("Created: %s/%s" % (year, fname))
if __name__ == '__main__':
    import sys

    # Command-line entry point: a single element in argv means that only the
    # script name was given, i.e. the XML path is missing.
    if len(sys.argv) != 1:
        convert_wp_xml(sys.argv[1])
    else:
        print("Usage: python wp_to_hugo.py <wordpress XML file>")
        sys.exit(1)
@clottman
Copy link

Used this today and it took a bit of modification to my XML (replacing a few dates that were year -0001 for some reason, and adding in a missing title: <title></title> became <title>?</title>), but it otherwise works great! Thank you for posting your solution.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment