Created
July 24, 2015 18:45
-
-
Save mtik00/75c8f555b49365395e32 to your computer and use it in GitHub Desktop.
This script is used to convert a WordPress XML dump to Hugo-formatted posts.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.7
"""
This script is used to convert a WordPress XML dump to Hugo-formatted posts.

NOTE: The WP post data is kept as-is (probably HTML).  It is not converted to
Markdown.  This is to reduce the amount of "fixing" one has to do after the
data is converted (e.g. line endings, links, etc).  This is generally not an
issue since Markdown allows HTML.

The post Metadata is converted to TOML.

The posts are written as: <year>/<title>.md
where <year> is the year the post was written, and <title> is the WP title
with all non-word characters replaced with "-", and converted to lower case.
"""

# Imports ######################################################################
import os
import re
import time
import calendar
import xml.etree.ElementTree as ET
from distutils.version import LooseVersion

# Metadata #####################################################################
__author__ = "Timothy McFadden"
__creationDate__ = "07/24/2015"
__license__ = "MIT"
__version__ = "1.0.0dev"

# Globals ######################################################################
# Debug switch; none of the functions visible in this script consult it.
DEBUG = False

# Exports produced by a WordPress version older than this trigger a warning
# in wp_version_check().
KNOWN_WP_VERSION = LooseVersion("4.2")
def wp_to_hugo_date(wp_date, tz_direction=-1):
    """Convert a UTC time string from the WordPress XML to a Hugo time string.

    :param wp_date: UTC timestamp in the ``<pubDate>`` layout, e.g.
        ``"Fri, 24 Jul 2015 18:45:00 +0000"``.
    :param tz_direction: multiplier applied to the local offset *west* of UTC
        before it is formatted.  The default ``-1`` produces the usual
        "east of UTC" sign (``-06:00`` for US Mountain daylight time).
    :returns: ``(ltime, date)``; ``ltime`` is the ``time.struct_time`` of the
        post in the machine's local time zone and ``date`` is the same instant
        formatted as ``YYYY-MM-DDTHH:MM:SS+HH:MM``.
    :raises ValueError: if ``wp_date`` does not match the expected layout.
    """
    parsed = time.strptime(wp_date, "%a, %d %b %Y %H:%M:%S +0000")
    epoch = calendar.timegm(parsed)
    ltime = time.localtime(epoch)
    stamp = time.strftime("%Y-%m-%dT%H:%M:%S", ltime)

    # Reading the local wall-clock fields back as if they were UTC gives the
    # exact offset that was in force at that instant.  Unlike
    # ``time.timezone / 3600 - tm_isdst`` this keeps half-hour zones
    # (e.g. +05:30) intact and does not assume DST is exactly one hour.
    seconds_east = calendar.timegm(ltime) - epoch
    offset = int(-seconds_east * tz_direction)

    sign = "-" if offset < 0 else "+"
    hours, minutes = divmod(abs(offset) // 60, 60)
    date = "%s%s%02d:%02d" % (stamp, sign, hours, minutes)
    return ltime, date
# Characters that must be backslash-escaped inside a TOML basic string.
_TOML_ESCAPES = {
    "\\": "\\\\",
    '"': '\\"',
    "\b": "\\b",
    "\t": "\\t",
    "\n": "\\n",
    "\f": "\\f",
    "\r": "\\r",
}


def _toml_string(value):
    """Return ``value`` rendered as a double-quoted TOML basic string."""
    text = "" if value is None else "%s" % (value,)
    pieces = []
    for char in text:
        if char in _TOML_ESCAPES:
            pieces.append(_TOML_ESCAPES[char])
        elif ord(char) < 0x20 or ord(char) == 0x7F:
            # Remaining control characters are not allowed raw in TOML.
            pieces.append("\\u%04X" % ord(char))
        else:
            pieces.append(char)
    return '"%s"' % "".join(pieces)


def hugo_format(data):
    """Render one post as Hugo content: TOML front matter followed by the body.

    :param data: mapping with the keys ``title``, ``date``, ``type`` (scalars),
        ``tags`` (iterable of strings) and ``body`` (string or ``None``).
    :returns: the text of the post file; the front matter is delimited by
        ``+++`` lines and separated from the body by one blank line.

    String values are escaped so that quotes or backslashes in a title cannot
    break the front matter, and tags are written as a TOML array of basic
    strings rather than as a Python ``repr`` (which yields ``u'...'`` items
    for unicode tags on Python 2).
    """
    result = ["+++"]
    for heading in ("title", "date", "type"):
        result.append("%s = %s" % (heading, _toml_string(data[heading])))

    tags = ", ".join(_toml_string(tag) for tag in (data["tags"] or []))
    result.append("tags = [%s]" % tags)

    result.append("+++")
    result.append("")
    # A post with empty content has no text node; write an empty body then.
    result.append(data["body"] or "")
    return "\n".join(result)
def wp_version_check(channel):
    """Warn (and wait for Enter) if the export's WordPress version is suspect.

    The version is read from the ``?v=<version>`` suffix of the text of the
    channel's ``<generator>`` element.  A warning is printed when no version
    can be found or when it is older than ``KNOWN_WP_VERSION``; in both cases
    the user has to press Enter before the conversion continues.

    :param channel: the ``<channel>`` element of the WordPress export.
    """
    # ``raw_input`` only exists on Python 2; ``input`` is its Python 3 name.
    try:
        prompt = raw_input
    except NameError:
        prompt = input

    # A missing or empty <generator> is treated like "no version found"
    # instead of failing with AttributeError / TypeError.
    generator = channel.find("generator")
    generator_text = ""
    if generator is not None and generator.text:
        generator_text = generator.text

    match = re.search(r"\?v=([\d\.]+)", generator_text)
    if not match:
        print("WARNING: Could not find WP version in your XML.")
    else:
        wp_version = LooseVersion(match.group(1))
        if not wp_version < KNOWN_WP_VERSION:
            return
        print("WARNING: WP version in your XML (%s) is less than known good version (%s)!" % (wp_version, KNOWN_WP_VERSION))

    print("...This script may not work")
    prompt("...press Enter to continue: ")
def convert_wp_xml(xml_path):
    """Convert every ``<item>`` of a WordPress export into a Hugo post file.

    Each post is written, UTF-8 encoded, to ``./<year>/<slug>.md`` relative to
    the current working directory.  ``<year>`` is the local-time year of the
    item's ``pubDate`` and ``<slug>`` is the lower-cased title with every run
    of non-word characters replaced by ``-`` (trailing dashes removed).

    Items whose ``pubDate`` cannot be parsed (WordPress exports drafts with a
    year of ``-0001``) are skipped with a warning instead of aborting the
    whole run; items without a usable title are written as ``untitled.md``.

    :param xml_path: path of the WordPress XML export.
    :raises ValueError: if the document has no ``<channel>`` element.
    """
    tree = ET.parse(xml_path)

    # FYI: xml.etree doesn't support reading the namespaces, and I don't feel
    # like requiring lxml.
    nsmap = {
        "excerpt": "http://wordpress.org/export/1.2/excerpt/",
        "content": "http://purl.org/rss/1.0/modules/content/",
        "wfw": "http://wellformedweb.org/CommentAPI/",
        "dc": "http://purl.org/dc/elements/1.1/",
        "wp": "http://wordpress.org/export/1.2/",
    }

    channel = tree.find("channel")
    if channel is None:
        raise ValueError("No <channel> element found in %s" % (xml_path,))

    wp_version_check(channel)

    for item in channel.findall("item"):
        # findtext() yields None for a missing element and "" for an empty
        # one; normalise both to "" so the regex and formatting cannot fail.
        title = item.findtext("title") or ""
        pub_date = item.findtext("pubDate") or ""

        try:
            ltime, date = wp_to_hugo_date(pub_date)
        except ValueError:
            print("WARNING: Skipping %r: cannot parse pubDate %r" % (title, pub_date))
            continue

        data = {
            "tags": [],
            "title": title,
            "date": date,
            "body": None,
            "fpath": None,
            "type": "post",
        }

        data["tags"] = [
            category.get("nicename")
            for category in item.findall("category")
            if category.get("nicename")
        ]
        data["body"] = item.findtext("content:encoded", "", nsmap) or ""

        fname = re.sub(r"\W+", "-", title)
        fname = re.sub(r"(-+)$", "", fname).lower()
        # A title made only of punctuation would otherwise become ".md".
        data["fname"] = "%s.md" % (fname or "untitled")
        data["fdir"] = os.path.abspath(os.path.join(".", str(ltime.tm_year)))
        data["fpath"] = os.path.join(data["fdir"], data["fname"])

        hugo_text = hugo_format(data)

        if not os.path.isdir(data["fdir"]):
            os.makedirs(data["fdir"])

        with open(data["fpath"], "wb") as fh:
            fh.write(hugo_text.encode('UTF-8'))

        print("Created: %s/%s" % (str(ltime.tm_year), data["fname"]))
# Script entry point: expects the path of the WordPress XML export as the
# first command-line argument.
if __name__ == '__main__':
    import sys

    # ``< 2`` also covers an empty argv (e.g. some embedded interpreters).
    if len(sys.argv) < 2:
        print("Usage: python wp_to_hugo.py <wordpress XML file>")
        sys.exit(1)

    convert_wp_xml(sys.argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Used this today and it took a bit of modification to my XML (replacing a few dates that were year -0001 for some reason, and adding in a missing title: <title></title> became <title>?</title>), but it otherwise works great! Thank you for posting your solution.