mtik00/wp_to_hugo.py

## wp_to_hugo.py
#!/usr/bin/env python2.7
"""
This script is used to convert a WordPress XML dump to Hugo-formatted posts.

NOTE: The WP post data is kept as-is (probably HTML).  It is not converted to
Markdown.  This is to reduce the amount of "fixing" one has to do after the
data is converted (e.g. line endings, links, etc).  This is generally not an
issue since Markdown allows HTML.

The post Metadata is converted to TOML.

The posts are written as: <year>/<title>.md
    where <year> is the year the post was published (in local time), and
    <title> is the WP title with each run of non-word characters replaced by a
    single "-", trailing "-" removed, and converted to lower case.
"""

# Imports ######################################################################
import os
import re
import time
import calendar
import xml.etree.ElementTree as ET
from distutils.version import LooseVersion

# Metadata #####################################################################
# Module-level attributes describing the script itself.
__author__ = "Timothy McFadden"
__creationDate__ = "07/24/2015"
__license__ = "MIT"
__version__ = "1.0.0dev"

# Globals ######################################################################
# NOTE(review): DEBUG is not referenced anywhere in the visible code.
DEBUG = False
# Exports whose WordPress version compares lower than this trigger the warning
# in wp_version_check().
KNOWN_WP_VERSION = LooseVersion("4.2")


def wp_to_hugo_date(wp_date, tz_direction=-1):
    """Converts a UTC time string from the WordPress XML to a Hugo time string.

    The timestamp is parsed as UTC, converted to the local time zone of the
    machine running the script, and rendered as ``YYYY-MM-DDTHH:MM:SS``
    followed by a signed ``HH:MM`` UTC offset.

    :param wp_date: Date text in the form ``Fri, 24 Jul 2015 12:00:00 +0000``
        (the literal ``+0000`` suffix is part of the parse format).
    :param tz_direction: Multiplier applied to the offset measured in the
        "west of UTC is positive" convention used by ``time.timezone``.  The
        default of -1 therefore yields the usual "east is positive" offset.
    :returns: Tuple of (local ``time.struct_time``, Hugo date string).
    :raises ValueError: If ``wp_date`` does not match the expected format.
    """
    utc_fields = time.strptime(wp_date, "%a, %d %b %Y %H:%M:%S +0000")
    epoch = calendar.timegm(utc_fields)
    ltime = time.localtime(epoch)
    stamp = time.strftime("%Y-%m-%dT%H:%M:%S", ltime)

    # Measure the offset from the converted local time itself.  Building it
    # from time.timezone and tm_isdst lost the minutes of zones such as
    # +05:30 and assumed that DST always shifts the clock by exactly one hour.
    seconds_west = epoch - calendar.timegm(ltime)
    signed_offset = seconds_west * tz_direction
    sign = "-" if signed_offset < 0 else "+"
    hours, minutes = divmod(abs(signed_offset) // 60, 60)

    return ltime, "%s%s%02d:%02d" % (stamp, sign, hours, minutes)


def hugo_format(data):
    """Render a post as Hugo content: TOML front matter followed by the body.

    :param data: Mapping providing ``title``, ``date`` and ``type`` (written
        as TOML strings), ``tags`` (an iterable of tag names, written as a
        TOML array of strings) and ``body`` (the post content; ``None`` is
        treated as an empty body).
    :returns: The complete file text, with lines joined by newlines.
    :raises KeyError: If ``data`` lacks one of the keys listed above.
    """
    def toml_string(value):
        # Backslashes, double quotes and control characters have to be
        # escaped inside a TOML basic string, otherwise a title such as
        # 'Say "hi"' produces front matter that does not parse.
        text = "%s" % (value,)
        for raw, escaped in (
            ("\\", "\\\\"),  # must run first so later escapes are not doubled
            ('"', '\\"'),
            ("\n", "\\n"),
            ("\r", "\\r"),
            ("\t", "\\t"),
        ):
            text = text.replace(raw, escaped)
        return '"%s"' % text

    result = ["+++"]
    for heading in ["title", "date", "type"]:
        result.append("%s = %s" % (heading, toml_string(data[heading])))

    # Quote tags the same way as the other values instead of relying on the
    # repr() of a Python list, which is only TOML by coincidence.
    result.append("tags = [%s]" % ", ".join(toml_string(tag) for tag in data["tags"]))
    result.append("+++")
    result.append("")

    # An item without content has a body of None; emit an empty body rather
    # than failing in join().
    body = data["body"]
    result.append("" if body is None else body)

    return "\n".join(result)


def wp_version_check(channel):
    """Warn when the export does not come from a known-good WordPress version.

    The version is taken from the ``?v=<version>`` part of the channel's
    ``<generator>`` text.  If no version can be found, or it compares lower
    than ``KNOWN_WP_VERSION``, a warning is printed and the script waits for
    the user to press Enter before continuing.

    :param channel: The ``<channel>`` element of the WordPress export.
    """
    # A missing or empty <generator> is handled like an unparsable one, so
    # the user gets the warning instead of an AttributeError/TypeError.
    generator = channel.find("generator")
    generator_text = ""
    if generator is not None and generator.text:
        generator_text = generator.text

    match = re.search(r"\?v=([\d\.]+)", generator_text)

    warning = None
    if not match:
        warning = "WARNING: Could not find WP version in your XML."
    else:
        wp_version = LooseVersion(match.group(1))
        if wp_version < KNOWN_WP_VERSION:
            warning = (
                "WARNING: WP version in your XML (%s) is less than "
                "known good version (%s)!" % (wp_version, KNOWN_WP_VERSION)
            )

    if warning is not None:
        print(warning)
        print("...This script may not work")
        raw_input("...press Enter to continue: ")


def convert_wp_xml(xml_path):
    """Convert every ``<item>`` of a WordPress XML export to a Hugo post file.

    Each item is written to ``<year>/<name>.md`` below the current working
    directory, where ``<year>`` is the local-time year of the item's
    ``pubDate`` and ``<name>`` is derived from its title.  An existing file
    at the same path is overwritten.

    :param xml_path: Path to the WordPress XML export file.
    :raises ValueError: If the document has no ``<channel>`` element.
    """
    tree = ET.parse(xml_path)

    # FYI: xml.etree doesn't support reading the namespaces, and I don't feel
    # like requiring lxml.
    nsmap = {
        "excerpt": "http://wordpress.org/export/1.2/excerpt/",
        "content": "http://purl.org/rss/1.0/modules/content/",
        "wfw": "http://wellformedweb.org/CommentAPI/",
        "dc": "http://purl.org/dc/elements/1.1/",
        "wp": "http://wordpress.org/export/1.2/",
    }

    channel = tree.find("channel")
    if channel is None:
        # Fail with a clear message rather than an AttributeError later on.
        raise ValueError("No <channel> element found in %s" % xml_path)
    wp_version_check(channel)

    for item in channel.findall("item"):
        # ElementTree reports a missing/empty title or body as None;
        # normalise both so file naming and formatting do not crash.
        title = item.findtext("title") or ""
        body_node = item.find("content:encoded", nsmap)
        body = body_node.text if body_node is not None else None

        ltime, date = wp_to_hugo_date(item.find("pubDate").text)
        year = str(ltime.tm_year)

        # File name: runs of non-word characters become "-", trailing "-"
        # is dropped, and the result is lower-cased.
        fname = re.sub(r"\W+", "-", title)
        fname = re.sub(r"(-+)$", "", fname).lower()
        if not fname:
            # Avoid writing a hidden ".md" file for untitled posts.
            fname = "untitled"

        fdir = os.path.abspath(os.path.join(".", year))
        data = {
            "tags": [x.attrib["nicename"] for x in item.findall("category")],
            "title": title,
            "date": date,
            "body": body or "",
            "type": "post",
            "fname": "%s.md" % fname,
            "fdir": fdir,
        }
        data["fpath"] = os.path.join(fdir, data["fname"])

        hugo_text = hugo_format(data)

        if not os.path.isdir(fdir):
            os.makedirs(fdir)

        with open(data["fpath"], "wb") as fh:
            fh.write(hugo_text.encode('UTF-8'))

        print("Created: %s/%s" % (year, data["fname"]))


if __name__ == '__main__':
    # Script entry point: the first command-line argument is the path to the
    # WordPress XML export to convert.
    import sys

    argument_count = len(sys.argv)
    if argument_count == 1:
        # No input file given: show usage and exit with a failure status.
        print("Usage:  python wp_to_hugo.py <wordpress XML file>")
        sys.exit(1)

    convert_wp_xml(sys.argv[1])
	#!/usr/bin/env python2.7
	"""
	This script is used to convert a WordPress XML dump to Hugo-formatted posts.

	NOTE: The WP post data is kept as-is (probably HTML). It is not converted to
	Markdown. This is to reduce the amount of "fixing" one has to do after the
	data is converted (e.g. line endings, links, etc). This is generally not an
	issue since Markdown allows HTML.

	The post Metadata is converted to TOML.

	The posts are written as: <year>/<title>.md
	where <year> is the year the post was written, and <title> is the WP title
	with all non-word characters replaced with "-", and converted to lower case.
	"""

	# Imports ######################################################################
	import os
	import re
	import time
	import calendar
	import xml.etree.ElementTree as ET
	from distutils.version import LooseVersion

	# Metadata #####################################################################
	# Module-level attributes describing the script itself.
	__author__ = "Timothy McFadden"
	__creationDate__ = "07/24/2015"
	__license__ = "MIT"
	__version__ = "1.0.0dev"

	# Globals ######################################################################
	# NOTE(review): DEBUG is not referenced anywhere in the visible code.
	DEBUG = False
	# Exports whose WordPress version compares lower than this trigger the warning
	# in wp_version_check().
	KNOWN_WP_VERSION = LooseVersion("4.2")


	def wp_to_hugo_date(wp_date, tz_direction=-1):
	    """Converts a UTC time string from the WordPress XML to a Hugo time string.

	    The timestamp is parsed as UTC, converted to the local time zone of the
	    machine running the script, and rendered as ``YYYY-MM-DDTHH:MM:SS``
	    followed by a signed ``HH:MM`` UTC offset.

	    :param wp_date: Date text in the form ``Fri, 24 Jul 2015 12:00:00 +0000``
	        (the literal ``+0000`` suffix is part of the parse format).
	    :param tz_direction: Multiplier applied to the offset measured in the
	        "west of UTC is positive" convention used by ``time.timezone``.  The
	        default of -1 therefore yields the usual "east is positive" offset.
	    :returns: Tuple of (local ``time.struct_time``, Hugo date string).
	    :raises ValueError: If ``wp_date`` does not match the expected format.
	    """
	    utc_fields = time.strptime(wp_date, "%a, %d %b %Y %H:%M:%S +0000")
	    epoch = calendar.timegm(utc_fields)
	    ltime = time.localtime(epoch)
	    stamp = time.strftime("%Y-%m-%dT%H:%M:%S", ltime)

	    # Measure the offset from the converted local time itself.  Building it
	    # from time.timezone and tm_isdst lost the minutes of zones such as
	    # +05:30 and assumed that DST always shifts the clock by exactly one hour.
	    seconds_west = epoch - calendar.timegm(ltime)
	    signed_offset = seconds_west * tz_direction
	    sign = "-" if signed_offset < 0 else "+"
	    hours, minutes = divmod(abs(signed_offset) // 60, 60)

	    return ltime, "%s%s%02d:%02d" % (stamp, sign, hours, minutes)


	def hugo_format(data):
	    """Render a post as Hugo content: TOML front matter followed by the body.

	    :param data: Mapping providing ``title``, ``date`` and ``type`` (written
	        as TOML strings), ``tags`` (an iterable of tag names, written as a
	        TOML array of strings) and ``body`` (the post content; ``None`` is
	        treated as an empty body).
	    :returns: The complete file text, with lines joined by newlines.
	    :raises KeyError: If ``data`` lacks one of the keys listed above.
	    """
	    def toml_string(value):
	        # Backslashes, double quotes and control characters have to be
	        # escaped inside a TOML basic string, otherwise a title such as
	        # 'Say "hi"' produces front matter that does not parse.
	        text = "%s" % (value,)
	        for raw, escaped in (
	            ("\\", "\\\\"),  # must run first so later escapes are not doubled
	            ('"', '\\"'),
	            ("\n", "\\n"),
	            ("\r", "\\r"),
	            ("\t", "\\t"),
	        ):
	            text = text.replace(raw, escaped)
	        return '"%s"' % text

	    result = ["+++"]
	    for heading in ["title", "date", "type"]:
	        result.append("%s = %s" % (heading, toml_string(data[heading])))

	    # Quote tags the same way as the other values instead of relying on the
	    # repr() of a Python list, which is only TOML by coincidence.
	    result.append("tags = [%s]" % ", ".join(toml_string(tag) for tag in data["tags"]))
	    result.append("+++")
	    result.append("")

	    # An item without content has a body of None; emit an empty body rather
	    # than failing in join().
	    body = data["body"]
	    result.append("" if body is None else body)

	    return "\n".join(result)


	def wp_version_check(channel):
	    """Warn when the export does not come from a known-good WordPress version.

	    The version is taken from the ``?v=<version>`` part of the channel's
	    ``<generator>`` text.  If no version can be found, or it compares lower
	    than ``KNOWN_WP_VERSION``, a warning is printed and the script waits for
	    the user to press Enter before continuing.

	    :param channel: The ``<channel>`` element of the WordPress export.
	    """
	    # A missing or empty <generator> is handled like an unparsable one, so
	    # the user gets the warning instead of an AttributeError/TypeError.
	    generator = channel.find("generator")
	    generator_text = ""
	    if generator is not None and generator.text:
	        generator_text = generator.text

	    match = re.search(r"\?v=([\d\.]+)", generator_text)

	    warning = None
	    if not match:
	        warning = "WARNING: Could not find WP version in your XML."
	    else:
	        wp_version = LooseVersion(match.group(1))
	        if wp_version < KNOWN_WP_VERSION:
	            warning = (
	                "WARNING: WP version in your XML (%s) is less than "
	                "known good version (%s)!" % (wp_version, KNOWN_WP_VERSION)
	            )

	    if warning is not None:
	        print(warning)
	        print("...This script may not work")
	        raw_input("...press Enter to continue: ")


	def convert_wp_xml(xml_path):
	    """Convert every ``<item>`` of a WordPress XML export to a Hugo post file.

	    Each item is written to ``<year>/<name>.md`` below the current working
	    directory, where ``<year>`` is the local-time year of the item's
	    ``pubDate`` and ``<name>`` is derived from its title.  An existing file
	    at the same path is overwritten.

	    :param xml_path: Path to the WordPress XML export file.
	    :raises ValueError: If the document has no ``<channel>`` element.
	    """
	    tree = ET.parse(xml_path)

	    # FYI: xml.etree doesn't support reading the namespaces, and I don't feel
	    # like requiring lxml.
	    nsmap = {
	        "excerpt": "http://wordpress.org/export/1.2/excerpt/",
	        "content": "http://purl.org/rss/1.0/modules/content/",
	        "wfw": "http://wellformedweb.org/CommentAPI/",
	        "dc": "http://purl.org/dc/elements/1.1/",
	        "wp": "http://wordpress.org/export/1.2/",
	    }

	    channel = tree.find("channel")
	    if channel is None:
	        # Fail with a clear message rather than an AttributeError later on.
	        raise ValueError("No <channel> element found in %s" % xml_path)
	    wp_version_check(channel)

	    for item in channel.findall("item"):
	        # ElementTree reports a missing/empty title or body as None;
	        # normalise both so file naming and formatting do not crash.
	        title = item.findtext("title") or ""
	        body_node = item.find("content:encoded", nsmap)
	        body = body_node.text if body_node is not None else None

	        ltime, date = wp_to_hugo_date(item.find("pubDate").text)
	        year = str(ltime.tm_year)

	        # File name: runs of non-word characters become "-", trailing "-"
	        # is dropped, and the result is lower-cased.
	        fname = re.sub(r"\W+", "-", title)
	        fname = re.sub(r"(-+)$", "", fname).lower()
	        if not fname:
	            # Avoid writing a hidden ".md" file for untitled posts.
	            fname = "untitled"

	        fdir = os.path.abspath(os.path.join(".", year))
	        data = {
	            "tags": [x.attrib["nicename"] for x in item.findall("category")],
	            "title": title,
	            "date": date,
	            "body": body or "",
	            "type": "post",
	            "fname": "%s.md" % fname,
	            "fdir": fdir,
	        }
	        data["fpath"] = os.path.join(fdir, data["fname"])

	        hugo_text = hugo_format(data)

	        if not os.path.isdir(fdir):
	            os.makedirs(fdir)

	        with open(data["fpath"], "wb") as fh:
	            fh.write(hugo_text.encode('UTF-8'))

	        print("Created: %s/%s" % (year, data["fname"]))


	if __name__ == '__main__':
	    # Script entry point: the first command-line argument is the path to the
	    # WordPress XML export to convert.
	    import sys

	    if len(sys.argv) == 1:
	        # No input file given: show usage and exit with a failure status.
	        print("Usage: python wp_to_hugo.py <wordpress XML file>")
	        sys.exit(1)

	    convert_wp_xml(sys.argv[1])