# mehori/wp-xml2hugo-step1.py

## wp-xml2hugo-step1.py
import os
import re
import xml.etree.ElementTree as et
from markdownify import markdownify as md
from shutil import copyfile
import datetime
import email.utils
import pytz

# XML namespace prefixes used by the WordPress export (WXR 1.2) format.
ns = {
    'excerpt': 'http://wordpress.org/export/1.2/excerpt/',
    'content': 'http://purl.org/rss/1.0/modules/content/',
    'wfw':     'http://wellformedweb.org/CommentAPI/',
    'dc':      'http://purl.org/dc/elements/1.1/',
    'wp':      'http://wordpress.org/export/1.2/',
}


def _warn(message):
    """Write a diagnostic to stderr so it never mixes with the front matter on stdout."""
    import sys  # local import: the file-level import block does not provide sys
    print("warning: " + message, file=sys.stderr)


def collect_attachments(root):
    """Return a dict mapping attachment post id -> attachment URL.

    root is the root element of a parsed WordPress export.  Items whose
    wp:post_type is not "attachment", or that lack an id or a URL, are skipped
    instead of raising.
    """
    attachments = {}
    for item in root.iterfind('./channel//item', ns):
        if item.findtext('.//wp:post_type', default='', namespaces=ns) != 'attachment':
            continue
        post_id = item.findtext('.//wp:post_id', default='', namespaces=ns)
        url = item.findtext('.//wp:attachment_url', default='', namespaces=ns)
        if post_id and url:
            attachments[post_id] = url
    return attachments


def find_featured_image(item, attachments):
    """Return the file name of the post's featured image, or "" if it has none.

    The featured image is referenced by a wp:postmeta entry whose key is
    "_thumbnail_id"; its value is looked up in attachments (id -> URL).
    An id that is not present in attachments produces a warning instead of
    a KeyError.  If several entries resolve, the last one wins.
    """
    featured_url = ""
    for meta in item.iterfind('./wp:postmeta', ns):
        if meta.findtext('./wp:meta_key', default='', namespaces=ns) != "_thumbnail_id":
            continue
        thumb_id = meta.findtext('./wp:meta_value', default='', namespaces=ns)
        url = attachments.get(thumb_id)
        if url is None:
            _warn("featured image id {0!r} not found in media export".format(thumb_id))
            continue
        featured_url = url
    return os.path.basename(featured_url)


def convert_pubdate(pubdate, tz=None):
    """Convert an RFC 822 date string to an RFC 3339 / ISO 8601 string.

    pubdate -- text of the <pubDate> element (may be None).
    tz      -- target tzinfo; defaults to pytz's "Asia/Tokyo".

    The UTC offset contained in pubdate is honoured; a date without usable
    offset information ("-0000") is taken to be UTC.  Returns "" when
    pubdate is missing or cannot be parsed.
    """
    try:
        parsed = email.utils.parsedate_to_datetime(pubdate)
    except (TypeError, ValueError):
        # TypeError on older Pythons, ValueError on newer ones
        return ""
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=datetime.timezone.utc)
    if tz is None:
        tz = pytz.timezone("Asia/Tokyo")
    return parsed.astimezone(tz).isoformat(timespec="seconds")


def yaml_single_quoted(value):
    """Return value as a YAML single-quoted scalar (' is escaped by doubling)."""
    return "'" + (value or "").replace("'", "''") + "'"


def yaml_flow_list(values):
    """Return values as a YAML flow sequence of double-quoted scalars."""
    quoted = (
        '"{0}"'.format(v.replace('\\', '\\\\').replace('"', '\\"'))
        for v in values
    )
    return "[" + ','.join(quoted) + "]"


def collect_terms(item, domain):
    """Return the non-empty texts of the item's <category domain=...> elements."""
    path = './/category[@domain="{0}"]'.format(domain)
    return [c.text for c in item.iterfind(path, ns) if c.text]


def render_post(item, attachments, tz=None):
    """Return the Hugo front matter block for one post <item> as a string.

    The string ends with "---" followed by a newline, so that print() emits
    the block followed by one blank line.
    """
    title = item.findtext('.//title', default='', namespaces=ns)
    pubdate = convert_pubdate(item.findtext('.//pubDate', namespaces=ns), tz)
    featured = find_featured_image(item, attachments)
    cat_list = collect_terms(item, "category")
    tag_list = collect_terms(item, "post_tag")

    lines = ["---", "title: " + yaml_single_quoted(title)]
    if pubdate:
        lines.append("date: " + pubdate)
    else:
        _warn("post {0!r} has no parseable pubDate; date omitted".format(title))
    if featured:
        lines.append("featured_image: " + featured)
    if cat_list:
        lines.append("categories: " + yaml_flow_list(cat_list))
    if tag_list:
        lines.append("tags: " + yaml_flow_list(tag_list))
    lines.append("draft: false")
    lines.append("---\n")
    return "\n".join(lines)


def main(media_path='all-media.xml', posts_path='all-posts.xml'):
    """Print the front matter of every post in posts_path to stdout.

    media_path is the WordPress export holding the attachments; it is used
    to resolve each post's featured image id to a file name.
    """
    attachments = collect_attachments(et.parse(media_path).getroot())

    posts_root = et.parse(posts_path).getroot()
    for item in posts_root.iterfind('./channel//item', ns):
        # only real posts get front matter; pages, attachments etc. are skipped
        if item.findtext('.//wp:post_type', default='', namespaces=ns) != 'post':
            continue
        print(render_post(item, attachments))


if __name__ == "__main__":
    main()