# mehori/wp-xml2hugo.py

## wp-xml2hugo.py
import os
import re
import xml.etree.ElementTree as et
from markdownify import markdownify as md
from shutil import copyfile
import datetime
import email.utils
import pytz

# XML namespace prefixes of the WordPress export format; this mapping is
# handed to every ElementTree find()/iterfind() call in this script so the
# prefixed tag names (wp:..., content:...) can be resolved.
ns = dict(
	excerpt='http://wordpress.org/export/1.2/excerpt/',
	content='http://purl.org/rss/1.0/modules/content/',
	wfw='http://wellformedweb.org/CommentAPI/',
	dc='http://purl.org/dc/elements/1.1/',
	wp='http://wordpress.org/export/1.2/',
)

# Parse the WordPress export that contains the media library.
tree = et.parse('all-media.xml')
root = tree.getroot()

# Build featured_img: attachment post id -> attachment URL, one entry for
# every item whose wp:post_type is "attachment". The post loop later looks
# up a post's _thumbnail_id in this dict.
featured_img = {}
for i in root.iterfind('./channel//item', ns):
	# findtext() returns the default instead of raising when the child
	# element is absent, so an incomplete item cannot abort the run.
	if i.findtext('.//wp:post_type', default='', namespaces=ns) != 'attachment':
		continue

	attachment_id = i.findtext('.//wp:post_id', default='', namespaces=ns)
	attachment_url = i.findtext('.//wp:attachment_url', default='', namespaces=ns)

	# An attachment without an id can never be referenced by a post.
	# A missing URL is stored as an empty string, which the consumer
	# treats as "no featured image".
	if attachment_id:
		featured_img[attachment_id] = attachment_url or ""

# Parse the WordPress export that contains the posts.
tree = et.parse('all-posts.xml')
root = tree.getroot()

# Every generated file is written below this directory.
odir = "./out/"

# Publication dates are written to the front matter in this time zone.
tokyo = pytz.timezone("Asia/Tokyo")

# HTML comments are removed before the Markdown conversion; DOTALL lets a
# comment that spans several lines match as well.
html_comment = re.compile(r"<!--.*?-->", re.DOTALL)

# Markdown image that points into this site's uploads folder.
#   group 1: alt text, group 2: path below wp-content/uploads/
# The match ends at the end of the URL, so an optional image title and the
# closing parenthesis stay untouched. The negated character classes keep one
# match from swallowing a second image on the same line.
upload_img = re.compile(r'!\[([^\]]*)]\(https?://mehori\.com/wp-content/uploads/([^)\s]+)')


def _yaml_dq(value):
	"""Return value as a YAML double-quoted scalar (backslash and quote escaped)."""
	return '"' + value.replace('\\', '\\\\').replace('"', '\\"') + '"'


# Convert every item whose wp:post_type is "post" into
# <odir>/<year>/<post_name>/index.md (YAML front matter + Markdown body) and
# copy the featured image and the images used in the body next to it.
for i in root.iterfind('./channel//item', ns):
	if i.findtext('.//wp:post_type', default='', namespaces=ns) != 'post':
		continue

	# findtext() yields '' instead of None for absent or empty elements, so
	# the string operations below cannot fail on an incomplete item.
	post_id   = i.findtext('.//wp:post_id', default='', namespaces=ns)
	title     = i.findtext('.//title', default='', namespaces=ns)
	content   = i.findtext('.//content:encoded', default='', namespaces=ns)
	raw_date  = i.findtext('.//pubDate', default='', namespaces=ns)
	# Fall back to the post id so posts without a slug do not all end up in
	# the same directory.
	post_name = i.findtext('.//wp:post_name', default='', namespaces=ns) or post_id

	# Find the wp:postmeta with key _thumbnail_id and resolve it to the URL
	# of the featured image; an id that is not in the media export is
	# reported instead of raising KeyError.
	featured = ""
	for m in i.iterfind('./wp:postmeta', ns):
		if m.findtext('./wp:meta_key', default='', namespaces=ns) != "_thumbnail_id":
			continue
		thumb_id = m.findtext('./wp:meta_value', default='', namespaces=ns)
		featured = featured_img.get(thumb_id, "")
		if not featured:
			print("unknown featured image id:", thumb_id, " at ", post_name)

	# Convert the RFC 822 pubDate to an RFC 3339 timestamp in Asia/Tokyo.
	date = email.utils.parsedate_tz(raw_date)
	if date is None:
		print("skipping post with unparsable pubDate:", post_name)
		continue
	try:
		# date[9] is the UTC offset in seconds; subtracting it gives UTC.
		dt = datetime.datetime(*date[:6]) - datetime.timedelta(seconds=date[9] or 0)
	except ValueError:
		print("skipping post with out-of-range pubDate:", post_name)
		continue
	# The output directory uses the year exactly as written in pubDate.
	year = str(date[0])
	pubdate = pytz.utc.localize(dt).astimezone(tokyo).isoformat(timespec="seconds")

	# Collect categories and tags, ignoring elements without text.
	cat_list = [c.text for c in i.iterfind('.//category[@domain="category"]', ns) if c.text]
	tag_list = [c.text for c in i.iterfind('.//category[@domain="post_tag"]', ns) if c.text]

	# Create <odir>/<year>/<post_name>, including <odir> itself if needed.
	post_dir = os.path.join(odir, year, post_name)
	os.makedirs(post_dir, exist_ok=True)

	# Front matter. Inside a YAML single-quoted scalar a quote is escaped
	# by doubling it.
	front = ["---", "title: '" + title.replace("'", "''") + "'", "date: " + pubdate]

	if featured:
		# Copy the featured image from the local uploads/ mirror, keeping
		# whatever sub-path follows wp-content/uploads/ in its URL.
		bn = os.path.basename(featured)
		_, found, rel = featured.partition('/wp-content/uploads/')
		src_file = 'uploads/' + rel if found else featured
		if found and os.path.exists(src_file):
			copyfile(src_file, os.path.join(post_dir, bn))
			front.append("featured_image: " + bn)
		else:
			print("no featured file:", src_file)

	if cat_list:
		front.append("categories: [" + ','.join(_yaml_dq(x) for x in cat_list) + "]")
	if tag_list:
		front.append("tags: [" + ','.join(_yaml_dq(x) for x in tag_list) + "]")
	front.append("draft: false")
	front.append("---")
	front.append("")  # blank line between the front matter and the body

	# Remove HTML comments, then convert the body to Markdown.
	content = html_comment.sub("", content)
	content = str(md(content, heading_style="ATX"))

	# Copy every uploaded image referenced in the body into the post directory.
	for found_img in upload_img.finditer(content):
		rel = found_img.group(2)
		src_file = 'uploads/' + rel
		if os.path.exists(src_file):
			copyfile(src_file, os.path.join(post_dir, os.path.basename(rel)))
		else:
			print("no file within post:", src_file, " at ", post_name)

	# Point the image links at the copies, i.e. at the bare file name.
	content = upload_img.sub(
		lambda hit: '![' + hit.group(1) + '](' + os.path.basename(hit.group(2)),
		content)

	# Write explicitly as UTF-8 so the result does not depend on the locale.
	with open(os.path.join(post_dir, "index.md"), 'w', encoding="utf-8") as f:
		print("\n".join(front), file=f)
		print(content, file=f)
	# NOTE(review): the statements below repeat the script above statement for
	# statement, but every line sits at a single tab of indentation. The bodies
	# of the for/if/with/else statements are therefore no longer nested under
	# their headers and Python rejects the file with an IndentationError.
	# This looks like a whitespace-mangled second copy of the same script
	# (TODO confirm) and should probably be removed rather than repaired.
	import os
	import re
	import xml.etree.ElementTree as et
	from markdownify import markdownify as md
	from shutil import copyfile
	import datetime
	import email.utils
	import pytz

	# define namespace
	ns = { 'excerpt': 'http://wordpress.org/export/1.2/excerpt/',
	'content': 'http://purl.org/rss/1.0/modules/content/',
	'wfw': 'http://wellformedweb.org/CommentAPI/',
	'dc': 'http://purl.org/dc/elements/1.1/',
	'wp': 'http://wordpress.org/export/1.2/' }

	# parse xml
	tree = et.parse('all-media.xml')
	root = tree.getroot()

	# find item where wp:post_type is "attachment"
	featured_img = {}
	for i in root.iterfind('./channel//item',ns):
	type = i.find('.//wp:post_type',ns).text

	# find and make a dict of all featured images
	if (type == 'attachment'):
	id = i.find('.//wp:post_id',ns).text
	url = i.find('.//wp:attachment_url',ns).text
	featured_img[id] = url

	# parse xml
	tree = et.parse('all-posts.xml')
	root = tree.getroot()
	# find item where wp:post_type is "attachment"
	for i in root.iterfind('./channel//item',ns):
	type = i.find('.//wp:post_type',ns).text

	if (type == 'post'):
	id = i.find('.//wp:post_id',ns).text
	title = i.find('.//title',ns).text
	content = i.find('.//content:encoded',ns).text
	pubdate = i.find('.//pubDate',ns).text
	post_name = i.find('.//wp:post_name',ns).text

	# find wp:postmeta with key _thumbnail_id, and get the url of featured image
	featured = ""
	for m in i.iterfind('./wp:postmeta',ns):
	meta_key = m.find('./wp:meta_key',ns)
	meta_val = m.find('./wp:meta_value',ns)

	if (meta_key.text == "_thumbnail_id"):
	featured = featured_img[meta_val.text]

	# convert time from RFC822 to RFC3339 iso format
	date = email.utils.parsedate_tz(str(pubdate))
	year = str(date[0])
	dt = datetime.datetime( date[0], date[1], date[2], date[3], date[4], date[5] )
	dt = pytz.utc.localize(dt).astimezone(pytz.timezone("Asia/Tokyo"))
	pubdate = dt.isoformat(timespec="seconds")

	# get all category
	cat_list = []
	for c in i.iterfind('.//category[@domain="category"]',ns):
	cat_list.append(c.text)

	# get all tag
	tag_list = []
	for c in i.iterfind('.//category[@domain="post_tag"]',ns):
	tag_list.append(c.text)

	# mkdir: year
	odir = "./out/"
	if not os.path.isdir(odir+year):
	os.mkdir('./out/'+year)

	# mkdir each post
	if not os.path.isdir(odir+year+"/"+str(post_name)):
	os.mkdir('./out/'+year+"/"+str(post_name))

	with open('./out/'+ year + "/" + str(post_name)+"/index.md", 'w') as f:
	# print frontmatter
	print("---",file=f)
	print("title: \'" + title + "\'",file=f)
	print("date: " + pubdate,file=f)

	if featured:
	# copy featured file
	bn = os.path.basename(featured)
	s = featured.split('/')
	src_file = 'uploads/'+s[5]+'/'+s[6]+'/'+bn
	dst_file = './out/'+ year + "/" + str(post_name) + "/" + bn
	if (os.path.exists(src_file)):
	copyfile(src_file,dst_file)
	print("featured_image: " + bn, file=f)
	else:
	print("no featured file:",src_file)

	if cat_list:
	print("categories: ["+','.join('"{0}"'.format(x) for x in cat_list)+"]", file=f)
	if tag_list:
	print("tags: ["+','.join('"{0}"'.format(x) for x in tag_list)+"]", file=f)
	print("draft: false", file=f)
	print("---\n", file=f)

	# remove comment, convert to Markdown
	content = re.sub("(<!--.*?-->)", "", content)
	content = md(content, heading_style="ATX")

	# find image within content
	# NOTE(review): this pattern differs from the first copy of this statement
	# earlier in the file: ".?" (at most one character) stands where the first
	# copy has ".*?", and "\|" (a literal pipe character) stands where it has
	# the alternation "|", so this version cannot match an ordinary image URL.
	match = re.findall(r'!\[.?]\((?:http\|https)://mehori\.com/wp-content/uploads/(.?)\)',str(content))
	for m in match:
	src_file = 'uploads/'+m
	bn = os.path.basename(m)
	dst_file = './out/'+ year + "/" + str(post_name)+'/'+bn

	if (os.path.exists(src_file)):
	copyfile(src_file,dst_file)
	else:
	print("no file within post:",src_file," at ",post_name)

	# replace all image markdown url
	# NOTE(review): same kind of difference from the first copy as above:
	# ".?" instead of ".*?", "\|" instead of "|", and "./" (exactly one
	# character before the slash) instead of ".*/".
	content = re.sub(r'!\[(.?)]\((?:http\|https)://mehori\.com/wp-content/uploads/./(.*?)\)','![\\1](\\2)',str(content))
	print(content,file=f)

	# NOTE(review): featured and pubdate are assigned again before use in the
	# statements above, so these deletions do not appear to be needed.
	if 'featured' in locals():
	del featured
	if 'pubdate' in locals():
	del pubdate