Skip to content

Instantly share code, notes, and snippets.

@mehori
Created April 26, 2020 04:10
Show Gist options
  • Save mehori/b347622d68af965d4d4aa17c8638039e to your computer and use it in GitHub Desktop.
Save mehori/b347622d68af965d4d4aa17c8638039e to your computer and use it in GitHub Desktop.
Converting WordPress XML to Hugo Markdown: step1
import os
import re
import xml.etree.ElementTree as et
from markdownify import markdownify as md
from shutil import copyfile
import datetime
import email.utils
import pytz
# XML namespaces declared by the WordPress export; passed to every
# ElementTree lookup so that prefixed paths such as 'wp:post_type' resolve.
ns = {
    'excerpt': 'http://wordpress.org/export/1.2/excerpt/',
    'content': 'http://purl.org/rss/1.0/modules/content/',
    'wfw': 'http://wellformedweb.org/CommentAPI/',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'wp': 'http://wordpress.org/export/1.2/',
}

# Input files read by main() and the timezone used for the "date" field.
MEDIA_XML = 'all-media.xml'
POSTS_XML = 'all-posts.xml'
OUTPUT_TIMEZONE = "Asia/Tokyo"


def _items_of_type(root, post_type):
    """Yield each <item> under <channel> whose wp:post_type equals *post_type*.

    Items without a wp:post_type element are skipped instead of raising.
    """
    for item in root.iterfind('./channel//item', ns):
        if item.findtext('.//wp:post_type', namespaces=ns) == post_type:
            yield item


def collect_attachment_urls(root):
    """Map wp:post_id -> wp:attachment_url for every attachment item.

    Args:
        root: root element of the parsed media export.

    Returns:
        dict of attachment post id (str) to attachment URL (str).
        Attachments lacking either value are left out.
    """
    urls = {}
    for item in _items_of_type(root, 'attachment'):
        post_id = item.findtext('.//wp:post_id', namespaces=ns)
        url = item.findtext('.//wp:attachment_url', namespaces=ns)
        if post_id and url:
            urls[post_id] = url
    return urls


def rfc822_to_rfc3339(pubdate, tz):
    """Convert an RFC 822 date string to an ISO 8601 / RFC 3339 string in *tz*.

    The UTC offset carried by *pubdate* is honoured; a date without a usable
    offset is assumed to be UTC.

    Args:
        pubdate: date text such as 'Sun, 26 Apr 2020 04:10:00 +0000'.
        tz: a tzinfo object for the output timezone.

    Returns:
        The timestamp formatted with second precision, e.g.
        '2020-04-26T13:10:00+09:00'.

    Raises:
        ValueError: if *pubdate* is empty or cannot be parsed.
    """
    if not pubdate:
        raise ValueError("missing pubDate")
    try:
        parsed = email.utils.parsedate_to_datetime(pubdate)
    except (TypeError, ValueError) as err:
        raise ValueError("unparseable pubDate: {0!r}".format(pubdate)) from err
    if parsed.tzinfo is None:
        # No usable offset in the input: fall back to UTC.
        parsed = parsed.replace(tzinfo=datetime.timezone.utc)
    return parsed.astimezone(tz).isoformat(timespec="seconds")


def _yaml_single_quoted(text):
    """Return *text* as a YAML single-quoted scalar (a quote is escaped by doubling)."""
    return "'" + text.replace("'", "''") + "'"


def _yaml_double_quoted(text):
    """Return *text* as a YAML double-quoted scalar with backslash and quote escaped."""
    return '"' + text.replace('\\', '\\\\').replace('"', '\\"') + '"'


def _featured_image_url(item, featured_img):
    """Return the URL of the post's featured image, or '' when there is none.

    Looks for wp:postmeta entries whose key is '_thumbnail_id' and resolves
    the value through *featured_img*. An id that is not in the mapping is
    ignored rather than raising KeyError. If several entries resolve, the
    last one wins.
    """
    url = ""
    for meta in item.iterfind('./wp:postmeta', ns):
        if meta.findtext('./wp:meta_key', namespaces=ns) == "_thumbnail_id":
            thumbnail_id = meta.findtext('./wp:meta_value', namespaces=ns)
            url = featured_img.get(thumbnail_id, url)
    return url


def _terms(item, domain):
    """Return the non-empty text of every <category> of *item* with the given domain."""
    path = './/category[@domain="{0}"]'.format(domain)
    return [c.text for c in item.iterfind(path, ns) if c.text]


def build_front_matter(item, featured_img, tz):
    """Build the front matter block for one post item.

    Args:
        item: an <item> element whose wp:post_type is 'post'.
        featured_img: mapping of attachment post id to attachment URL.
        tz: tzinfo object used for the 'date' field.

    Returns:
        The front matter as a single string that starts with '---' and ends
        with '---' followed by a newline.

    Raises:
        ValueError: if the item's pubDate is missing or unparseable.
    """
    title = item.findtext('.//title', default="", namespaces=ns)
    pubdate = rfc822_to_rfc3339(item.findtext('.//pubDate', namespaces=ns), tz)

    lines = ["---", "title: " + _yaml_single_quoted(title), "date: " + pubdate]

    featured = _featured_image_url(item, featured_img)
    if featured:
        # Only the file name of the image URL is written.
        lines.append("featured_image: " + os.path.basename(featured))

    for key, domain in (("categories", "category"), ("tags", "post_tag")):
        terms = _terms(item, domain)
        if terms:
            quoted = ','.join(_yaml_double_quoted(t) for t in terms)
            lines.append(key + ": [" + quoted + "]")

    lines += ["draft: false", "---\n"]
    return "\n".join(lines)


def main():
    """Read both export files and print the front matter of every post."""
    featured_img = collect_attachment_urls(et.parse(MEDIA_XML).getroot())
    tz = pytz.timezone(OUTPUT_TIMEZONE)
    for item in _items_of_type(et.parse(POSTS_XML).getroot(), 'post'):
        # print() appends the blank line that separates consecutive posts.
        print(build_front_matter(item, featured_img, tz))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment