Created
April 26, 2020 04:10
-
-
Save mehori/b347622d68af965d4d4aa17c8638039e to your computer and use it in GitHub Desktop.
Converting WordPress XML to Hugo Markdown: step1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime
import email.utils
import os
import re
import sys
import xml.etree.ElementTree as et
from shutil import copyfile

import pytz
from markdownify import markdownify as md
# XML namespace prefixes used in the element paths below; this mapping is
# passed to ElementTree's find()/iterfind() so paths such as 'wp:post_type'
# and 'content:encoded' resolve to the fully qualified tag names.
ns = {
    'excerpt': 'http://wordpress.org/export/1.2/excerpt/',
    'content': 'http://purl.org/rss/1.0/modules/content/',
    'wfw': 'http://wellformedweb.org/CommentAPI/',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'wp': 'http://wordpress.org/export/1.2/',
}
def _child_text(elem, path, namespaces):
    """Return the text of the first element matching *path*, or None if absent."""
    node = elem.find(path, namespaces)
    return None if node is None else node.text


def load_featured_images(root, namespaces):
    """Build a dict mapping attachment post IDs to their attachment URLs.

    Only <item> elements whose wp:post_type is "attachment" are considered.
    Items lacking either a post ID or an attachment URL are skipped instead
    of raising.
    """
    images = {}
    for item in root.iterfind('./channel//item', namespaces):
        if _child_text(item, './/wp:post_type', namespaces) != 'attachment':
            continue
        post_id = _child_text(item, './/wp:post_id', namespaces)
        url = _child_text(item, './/wp:attachment_url', namespaces)
        if post_id and url:
            images[post_id] = url
    return images


def find_featured_image(item, featured_img, namespaces):
    """Return the featured-image URL of a post item, or "" if it has none.

    The image is located through the wp:postmeta entry whose key is
    "_thumbnail_id"; its value is looked up in *featured_img*. An ID that is
    not present in *featured_img* produces a warning on stderr rather than a
    KeyError, so one dangling reference does not abort the whole export.
    """
    featured = ""
    for meta in item.iterfind('./wp:postmeta', namespaces):
        if _child_text(meta, './wp:meta_key', namespaces) != "_thumbnail_id":
            continue
        thumb_id = _child_text(meta, './wp:meta_value', namespaces)
        url = featured_img.get(thumb_id)
        if url is None:
            # stderr keeps the warning out of the front matter on stdout.
            print("warning: featured image id {0!r} not found".format(thumb_id),
                  file=sys.stderr)
            continue
        featured = url
    return featured


def pubdate_to_utc(pubdate):
    """Parse an RFC 822 date string into a timezone-aware UTC datetime.

    The UTC offset carried by the string is honoured; a string without an
    offset is treated as UTC. Returns None when the value cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(str(pubdate))
    if parsed is None:
        return None
    try:
        naive = datetime.datetime(*parsed[:6])
    except ValueError:
        # Fields parsed but do not form a valid calendar date.
        return None
    # parsed[9] is the offset east of UTC in seconds (None when absent).
    offset = datetime.timedelta(seconds=parsed[9] or 0)
    return (naive - offset).replace(tzinfo=datetime.timezone.utc)


def _yaml_single_quoted(text):
    """Wrap *text* in single quotes, doubling embedded single quotes."""
    return "'" + (text or "").replace("'", "''") + "'"


def _yaml_double_quoted(text):
    """Wrap *text* in double quotes, escaping backslashes and double quotes."""
    return '"' + text.replace('\\', '\\\\').replace('"', '\\"') + '"'


def _category_names(item, domain, namespaces):
    """Return the non-empty texts of <category> children with the given domain."""
    path = './/category[@domain="{0}"]'.format(domain)
    return [c.text for c in item.iterfind(path, namespaces) if c.text]


def build_front_matter(title, date, featured, categories, tags):
    """Return the front-matter block for one post as a single string.

    The date, featured_image, categories and tags lines are emitted only
    when the corresponding value is non-empty. The featured image is reduced
    to its base file name. The result ends with "---" and a newline, so
    print() leaves one blank line after the block.
    """
    lines = ["---", "title: " + _yaml_single_quoted(title)]
    if date:
        lines.append("date: " + date)
    if featured:
        lines.append("featured_image: " + os.path.basename(featured))
    if categories:
        quoted = ','.join(_yaml_double_quoted(c) for c in categories)
        lines.append("categories: [" + quoted + "]")
    if tags:
        quoted = ','.join(_yaml_double_quoted(t) for t in tags)
        lines.append("tags: [" + quoted + "]")
    lines.append("draft: false")
    lines.append("---\n")
    return "\n".join(lines)


def main():
    """Print front matter for every post in all-posts.xml.

    Featured images are resolved against the attachments listed in
    all-media.xml. Publication dates are converted to Asia/Tokyo time and
    printed in ISO 8601 format with second precision.
    """
    # Attachments: post ID -> attachment URL.
    media_root = et.parse('all-media.xml').getroot()
    featured_img = load_featured_images(media_root, ns)

    posts_root = et.parse('all-posts.xml').getroot()
    tokyo = pytz.timezone("Asia/Tokyo")

    # Only items whose wp:post_type is "post" produce output.
    for item in posts_root.iterfind('./channel//item', ns):
        if _child_text(item, './/wp:post_type', ns) != 'post':
            continue

        title = _child_text(item, './/title', ns)
        featured = find_featured_image(item, featured_img, ns)

        # Convert the RFC 822 pubDate to an ISO 8601 string in Tokyo time.
        utc_dt = pubdate_to_utc(_child_text(item, './/pubDate', ns))
        date = None
        if utc_dt is not None:
            date = utc_dt.astimezone(tokyo).isoformat(timespec="seconds")

        categories = _category_names(item, "category", ns)
        tags = _category_names(item, "post_tag", ns)

        print(build_front_matter(title, date, featured, categories, tags))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment