Skip to content

Instantly share code, notes, and snippets.

@mehori
Created April 29, 2020 13:58
Show Gist options
  • Save mehori/a9f9d15a80e14fa6f01514ec825765ea to your computer and use it in GitHub Desktop.
Save mehori/a9f9d15a80e14fa6f01514ec825765ea to your computer and use it in GitHub Desktop.
import os
import re
import xml.etree.ElementTree as et
from markdownify import markdownify as md
from shutil import copyfile
import datetime
import email.utils
import pytz
# XML namespace prefixes used by the WordPress export (WXR) files.
# Passed to every ElementTree find()/iterfind() call so that paths such as
# 'wp:post_type' or 'content:encoded' resolve to the right namespace URI.
ns = {
    'excerpt': 'http://wordpress.org/export/1.2/excerpt/',
    'content': 'http://purl.org/rss/1.0/modules/content/',
    'wfw': 'http://wellformedweb.org/CommentAPI/',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'wp': 'http://wordpress.org/export/1.2/',
}
# Parse the media export and build featured_img: a dict mapping the post ID
# of every attachment item to that attachment's URL. The post loop later
# resolves a post's _thumbnail_id meta value through this dict.
tree = et.parse('all-media.xml')
root = tree.getroot()
featured_img = {}
for item in root.iterfind('./channel//item', ns):
    # findtext() returns None when the element is absent, so an item that
    # lacks wp:post_type is skipped instead of raising AttributeError.
    if item.findtext('.//wp:post_type', namespaces=ns) != 'attachment':
        continue
    attachment_id = item.findtext('.//wp:post_id', namespaces=ns)
    attachment_url = item.findtext('.//wp:attachment_url', namespaces=ns)
    featured_img[attachment_id] = attachment_url
# Convert every item of type "post" in all-posts.xml into a page bundle:
#   ./out/<year>/<post_name>/index.md   (YAML front matter + Markdown body)
# The featured image and every inline image whose URL points below
# http(s)://mehori.com/wp-content/uploads/ are copied from the local
# uploads/ directory into the same bundle, and the inline image URLs are
# rewritten to the bare file name.

# HTML comments may span several lines, so "." has to match newlines too.
_HTML_COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)

# Markdown image pointing below wp-content/uploads/.
# group(1) is the alt text, group(2) the path below uploads/.
# "[^)\n]*" cannot run past the closing parenthesis, so several images on
# one line are matched one by one instead of being merged into a single match.
_UPLOAD_IMAGE_RE = re.compile(
    r'!\[(.*?)]\(https?://mehori\.com/wp-content/uploads/([^)\n]*)\)')


def _yaml_single_quoted(text):
    """Return *text* as a YAML single-quoted scalar.

    Inside single quotes YAML escapes a literal quote by doubling it.
    None is written as an empty string.
    """
    return "'" + (text or "").replace("'", "''") + "'"


def _yaml_flow_list(values):
    """Return *values* as a YAML flow sequence of double-quoted scalars."""
    quoted = (
        '"{0}"'.format(v.replace('\\', '\\\\').replace('"', '\\"'))
        for v in values
    )
    return "[" + ",".join(quoted) + "]"


tree = et.parse('all-posts.xml')
root = tree.getroot()
odir = "./out/"
tokyo = pytz.timezone("Asia/Tokyo")

for item in root.iterfind('./channel//item', ns):
    # findtext() yields None for a missing element instead of raising.
    if item.findtext('.//wp:post_type', namespaces=ns) != 'post':
        continue

    post_id = item.findtext('.//wp:post_id', namespaces=ns)
    title = item.findtext('.//title', namespaces=ns)
    content = item.findtext('.//content:encoded', namespaces=ns) or ""
    pubdate = item.findtext('.//pubDate', namespaces=ns)
    # Posts without a slug would otherwise all be written to the same
    # directory; fall back to a name derived from the post ID.
    post_name = (item.findtext('.//wp:post_name', namespaces=ns)
                 or "post-" + str(post_id))

    # Look up the featured image URL through the _thumbnail_id post meta.
    featured = ""
    for meta in item.iterfind('./wp:postmeta', ns):
        if meta.findtext('./wp:meta_key', namespaces=ns) != "_thumbnail_id":
            continue
        thumbnail_id = meta.findtext('./wp:meta_value', namespaces=ns)
        # The attachment may be missing from the media export.
        featured = featured_img.get(thumbnail_id) or ""
        if not featured:
            print("no featured url for thumbnail id:", thumbnail_id,
                  " at ", post_name)

    # Convert the RFC 822 pubDate into an ISO 8601 timestamp in Tokyo time.
    parsed = email.utils.parsedate_tz(str(pubdate))
    try:
        naive = datetime.datetime(*parsed[:6])
    except (TypeError, ValueError):
        # parsedate_tz() returns None for unparsable input, and datetime()
        # rejects out-of-range fields.
        print("skipping post with invalid pubDate:", pubdate,
              " at ", post_name)
        continue
    # NOTE(review): the directory year is taken from the pubDate fields as
    # written, not from the Tokyo-converted date below; confirm that this
    # is the intended layout for posts published around New Year.
    year = str(parsed[0])
    # parsed[9] is the UTC offset in seconds (None when absent, which is
    # treated as UTC).
    utc_offset = datetime.timedelta(seconds=parsed[9] or 0)
    dt = pytz.utc.localize(naive) - utc_offset
    pubdate = dt.astimezone(tokyo).isoformat(timespec="seconds")

    # Collect category and tag names, ignoring empty elements.
    cat_list = [c.text
                for c in item.iterfind('.//category[@domain="category"]', ns)
                if c.text]
    tag_list = [c.text
                for c in item.iterfind('.//category[@domain="post_tag"]', ns)
                if c.text]

    # Create ./out/<year>/<post_name>/ including any missing parents.
    post_dir = os.path.join(odir, year, post_name)
    os.makedirs(post_dir, exist_ok=True)

    # Explicit UTF-8 so the output does not depend on the platform default.
    with open(os.path.join(post_dir, "index.md"), 'w', encoding="utf-8") as f:
        # Front matter.
        print("---", file=f)
        print("title: " + _yaml_single_quoted(title), file=f)
        print("date: " + pubdate, file=f)
        if featured:
            # Copy the featured file next to index.md.
            bn = os.path.basename(featured)
            parts = featured.split('/')
            # Assumes URLs shaped like scheme://host/wp-content/uploads/YYYY/MM/file,
            # where parts[5] and parts[6] are the year and month directories.
            if len(parts) > 6:
                src_file = 'uploads/' + parts[5] + '/' + parts[6] + '/' + bn
            else:
                src_file = ""
            if src_file and os.path.isfile(src_file):
                copyfile(src_file, os.path.join(post_dir, bn))
                print("featured_image: " + bn, file=f)
            else:
                print("no featured file:", src_file or featured)
        if cat_list:
            print("categories: " + _yaml_flow_list(cat_list), file=f)
        if tag_list:
            print("tags: " + _yaml_flow_list(tag_list), file=f)
        print("draft: false", file=f)
        print("---\n", file=f)

        # Strip HTML comments, then convert the HTML body to Markdown.
        content = _HTML_COMMENT_RE.sub("", content)
        content = str(md(content, heading_style="ATX"))

        # Copy every inline image referenced from the uploads directory.
        for _alt, rel_path in _UPLOAD_IMAGE_RE.findall(content):
            src_file = 'uploads/' + rel_path
            bn = os.path.basename(rel_path)
            if os.path.isfile(src_file):
                copyfile(src_file, os.path.join(post_dir, bn))
            else:
                print("no file within post:", src_file, " at ", post_name)

        # Point each of those image links at the copied file's bare name.
        content = _UPLOAD_IMAGE_RE.sub(
            lambda m: '![{0}]({1})'.format(
                m.group(1), os.path.basename(m.group(2))),
            content)
        print(content, file=f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment