Created
April 29, 2020 13:58
-
-
Save mehori/a9f9d15a80e14fa6f01514ec825765ea to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime
import email.utils
import json
import os
import re
import xml.etree.ElementTree as et
from shutil import copyfile

import pytz
from markdownify import markdownify as md
# XML namespace prefix -> URI map. It is passed to every ElementTree
# find/iterfind call below so prefixed paths such as "wp:post_type" resolve.
ns = {
    'excerpt': 'http://wordpress.org/export/1.2/excerpt/',
    'content': 'http://purl.org/rss/1.0/modules/content/',
    'wfw': 'http://wellformedweb.org/CommentAPI/',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'wp': 'http://wordpress.org/export/1.2/',
}
# Parse the media export and build a lookup of attachment post id ->
# attachment URL. The post loop later resolves each post's "_thumbnail_id"
# meta value through this dict to find its featured image.
tree = et.parse('all-media.xml')
root = tree.getroot()

featured_img = {}
for media_item in root.iterfind('./channel//item', ns):
    # findtext() returns None for a missing element instead of raising
    # AttributeError the way find(...).text does.
    if media_item.findtext('.//wp:post_type', namespaces=ns) != 'attachment':
        continue
    attachment_id = media_item.findtext('.//wp:post_id', namespaces=ns)
    attachment_url = media_item.findtext('.//wp:attachment_url', namespaces=ns)
    # An attachment without an id or URL cannot be looked up or copied later.
    if attachment_id and attachment_url:
        featured_img[attachment_id] = attachment_url
# Markdown image whose URL points below this site's wp-content/uploads/.
# Group 1 is the alt text, group 2 the path below uploads/. The path group
# excludes ")" so two images on the same line are matched separately.
_UPLOAD_IMAGE_RE = re.compile(
    r'!\[(.*?)]\((?:http|https)://mehori\.com/wp-content/uploads/([^)\n]*)\)'
)
_UPLOADS_MARKER = '/wp-content/uploads/'
_OUT_DIR = './out'


def _yaml_single_quoted(text):
    """Return *text* as a YAML single-quoted scalar.

    Inside single quotes YAML escapes a quote by doubling it, so a title such
    as ``It's`` no longer terminates the scalar early. ``None`` becomes ``''``.
    """
    return "'" + (text or "").replace("'", "''") + "'"


def _yaml_flow_list(values):
    """Return *values* as a YAML flow sequence of double-quoted strings.

    json.dumps escapes embedded quotes and backslashes; for plain strings the
    output is the same ``["a","b"]`` shape the script has always written.
    """
    return "[" + ",".join(json.dumps(v, ensure_ascii=False) for v in values) + "]"


def _copy_upload(src_file, dst_file):
    """Copy *src_file* to *dst_file* if it exists; return True when copied."""
    if os.path.exists(src_file):
        copyfile(src_file, dst_file)
        return True
    return False


def _featured_url(item):
    """Return the featured-image URL for a post item, or "" when there is none.

    Looks for wp:postmeta entries whose key is "_thumbnail_id" and resolves the
    value through ``featured_img``. As before, the last such entry wins.
    """
    url = ""
    for meta in item.iterfind('./wp:postmeta', ns):
        if meta.findtext('./wp:meta_key', namespaces=ns) != "_thumbnail_id":
            continue
        thumbnail_id = meta.findtext('./wp:meta_value', namespaces=ns)
        # .get(): a thumbnail id missing from the media export must not abort
        # the whole run with KeyError.
        url = featured_img.get(thumbnail_id, "")
        if not url:
            print("no featured image url for id:", thumbnail_id)
    return url


def _relink_image(match):
    """Rewrite a matched upload image so it points at the bare file name."""
    return "![{0}]({1})".format(match.group(1), os.path.basename(match.group(2)))


def _export_post(item):
    """Write one post item to ./out/<year>/<slug>/index.md with its images.

    The front matter holds title, date, optional featured_image, categories,
    tags and ``draft: false``; the body is the post content converted to
    Markdown. Image files are copied from the local ``uploads/`` tree into the
    post directory when they exist there.
    """
    title = item.findtext('.//title', namespaces=ns)
    content = item.findtext('.//content:encoded', namespaces=ns)
    raw_date = item.findtext('.//pubDate', namespaces=ns)
    post_name = item.findtext('.//wp:post_name', namespaces=ns)
    post_id = item.findtext('.//wp:post_id', namespaces=ns)
    # Posts without a slug would otherwise all share one directory and
    # overwrite each other; fall back to the post id.
    slug = post_name or post_id or "untitled"

    # Convert the RFC 822 pubDate to an ISO 8601 timestamp in Asia/Tokyo,
    # honouring the offset given in the date string.
    try:
        published = email.utils.parsedate_to_datetime(raw_date or "")
    except (TypeError, ValueError):
        print("skipping post with unparseable pubDate:", raw_date, " at ", slug)
        return
    if published.tzinfo is None:
        # No usable offset in the string: treat the clock time as UTC.
        published = pytz.utc.localize(published)
    # The directory year comes from the date as written in the export (before
    # conversion to Tokyo time), which keeps the existing output layout.
    year = str(published.year)
    pubdate = published.astimezone(pytz.timezone("Asia/Tokyo")).isoformat(timespec="seconds")

    cat_list = [c.text for c in item.iterfind('.//category[@domain="category"]', ns) if c.text]
    tag_list = [c.text for c in item.iterfind('.//category[@domain="post_tag"]', ns) if c.text]
    featured = _featured_url(item)

    # makedirs creates ./out and the year directory as needed.
    post_dir = os.path.join(_OUT_DIR, year, slug)
    os.makedirs(post_dir, exist_ok=True)

    # Explicit UTF-8 so the output does not depend on the platform default.
    with open(os.path.join(post_dir, "index.md"), 'w', encoding="utf-8") as f:
        # front matter
        print("---", file=f)
        print("title: " + _yaml_single_quoted(title), file=f)
        print("date: " + pubdate, file=f)
        if featured:
            bn = os.path.basename(featured)
            # Resolve the local file from the path below wp-content/uploads/,
            # the same way images inside the post body are resolved.
            _, marker, rel_path = featured.partition(_UPLOADS_MARKER)
            src_file = 'uploads/' + rel_path if marker else featured
            if marker and _copy_upload(src_file, os.path.join(post_dir, bn)):
                print("featured_image: " + bn, file=f)
            else:
                print("no featured file:", src_file)
        if cat_list:
            print("categories: " + _yaml_flow_list(cat_list), file=f)
        if tag_list:
            print("tags: " + _yaml_flow_list(tag_list), file=f)
        print("draft: false", file=f)
        print("---\n", file=f)

        # Strip HTML comments (DOTALL so multi-line comments go too), then
        # convert the remaining HTML to Markdown.
        body = re.sub(r"<!--.*?-->", "", content or "", flags=re.DOTALL)
        body = str(md(body, heading_style="ATX"))

        # Copy every image referenced from this site's uploads/ tree.
        for image in _UPLOAD_IMAGE_RE.finditer(body):
            rel_path = image.group(2)
            src_file = 'uploads/' + rel_path
            dst_file = os.path.join(post_dir, os.path.basename(rel_path))
            if not _copy_upload(src_file, dst_file):
                print("no file within post:", src_file, " at ", slug)

        # Point those image links at the copies next to index.md.
        body = _UPLOAD_IMAGE_RE.sub(_relink_image, body)
        print(body, file=f)


# Parse the posts export and write out every item whose wp:post_type is "post".
tree = et.parse('all-posts.xml')
root = tree.getroot()
for post_item in root.iterfind('./channel//item', ns):
    if post_item.findtext('.//wp:post_type', namespaces=ns) == 'post':
        _export_post(post_item)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment