Skip to content

Instantly share code, notes, and snippets.

@trentrichardson
Created November 2, 2018 12:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save trentrichardson/719177f60204e482c385eb273d3bc6cf to your computer and use it in GitHub Desktop.
Save trentrichardson/719177f60204e482c385eb273d3bc6cf to your computer and use it in GitHub Desktop.
Wordpress xml export to Pelican
import sys
import os
import io
import time
import datetime
import dateutil.parser
import re
# for xml parsing
from bs4 import BeautifulSoup
# for converting to markdown
import html2text
# for downloading attachments
import wget
# Patterns for the WordPress shortcode forms used to mark up code samples.
# Each pattern captures the enclosed code as group 1; re.DOTALL lets the
# non-greedy capture span multiple lines. gen_markdown rewrites every match
# to a <pre> element before the HTML-to-Markdown conversion.
# Form: [sourcecode language="..."] ... [/sourcecode]
coderegex1 = re.compile(r'\[sourcecode language=\"[a-zA-Z0-9]*\"\](.*?)\[\/sourcecode\]', re.DOTALL)
# Form: [code language="..."] ... [/code]
coderegex2 = re.compile(r'\[code language=\"[a-zA-Z0-9]*\"\](.*?)\[\/code\]', re.DOTALL)
# Form: [code lang=...] ... [/code]  (language name is not quoted)
coderegex3 = re.compile(r'\[code lang=[a-zA-Z0-9]*\](.*?)\[\/code\]', re.DOTALL)
# Matches the scheme, host and /wp-content/uploads/ prefix of an absolute
# upload URL (e.g. http://www.example.com/wp-content/uploads/); parse_doc
# replaces that prefix with '{attach}images/'.
uploadregex = re.compile(r'(https?\:\/\/[a-zA-Z0-9\-\.]+\/wp-content\/uploads\/)')
class Post:
    """A single WordPress post extracted from the export file.

    Every constructor argument is stored unchanged on the instance under the
    same name, except ``date``: it is given as a string and stored as the
    value returned by ``dateutil.parser.parse``.
    """

    def __init__(self, title, author, date, content, category, status, slug, tags):
        # Parse the timestamp first so the remaining lines are plain copies.
        parsed_date = dateutil.parser.parse(date)

        self.title, self.author = title, author
        self.date = parsed_date
        self.content, self.category = content, category
        self.status, self.slug = status, slug
        self.tags = tags
def load_doc(filename):
    """Return the full contents of ``filename`` decoded as UTF-8 text.

    A progress line is printed before the file is opened.
    """
    print("> Loading document!")
    with io.open(filename, mode='r', encoding='UTF-8') as source:
        return source.read()
def parse_doc(doc):
    """Extract posts and attachment URLs from a WordPress export document.

    Args:
        doc: The export file's XML as a single string.

    Returns:
        A ``(posts, attachments)`` tuple. ``posts`` is a list of ``Post``
        objects, one per ``<item>`` whose ``wp:post_type`` is ``"post"``.
        ``attachments`` is a list of the ``<guid>`` strings of every item
        whose ``wp:post_type`` is ``"attachment"``. Items of any other type
        are ignored.
    """
    print("> Parsing document!")
    posts = []
    attachments = []
    soup = BeautifulSoup(doc, 'html.parser')
    for item in soup.find_all('item'):
        post_type = item.find('wp:post_type').string
        if post_type == "post":
            # Tags are the <category domain="post_tag"> elements.
            tags = [
                tag['nicename']
                for tag in item.find_all('category', {'domain': 'post_tag'})
            ]

            # .string is None for an empty element; treat that as empty
            # content instead of letting the regex substitution raise.
            raw_content = item.find('content:encoded').string or ""
            # Point upload URLs at Pelican's {attach}images/ location rather
            # than the absolute wp-content/uploads/ URL.
            content = uploadregex.sub('{attach}images/', raw_content)

            # A post without a <category domain="category"> element would
            # otherwise fail when subscripting None; fall back to an empty
            # category name.
            category_el = item.find('category', {'domain': 'category'})
            category = category_el['nicename'] if category_el is not None else ""

            posts.append(Post(
                item.find('title').string,
                item.find('dc:creator').string,
                item.find('wp:post_date').string,
                content,
                category,
                item.find('wp:status').string,
                item.find('wp:post_name').string,
                '; '.join(tags)))
        elif post_type == "attachment":
            attachments.append(item.guid.string)
    return posts, attachments
def gen_markdown(post):
    """Render ``post`` as a Pelican Markdown document.

    Args:
        post: A ``Post``; its ``title``, ``date``, ``category``, ``tags``,
            ``slug``, ``author`` and ``content`` attributes are read.

    Returns:
        A string made of a ``Key: value`` metadata header followed by the
        post content converted from HTML to Markdown by html2text.
    """
    h = html2text.HTML2Text()
    h.unicode_snob = 1
    h.body_width = 0
    h.dash_unordered_list = True

    # Encode double quotes and colons in the title as HTML entities so they
    # cannot be confused with the "Key: value" syntax of the header. An
    # empty <title> arrives as None, so fall back to an empty string.
    title = (post.title or "").translate(
        str.maketrans({"\"": "&quot;", ":": "&#58;"}))

    header ="""Title: %s
Date: %s
Category: %s
Tags: %s
Slug: %s
Author: %s
"""%(title, post.date.strftime("%Y-%m-%d %H:%M:%S"), post.category, post.tags, post.slug, post.author)

    # Turn WordPress code shortcodes into <pre> elements before conversion.
    # Use the compiled pattern's own sub(): the fourth positional parameter
    # of re.sub() is `count`, so passing re.U there limited each pattern to
    # 32 replacements instead of setting a flag.
    body = post.content
    for pattern in (coderegex1, coderegex2, coderegex3):
        body = pattern.sub(r"<pre>\1</pre>", body)

    body = h.handle(body)
    return header + body
def save_posts(output, posts):
    """Write each post to a Markdown file below the ``output`` directory.

    Posts are grouped by status: ``"publish"`` goes to ``_posts``,
    ``"draft"`` to ``_drafts`` and anything else to ``_other``. Each file is
    named ``<YYYY-MM-DD>-<slug>.md`` and holds the result of
    ``gen_markdown`` for that post, encoded as UTF-8. Missing directories
    are created.

    Args:
        output: Base output directory, with or without a trailing separator.
        posts: Iterable of ``Post`` objects.
    """
    print("> Saving posts!")
    status_dirs = {"publish": "_posts", "draft": "_drafts"}
    for p in posts:
        # os.path.join keeps the sub-directory inside `output` even when the
        # caller omits the trailing slash ("out" -> "out/_posts", not
        # "out_posts/").
        directory = os.path.join(output, status_dirs.get(p.status, "_other"))
        os.makedirs(directory, exist_ok=True)

        path = os.path.join(
            directory, p.date.strftime("%Y-%m-%d") + "-" + p.slug + ".md")
        print("Saving", path)
        with io.open(path, 'w', encoding='UTF-8') as f:
            f.write(gen_markdown(p))
def download_attachments(output, attachments):
    """Placeholder for downloading attachments.

    Currently this only prints a progress line; neither ``output`` nor
    ``attachments`` is used and nothing is downloaded.
    """
    # TODO: fetch every URL in `attachments` into the output directory.
    print("> Wget'ing attachments!")
def main():
    """Command-line entry point.

    Expects one or two arguments: the WordPress ``.xml`` export file and an
    optional output directory (default ``"./"``). With any other number of
    arguments the usage line is printed and nothing else happens.
    """
    args = sys.argv[1:]
    # Reject both "no arguments" and "too many arguments" up front; with
    # extra arguments the filename would otherwise never be assigned.
    if len(args) not in (1, 2):
        print("Parameters: filename for wordpress .xml export file, optional output directory")
        return

    filename = args[0]
    output = args[1] if len(args) == 2 else "./"

    doc = load_doc(filename)
    posts, attachments = parse_doc(doc)
    save_posts(output, posts)
    download_attachments(output, attachments)


if __name__ == '__main__':
    main()
@trentrichardson
Copy link
Author

trentrichardson commented Nov 2, 2018

Originally from wordpress-to-markdown, this version makes a few changes to match Pelican's Markdown specs for posts. It also grabs tags. I had issues with Pelican's version not working with the latest Pandoc, but with a couple of tweaks this script worked fine for my needs.

Setup:
I used pipenv, so I made a new directory and put this python file in it, then ran the following:

pipenv --three
pipenv install python-dateutil
pipenv install BeautifulSoup4
pipenv install html2text
pipenv install wget

Usage:

pipenv shell
python wp-to-md.py /path/to/wordpress.xml

It should generate a folder named _posts that contains the new .md files (drafts go to _drafts and posts with any other status to _other). Copy these files into your Pelican project's content directory. For images, copy the contents of your wp-content/uploads directory to content/images (the folders directly inside images should be the year/month-named folders).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment