Skip to content

Instantly share code, notes, and snippets.

@jeremygibbs
Forked from Reiot/wordpress2octopress.py
Created January 31, 2012 10:22
Show Gist options
  • Save jeremygibbs/1709782 to your computer and use it in GitHub Desktop.
Save jeremygibbs/1709782 to your computer and use it in GitHub Desktop.
WordPress XML ==> Second Crack Markdown
#!/usr/bin/env python
#######################################################
# File: wp2sc.py #
# Date: 31 January 2011 #
# Auth: Jeremy A. Gibbs #
# Depn: Requires BeautifulSoup and html2text #
# Desc: This script reads in a Wordpress .xml export #
# file and converts the contents into markdown #
# formatted files suitable to import into #
# Marco Arment's Second Crack static blogging #
# engine. #
# Orig: https://gist.github.com/1239373 (by Reiot) #
# Original was for wordpress to octopress #
# Fork: https://gist.github.com/1709782 #
# I rewrote/cleaned up script to read my own #
# WordPress structure and to output in the #
# specific format Second Crack requires #
#######################################################
import os, sys, urllib, codecs
from datetime import datetime
from BeautifulSoup import BeautifulStoneSoup
from html2text import html2text
#################
# Configuration #
#################
wpXML = "wordpress-xml/export.xml"
format = '%04d-%02d-%02d-%s.txt'
statFilter = [u'publish']
postFilter = [u'post', u'page']
metaFilter = [u'Link']
ctgyFilter = [u'Uncategorized']
tagsFilter = []
writeCats = False
writeTags = True
writeExtra = False
##############
# XML Parser #
##############
def parse_item(item):
# return unicode of a selection
def _(node):
if not node or not node.string:
return u''
u = unicode(node.string)
if u.startswith(u'<![CDATA['):
u = u[9:-3]
return u
# only grab posts and pages
postType = _(item.find("wp:post_type"))
if postType not in postFilter:
return
# only grab published articles
wp_status = _(item.find("wp:status"))
if wp_status not in statFilter:
return
# article title
title = _(item.find("title"))
tLen = len(title)
# article publication date
postDate = _(item.find("wp:post_date"))
postDate = datetime.strptime(postDate,"%Y-%m-%d %H:%M:%S")
# article slug name
slug = _(item.find("wp:post_name"))
slug = urllib.unquote(slug.encode('utf-8')).decode('utf-8')
assert isinstance(slug, unicode), 'slug should be unicode'
# markdown output file
outFile = u'%04d%02d%02d-%s.txt'%(postDate.year, postDate.month, postDate.day, slug)
outPath = os.path.join(u"source", u"_%ss"% postType)
if not os.access( outPath, os.F_OK ):
os.mkdir( outPath )
out = codecs.open( os.path.join(outPath, outFile), "w", encoding='utf-8')
# write title and separator
out.write(u'%s\n'%title)
out.write(u'%s\n'%('='*tLen))
# check for linked post (if stored as meta value)
has_meta = {}
for meta in item.findAll("wp:postmeta"):
key = _(meta.find("wp:meta_key"))
value = _(meta.find("wp:meta_value"))
if key in metaFilter:
has_meta[key] = value
if has_meta:
for key, value in has_meta.iteritems():
out.write(u'%s: %s\n'%(key, value))
out.write(u'Type: link\n')
# publication date
out.write(u'Published: %s\n\n'% postDate)
# categories
if (writeCats):
categories = []
for category in item.findAll("category",{"domain":"category"}):
categories.append(_(category))
categories = list(set([c for c in categories if c not in ctgyFilter]))
if categories:
out.write(u'Categories:')
for category in categories:
out.write(u' %s,'% category)
out.write(u'\n')
# tags
if (writeTags):
tags = []
for tag in item.findAll("category",{"domain":"tag"}):
tags.append(_(tag))
tags = list(set([t for t in tags if t not in tagsFilter]))
if tags:
out.write(u'Tags:')
for tag in tags:
out.write(u' %s,'% tag)
out.write(u'\n')
# extras
if (writeExtra):
# comment status
wp_comment_status = _(item.find("wp:comment_status"))
out.write(u'comments: %s\n'% ('true' if wp_comment_status == u'open' else 'false'))
# old permlink
link = _(item.find("link"))
out.write(u'link: %s\n'% link)
# article body in markdown format
body = _(item.find("content:encoded"))
body = html2text(body.strip(), None)
out.write(body)
out.close()
if __name__ == '__main__':
print 'Reading WordPress XML Export File'
xml = BeautifulStoneSoup(open(wpXML))
print 'Parsing WordPress XML Export File'
for item in xml.findAll("item"):
parse_item(item)
print 'Conversion Completed'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment