Skip to content

Instantly share code, notes, and snippets.

@alx
Created November 19, 2008 11:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alx/26482 to your computer and use it in GitHub Desktop.
Save alx/26482 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import sgmllib, re, time, string, os, sys, getopt, logging
from xml.dom import minidom
#
# Script to transform a wordpress export file into multiple textile files
#
# Usage:
# python wordpress-export-to-textile.py -o ~/output-dir/ ~/path-to-file/wordpress-export.xml
#
# The script will crash on unexpected unicode chars;
# fix the offending characters in the last post shown in the debug log and re-run.
#
# If you've got new ideas, edits are welcomed on this gist
#
# Module metadata (original author, homepage, version and release date).
__author__ = 'Luis Rei <luis.rei@gmail.com>'
__homepage__ = 'http://luisrei.com'
__version__ = '1.0'
__date__ = '2008/03/23'
# Log everything, and send it to stderr.
# convert() logs each post title at DEBUG level before writing its file, so
# the last title logged identifies the post being processed if the run fails.
logging.basicConfig(level=logging.DEBUG)
# Matches any run of one or more whitespace characters.  Raw string so that
# ``\s`` reaches the regex engine instead of being an invalid string escape.
whitespace_re = re.compile(r"\s+")


def normalise_space(s):
    """Normalise space in the same manner as HTML.

    Any substring of one or more whitespace characters (spaces, tabs,
    newlines, ...) is replaced with a single space character.  Leading and
    trailing whitespace is collapsed to one space, not removed.

    Arguments:
    s -- the value to normalise; it is converted with str() first

    Returns the normalised string.
    """
    return whitespace_re.sub(" ", str(s))
def make_block_start_end_pair(tag):
    """Build the (start, end) handler pair for a Textile block element.

    The start handler writes the block signature "<tag>. " and opens a
    capture for the tag; the end handler closes that capture and writes a
    blank line.  Both are meant to be bound as methods of a parser object
    providing _write, _start_capture and _stop_capture_and_write.

    Arguments:
    tag -- the Textile block name, e.g. "h1" or "bq"

    Returns a (start_handler, end_handler) tuple.
    """
    def open_block(parser, attrs):
        # Emit the Textile block signature, then begin collecting content.
        parser._write("%s. " % tag)
        parser._start_capture(tag)

    def close_block(parser):
        # Flush the collected content and terminate the block.
        parser._stop_capture_and_write()
        parser._write("\n\n")

    handlers = (open_block, close_block)
    return handlers
def make_quicktag_start_end_pair(tag, wrapchar):
    """Build the (start, end) handler pair for an inline Textile quick tag.

    The start handler writes a space followed by the wrap character and opens
    a capture for the tag; the end handler closes that capture and writes the
    wrap character followed by a space.  Both are meant to be bound as methods
    of a parser object providing _write, _start_capture and
    _stop_capture_and_write.

    Arguments:
    tag -- the HTML tag name the handlers are created for, e.g. "strong"
    wrapchar -- the Textile marker placed around the content, e.g. "*"

    Returns a (start_handler, end_handler) tuple.
    """
    def open_quicktag(parser, attrs):
        # Leading space plus marker, then begin collecting content.
        parser._write([" ", wrapchar])
        parser._start_capture(tag)

    def close_quicktag(parser):
        # Flush the collected content, then marker plus trailing space.
        parser._stop_capture_and_write()
        parser._write([wrapchar, " "])

    handlers = (open_quicktag, close_quicktag)
    return handlers
class HtmlToTextileConvertingParser(sgmllib.SGMLParser):
    """An SGML parser class which converts the HTML it is fed into Textile.

    Usage: feed() the HTML, call close(), then read the ``result`` property.

    Tags that have a start_*/end_* handler below are translated to Textile
    markup.  Every other tag is dropped unless it is listed in ``valid_tags``
    (empty here), in which case it is passed through as raw HTML with only
    the attributes listed in ``valid_attrs``.

    Output is accumulated as a list of string fragments; see _write for how
    fragments are routed between the final result and open captures.
    """

    # Tags / attributes that unknown_starttag and unknown_endtag pass through.
    valid_tags = ()
    valid_attrs = ()
    # NOTE: block_tags is declared but not consulted anywhere in this class.
    block_tags = ("h1", "h2", "h3", "h4", "h5", "h6", "h7", "p", "bq")
    # Named entity table used by handle_entityref.
    from htmlentitydefs import entitydefs

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self._result = []       # fragments of the final output
        self._data_stack = []   # one list per currently open capture
        # _in_block and _in_ul are recorded by the handlers but never read;
        # _in_ol selects the list marker in start_li.
        self._in_block = self._in_ul = self._in_ol = False
        # href of the <a> element currently open, or a false value if none.
        self.a_href = False

    def handle_data(self, data):
        """Write character data with whitespace collapsed and trimmed."""
        if data:
            self._write(normalise_space(data).strip())

    def handle_charref(self, tag):
        """Write the character for a numeric reference such as &#8217;.

        The character is encoded to UTF-8 before it is written: _write calls
        str() on its argument, which raises UnicodeEncodeError for a unicode
        character outside ASCII.
        """
        self._write(unichr(int(tag)).encode("utf-8"))

    def handle_entityref(self, tag):
        """Write the replacement for a named entity; unknown ones are dropped."""
        if tag in self.entitydefs:
            self._write(self.entitydefs[tag])

    def handle_starttag(self, tag, method, attrs):
        """Call a start_* handler with the attributes converted to a dict."""
        method(dict(attrs))

    def _write(self, d):
        """Append one fragment, or a list/tuple of fragments, to the output.

        With fewer than two captures open the fragments go straight to the
        final result; otherwise they go to the innermost capture, which is
        written out again (through this same routing) when it is closed.
        """
        if len(self._data_stack) < 2:
            target = self._result
        else:
            target = self._data_stack[-1]
        if isinstance(d, (list, tuple)):
            target += d
        else:
            target.append(str(d))

    def _start_capture(self, tag):
        """Open a new capture for ``tag``."""
        self._in_block = tag
        self._data_stack.append([])

    def _stop_capture_and_write(self):
        """Close the innermost capture and write out what it collected."""
        self._in_block = False
        self._write(self._data_stack.pop())

    # Block elements: "<name>. " prefix, content, blank line.
    start_h1, end_h1 = make_block_start_end_pair("h1")
    start_h2, end_h2 = make_block_start_end_pair("h2")
    start_h3, end_h3 = make_block_start_end_pair("h3")
    start_h4, end_h4 = make_block_start_end_pair("h4")
    start_h5, end_h5 = make_block_start_end_pair("h5")
    start_h6, end_h6 = make_block_start_end_pair("h6")
    start_h7, end_h7 = make_block_start_end_pair("h7")
    start_blockquote, end_blockquote = make_block_start_end_pair("bq")

    # Inline quick tags: content wrapped in a marker and padded with spaces.
    start_b, end_b = make_quicktag_start_end_pair("b", "*")
    start_strong, end_strong = make_quicktag_start_end_pair("strong", "*")
    start_i, end_i = make_quicktag_start_end_pair("i", "_")
    start_em, end_em = make_quicktag_start_end_pair("em", "_")
    start_cite, end_cite = make_quicktag_start_end_pair("cite", "??")
    start_s, end_s = make_quicktag_start_end_pair("s", "-")
    start_sup, end_sup = make_quicktag_start_end_pair("sup", "^")
    start_sub, end_sub = make_quicktag_start_end_pair("sub", "~")

    # Paragraphs are written without the "p. " signature, so they are
    # defined by hand rather than through make_block_start_end_pair.
    def start_p(self, attrs):
        self._start_capture("p")

    def end_p(self):
        self._stop_capture_and_write()
        self._write("\n\n")

    def start_ol(self, attrs):
        self._in_ol = True

    def end_ol(self):
        self._in_ol = False
        self._write("\n")

    def start_ul(self, attrs):
        self._in_ul = True

    def end_ul(self):
        self._in_ul = False
        self._write("\n")

    def start_li(self, attrs):
        # "# " marks an ordered list item, "* " any other list item.
        if self._in_ol:
            self._write("# ")
        else:
            self._write("* ")
        self._start_capture("li")

    def end_li(self):
        self._stop_capture_and_write()
        self._write("\n")

    def start_a(self, attrs):
        # Only anchors with an href become Textile links ("text":url);
        # for the others just the enclosed text is kept.
        self.a_href = attrs.get("href")
        if self.a_href:
            self._write(" \"")
            self._start_capture("a")

    def end_a(self):
        if self.a_href:
            self._stop_capture_and_write()
            self._write(["\":", self.a_href, " "])
            self.a_href = False

    def start_img(self, attrs):
        # Images become !src!; every attribute other than src is discarded.
        if attrs.get("src"):
            self._write([" !", attrs["src"], "! "])

    def end_img(self):
        pass

    def start_tr(self, attrs):
        pass

    def end_tr(self):
        self._write("|\n")

    def start_td(self, attrs):
        self._write("|")
        self._start_capture("td")

    def end_td(self):
        self._stop_capture_and_write()
        self._write("|")

    def start_br(self, attrs):
        self._write("\n")

    def unknown_starttag(self, tag, attrs):
        """Delete all other tags except for those specified in valid_tags"""
        if tag in self.valid_tags:
            self._write(["<", tag])
            for k, v in attrs:
                if k in self.valid_attrs:
                    self._write([" ", k, "=\"", v, "\""])
            self._write(">")

    def unknown_endtag(self, tag):
        if tag in self.valid_tags:
            self._write(["</", tag, ">"])

    def _get_result(self):
        """Return everything written so far, joined and stripped."""
        return "".join(self._result).strip()

    result = property(_get_result)
def html2textile(s):
    """Convert a snippet of HTML to Textile, a simple markup language.

    See http://www.textism.com/tools/textile/ for Textile's rules.  The
    snippet is run through a fresh HtmlToTextileConvertingParser and the
    text it produced is returned with surrounding whitespace stripped.

    Arguments:
    s -- the HTML snippet to convert

    Example:
    >>> html2textile("<h1>Hello world!</h1>")
    'h1. Hello world!'
    """
    converter = HtmlToTextileConvertingParser()
    converter.feed(s)
    # close() makes the parser process any input it is still buffering.
    converter.close()
    textile = converter.result
    return textile
def _child_text(node, tag_name):
    """Return the text of the first ``tag_name`` element below ``node``.

    An element that is present but empty (it has no child node) yields ""
    instead of failing.  IndexError is raised if there is no such element.
    """
    child = node.getElementsByTagName(tag_name)[0].firstChild
    if child is None:
        return ""
    return child.data


def convert(infile, outdir, authorDirs, categoryDirs):
    """Convert a Wordpress Export File to multiple textile files.

    One file named "<YYYY-MM-DD> - <title>.textile" is written per <item>
    of the export, below "<outdir>/wordpress/".  Each file starts with a
    small front-matter header followed by the post body converted to Textile.

    Keyword arguments:
    infile -- the location of the Wordpress Export File
    outdir -- the directory where the files will be created
    authorDirs -- if true, create different directories for each author
    categoryDirs -- if true, create directories for each category

    Side effect: the working directory is changed to "<outdir>/wordpress/".

    Raises ValueError if a post's pubDate is not in the
    "%a, %d %b %Y %H:%M:%S +0000" format, and IndexError if an item lacks
    one of the expected elements.
    """
    # First we parse the XML file into a list of posts.
    # Each post is a dictionary
    dom = minidom.parse(infile)
    blog = []  # list that will contain all posts
    for node in dom.getElementsByTagName('item'):
        post = {
            "title": _child_text(node, 'title'),
            "date": _child_text(node, 'pubDate'),
            "author": _child_text(node, 'dc:creator'),
            "id": _child_text(node, 'wp:post_id'),
            "text": _child_text(node, 'content:encoded'),
        }
        # wp:attachment_url could be use to download attachments
        # Get the categories, skipping elements without a nicename.
        nicenames = [subnode.getAttribute('nicename')
                     for subnode in node.getElementsByTagName('category')]
        post["categories"] = [name for name in nicenames if name != '']
        # Add post to the list of all posts
        blog.append(post)

    # Then we create the directories and textile files from the list of posts.
    # The "base" directory.  It is made absolute so that a relative outdir
    # keeps pointing at the same place after the chdir below.
    outdir = os.path.join(os.path.abspath(outdir), "wordpress")
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    # Not needed for the absolute paths used below; kept because changing
    # the working directory has always been a side effect of this function.
    os.chdir(outdir)

    for post in blog:
        # The "author" and "category" directories.  Every component is
        # encoded to UTF-8 so that it can be combined with the encoded title
        # without an implicit (ASCII) conversion.
        parts = []
        if authorDirs:
            parts.append(post["author"].encode('utf-8'))
        # This creates a path for the file in the format
        # category1/category2/category3/file, with the categories in the
        # order in which they appear in the export file.
        if categoryDirs:
            parts.extend(name.encode('utf-8') for name in post["categories"])
        target_dir = os.path.join(outdir, *parts)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        date = time.strptime(post["date"], "%a, %d %b %Y %H:%M:%S +0000")
        file_date = time.strftime("%Y-%m-%d", date)
        post_date = time.strftime("%d %b %Y", date)

        # And finally the file itself
        title = post["title"].encode('utf-8')
        logging.debug("title: " + title)
        # A path separator in the title would otherwise be taken for a
        # directory that does not exist.
        slug = title.replace(" ", "-").lower().replace(os.sep, "-")
        filename = os.path.join(
            target_dir, file_date + ' - ' + slug + '.textile')

        # Add "HTML header"
        start = "---\ntemplate: post\ntitle: "+ title +"\n---\n\nh1. {{ page.title }}\n\np(meta). " + post_date + "\n\n"
        # Convert the unicode object to a string that can be written to a file
        # with the proper encoding (UTF-8)
        text = post["text"].encode('utf-8')
        text = text.replace("\n", "\n\n")
        # Convert before opening the file so that a failing conversion does
        # not leave a header-only file behind.
        body = html2textile(text)
        with open(filename, 'w') as f:
            f.write(start)
            f.write(body)
def usage(pname):
    """Displays usage information

    keyword arguments:
    pname -- program name (e.g. obtained as argv[0])
    """
    # A single parenthesised argument prints identically whether print is a
    # statement or a function.
    print("""python %s [-hac] [-o outdir] infile
Converts a Wordpress Export File to multiple textile files.
Options:
-h,--help\tDisplays this information.
-a,--authors\tCreate different directories for each author.
-c,--categories\tCreate directory structure from post categories.
-o,--outdir\tSpecify a directory for the output.
Example:
python %s -c -o ~/TEMP ~/wordpress.2008-03-20.xml
""" % (pname, pname))
def main(argv):
    """Parse the command line and run the conversion.

    Arguments:
    argv -- the full argument vector, program name first (e.g. sys.argv)

    Exits with status 2 on an invalid option, with status 3 unless exactly
    one export file is given, and with status 0 after printing the help.
    """
    outdir = ""
    authors = False
    categories = False
    try:
        # Only -o/--outdir takes a value: ":" after a short option and "="
        # after a long one mark options that require an argument.
        opts, args = getopt.getopt(
            argv[1:], "hao:c", ["help", "authors", "outdir=", "categories"])
    except getopt.GetoptError as err:
        print(str(err))
        usage(argv[0])
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage(argv[0])
            sys.exit()
        elif opt in ("-a", "--authors"):
            authors = True
        elif opt in ("-c", "--categories"):
            categories = True
        elif opt in ("-o", "--outdir"):
            outdir = arg
    if not args:
        print("Error: Missing Argument: missing wordpress export file.")
        usage(argv[0])
        sys.exit(3)
    if len(args) > 1:
        # Several positional arguments cannot be one file name.
        print("Error: Too many arguments: expected a single wordpress export file.")
        usage(argv[0])
        sys.exit(3)
    infile = args[0]
    if outdir == "":
        # Use the current directory
        outdir = os.getcwd()
    convert(infile, outdir, authors, categories)


if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment