Created
November 19, 2008 11:30
-
-
Save alx/26482 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sgmllib, re, time, string, os, sys, getopt, logging | |
from xml.dom import minidom | |
# | |
# Script to transform a wordpress export file into multiple textile files | |
# | |
# Usage: | |
# python wordpress-export-to-textile.py -o ~/output-dir/ ~/path-to-file/wordpress-export.xml
# | |
# The script may crash on unexpected unicode characters. If it does, the last
# post title shown in the debug log is the offending post: clean it up in the
# export file and run the script again.
# | |
# If you've got new ideas, edits are welcomed on this gist | |
# | |
__author__ = 'Luis Rei <luis.rei@gmail.com>'
__homepage__ = 'http://luisrei.com'
__version__ = '1.0'
__date__ = '2008/03/23'

# Log everything, and send it to stderr.
logging.basicConfig(level=logging.DEBUG)

# Matches any run of whitespace characters. Written as a raw string so the
# backslash reaches the regex engine as-is instead of depending on Python
# leaving the unrecognised "\s" string escape untouched.
whitespace_re = re.compile(r"\s+")
def normalise_space(s):
    """Collapse whitespace the way an HTML renderer would.

    The argument is converted with ``str()`` and every run of one or more
    whitespace characters in it is replaced by a single space character.
    Leading and trailing whitespace is collapsed too, not removed.
    """
    text = str(s)
    return whitespace_re.sub(" ", text)
def make_block_start_end_pair(tag):
    """Build the start/end handler pair for a Textile block element.

    The start handler writes the block prefix ``"<tag>. "`` and opens a
    capture for *tag*; the end handler closes that capture, writes whatever
    it collected and finishes the block with a blank line.

    Returns ``(start_handler, end_handler)``. Both are meant to be bound as
    methods on an object providing ``_write``, ``_start_capture`` and
    ``_stop_capture_and_write``.
    """
    def open_block(self, attrs):
        # attrs is accepted for the handler signature but not used.
        self._write("%s. " % tag)
        self._start_capture(tag)

    def close_block(self):
        self._stop_capture_and_write()
        self._write("\n\n")

    return (open_block, close_block)
def make_quicktag_start_end_pair(tag, wrapchar):
    """Build the start/end handler pair for an inline ("quicktag") element.

    The start handler writes a space followed by *wrapchar* and opens a
    capture for *tag*; the end handler closes the capture, writes what it
    collected, then writes *wrapchar* followed by a space.

    Returns ``(start_handler, end_handler)``. Both are meant to be bound as
    methods on an object providing ``_write``, ``_start_capture`` and
    ``_stop_capture_and_write``.
    """
    def open_quicktag(self, attrs):
        # attrs is accepted for the handler signature but not used.
        opening = [" ", wrapchar]
        self._write(opening)
        self._start_capture(tag)

    def close_quicktag(self):
        self._stop_capture_and_write()
        closing = [wrapchar, " "]
        self._write(closing)

    return (open_quicktag, close_quicktag)
class HtmlToTextileConvertingParser(sgmllib.SGMLParser):
    """An SGML parser class which traverses the tree and converts HTML tags into
    Textile markup.

    Usage: ``feed()`` the HTML, call ``close()``, then read ``result``.
    Tags without a ``start_<tag>`` handler are dropped unless they are listed
    in ``valid_tags``; their text content is still emitted.

    Text is handled as byte strings. Character and entity references are
    written UTF-8 encoded, which matches the UTF-8 text that ``convert``
    feeds into ``html2textile``.
    """

    # Tags echoed verbatim by unknown_starttag/unknown_endtag, and the
    # attributes kept on them. Both are empty, so unknown tags are dropped.
    valid_tags = ()
    valid_attrs = ()
    # Not consulted anywhere in this class.
    block_tags = ("h1", "h2", "h3", "h4", "h5", "h6", "h7", "p", "bq")
    # entitydefs replaces SGMLParser's default table; name2codepoint is what
    # handle_entityref uses to produce UTF-8 output.
    from htmlentitydefs import entitydefs, name2codepoint

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self._result = []
        self._data_stack = []
        self._in_block = self._in_ul = self._in_ol = False
        # href of the anchor currently open, or a false value when none is.
        self.a_href = False

    def handle_data(self, data):
        """Write a text run with whitespace collapsed and both ends stripped."""
        if data:
            self._write(normalise_space(data).strip())

    def handle_charref(self, tag):
        """Write the character named by a numeric reference such as ``&#8217;``.

        References that are not decimal numbers are dropped silently, the
        same way unknown entity names are.
        """
        try:
            codepoint = int(tag)
        except (TypeError, ValueError):
            return
        self._write_codepoint(codepoint)

    def handle_entityref(self, tag):
        """Write the character for a named entity; unknown names are dropped."""
        if tag in self.name2codepoint:
            self._write_codepoint(self.name2codepoint[tag])

    def _write_codepoint(self, codepoint):
        """Write the UTF-8 encoding of *codepoint*; ignore invalid values."""
        try:
            char = unichr(codepoint)
        except (ValueError, OverflowError):
            return
        # Encode explicitly: _write() passes its argument through str(), which
        # would raise UnicodeEncodeError for anything outside ASCII.
        self._write(char.encode("utf-8"))

    def handle_starttag(self, tag, method, attrs):
        """Call the start handler with the attributes as a dict, not a list."""
        method(dict(attrs))

    def _write(self, d):
        """Append *d* (a string, or a list/tuple of strings) to the output.

        Output goes straight to the result while fewer than two captures are
        open; from the second nested capture on it is buffered in the
        innermost capture until that capture is closed.
        """
        if len(self._data_stack) < 2:
            target = self._result
        else:
            target = self._data_stack[-1]
        if isinstance(d, (list, tuple)):
            target.extend(d)
        else:
            target.append(str(d))

    def _start_capture(self, tag):
        """Open a new capture buffer for *tag*."""
        self._in_block = tag
        self._data_stack.append([])

    def _stop_capture_and_write(self):
        """Close the innermost capture and write out what it collected."""
        self._in_block = False
        self._write(self._data_stack.pop())

    start_h1, end_h1 = make_block_start_end_pair("h1")
    start_h2, end_h2 = make_block_start_end_pair("h2")
    start_h3, end_h3 = make_block_start_end_pair("h3")
    start_h4, end_h4 = make_block_start_end_pair("h4")
    start_h5, end_h5 = make_block_start_end_pair("h5")
    start_h6, end_h6 = make_block_start_end_pair("h6")
    start_h7, end_h7 = make_block_start_end_pair("h7")
    start_blockquote, end_blockquote = make_block_start_end_pair("bq")
    start_b, end_b = make_quicktag_start_end_pair("b", "*")
    start_strong, end_strong = make_quicktag_start_end_pair("strong", "*")
    start_i, end_i = make_quicktag_start_end_pair("i", "_")
    start_em, end_em = make_quicktag_start_end_pair("em", "_")
    start_cite, end_cite = make_quicktag_start_end_pair("cite", "??")
    start_s, end_s = make_quicktag_start_end_pair("s", "-")
    start_sup, end_sup = make_quicktag_start_end_pair("sup", "^")
    start_sub, end_sub = make_quicktag_start_end_pair("sub", "~")

    # Paragraphs get explicit handlers rather than a factory-made pair: they
    # are written without the "p. " prefix, only followed by a blank line.
    def start_p(self, attrs):
        self._start_capture("p")

    def end_p(self):
        self._stop_capture_and_write()
        self._write("\n\n")

    def start_ol(self, attrs):
        self._in_ol = True

    def end_ol(self):
        self._in_ol = False
        self._write("\n")

    def start_ul(self, attrs):
        self._in_ul = True

    def end_ul(self):
        self._in_ul = False
        self._write("\n")

    def start_li(self, attrs):
        """Write the list marker: ``#`` inside an ordered list, else ``*``."""
        if self._in_ol:
            self._write("# ")
        else:
            self._write("* ")
        self._start_capture("li")

    def end_li(self):
        self._stop_capture_and_write()
        self._write("\n")

    def start_a(self, attrs):
        """Open a Textile link; anchors without an href emit only their text."""
        self.a_href = attrs.get("href")
        if self.a_href:
            self._write(" \"")
            self._start_capture("a")

    def end_a(self):
        if self.a_href:
            self._stop_capture_and_write()
            self._write(["\":", self.a_href, " "])
        self.a_href = False

    def start_img(self, attrs):
        """Write ``!src!`` for images that have a src attribute."""
        if attrs.get("src"):
            self._write([" !", attrs["src"], "! "])

    def end_img(self):
        pass

    def start_tr(self, attrs):
        pass

    def end_tr(self):
        self._write("|\n")

    def start_td(self, attrs):
        self._write("|")
        self._start_capture("td")

    def end_td(self):
        self._stop_capture_and_write()
        self._write("|")

    def start_br(self, attrs):
        self._write("\n")

    def unknown_starttag(self, tag, attrs):
        """Delete all other tags except for those specified in valid_tags"""
        if tag in self.valid_tags:
            self._write(["<", tag])
            for k, v in attrs:
                if k in self.valid_attrs:
                    self._write([" ", k, "=\"", v, "\""])
            self._write(">")

    def unknown_endtag(self, tag):
        if tag in self.valid_tags:
            self._write(["</", tag, ">"])

    def _get_result(self):
        """Return everything written so far as one stripped string."""
        return "".join(self._result).strip()

    result = property(_get_result)
def html2textile(s):
    """Convert a snippet of HTML to Textile, a simple markup language. See
    http://www.textism.com/tools/textile/ for Textile's rules.

    Inline elements and links are written with a space on either side, and
    paragraphs are written without an explicit ``p.`` prefix. Leading and
    trailing whitespace is stripped from the result.

    >>> html2textile("<h1>Hello world!</h1>")
    'h1. Hello world!'
    >>> html2textile("<h1>Hello <strong>world</strong>!</h1>")
    'h1. Hello *world* !'
    >>> html2textile('<h1>Hello <a href="http://www.google.com/">world</a>!</h1>')
    'h1. Hello "world":http://www.google.com/ !'
    >>> html2textile('<img src="http://www.google.com/intl/en/images/logo.gif" width="276" height="110" alt="Google logo">')
    '!http://www.google.com/intl/en/images/logo.gif!'
    >>> html2textile('<h1>Hello world!</h1><p>Welcome to my home page.</p>')
    'h1. Hello world!\\n\\nWelcome to my home page.'
    """
    parser = HtmlToTextileConvertingParser()
    parser.feed(s)
    # close() makes the parser process any input it is still buffering.
    parser.close()
    return parser.result
def _first_text(node, tag_name):
    """Return the text of the first *tag_name* element under *node*, or ""."""
    elements = node.getElementsByTagName(tag_name)
    if not elements or elements[0].firstChild is None:
        return ""
    return elements[0].firstChild.data


def _safe_component(name):
    """Replace path separators in *name* so it stays a single path component."""
    for separator in (os.sep, os.altsep):
        if separator:
            name = name.replace(separator, "-")
    return name


def _parse_posts(infile):
    """Parse the export file into a list of post dictionaries.

    Each dictionary has the keys "title", "date", "author", "id", "text" and
    "categories" (the non-empty ``nicename`` attributes of the post's
    <category> elements, in document order).
    """
    dom = minidom.parse(infile)
    posts = []
    for node in dom.getElementsByTagName('item'):
        nicenames = [category.getAttribute('nicename')
                     for category in node.getElementsByTagName('category')]
        # wp:attachment_url could be used to download attachments
        posts.append({
            "title": _first_text(node, 'title'),
            "date": _first_text(node, 'pubDate'),
            "author": _first_text(node, 'dc:creator'),
            "id": _first_text(node, 'wp:post_id'),
            "text": _first_text(node, 'content:encoded'),
            "categories": [name for name in nicenames if name != ''],
        })
    return posts


def convert(infile, outdir, authorDirs, categoryDirs):
    """Convert a Wordpress Export File to multiple Textile files.

    One ``<date> - <title>.textile`` file is written per post below
    ``<outdir>/wordpress/``. Each file starts with a small front-matter
    header, followed by the post body converted with ``html2textile``.

    Keyword arguments:
    infile -- the location of the Wordpress Export File
    outdir -- the directory where the files will be created
    authorDirs -- if true, create different directories for each author
    categoryDirs -- if true, create directories for each category

    Raises ValueError if a post's pubDate is not in the
    "%a, %d %b %Y %H:%M:%S +0000" format.
    """
    blog = _parse_posts(infile)

    # Resolve the base directory once, as an absolute path, and never change
    # the working directory: a relative outdir then keeps pointing at the
    # same place for every post.
    base_dir = os.path.abspath(os.path.join(outdir, "wordpress"))
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)

    for post in blog:
        # Directory layout: [author/][category1/category2/.../]file
        # Every component is a UTF-8 byte string so they can be joined safely.
        components = []
        if authorDirs:
            components.append(_safe_component(post["author"].encode('utf-8')))
        if categoryDirs:
            components.extend(_safe_component(name.encode('utf-8'))
                              for name in post["categories"])
        post_dir = os.path.join(base_dir, *components)
        if not os.path.exists(post_dir):
            os.makedirs(post_dir)

        date = time.strptime(post["date"], "%a, %d %b %Y %H:%M:%S +0000")
        file_date = time.strftime("%Y-%m-%d", date)
        post_date = time.strftime("%d %b %Y", date)

        title = post["title"].encode('utf-8')
        logging.debug("title: %s", title)
        # A "/" in the title would otherwise be read as a directory separator.
        slug = _safe_component(title.replace(" ", "-").lower())
        filename = os.path.join(post_dir, file_date + ' - ' + slug + '.textile')

        # Front-matter header
        start = ("---\ntemplate: post\ntitle: " + title + "\n---\n\n"
                 "h1. {{ page.title }}\n\np(meta). " + post_date + "\n\n")
        # Convert the unicode object to a string that can be written to a file
        # with the proper encoding (UTF-8)
        text = post["text"].encode('utf-8')
        text = text.replace("\n", "\n\n")
        # Convert before opening the file so a failing post leaves no
        # half-written file behind.
        body = html2textile(text)

        with open(filename, 'w') as f:
            f.write(start)
            f.write(body)
def usage(pname):
    """Print usage information to standard output.

    Keyword arguments:
    pname -- program name (e.g. obtained as argv[0])
    """
    # print(...) with a single argument behaves the same as a Python 2 print
    # statement and as the Python 3 print function.
    print("""python %s [-hac] [-o outdir] infile
Converts a Wordpress Export File to multiple Textile files.

Options:
    -h,--help\tDisplays this information.
    -a,--authors\tCreate different directories for each author.
    -c,--categories\tCreate directory structure from post categories.
    -o,--outdir\tSpecify a directory for the output.

Example:
    python %s -c -o ~/TEMP ~/wordpress.2008-03-20.xml
""" % (pname, pname))
def main(argv):
    """Parse the command line in *argv* and run the conversion.

    argv -- full argument vector, program name first (e.g. sys.argv)

    Exits with status 2 on an invalid option, 3 when the export file argument
    is missing or more than one is given, and 0 after printing help.
    """
    outdir = ""
    authors = False
    categories = False
    try:
        # -a and -c are plain flags; only -o / --outdir takes a value, hence
        # the single colon after "o" and the "=" after "outdir".
        opts, args = getopt.getopt(
            argv[1:], "hao:c", ["help", "authors", "outdir=", "categories"])
    except getopt.GetoptError as err:
        print(str(err))
        usage(argv[0])
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage(argv[0])
            sys.exit()
        elif opt in ("-a", "--authors"):
            authors = True
        elif opt in ("-c", "--categories"):
            categories = True
        elif opt in ("-o", "--outdir"):
            outdir = arg
    if not args:
        print("Error: Missing Argument: missing wordpress export file.")
        usage(argv[0])
        sys.exit(3)
    if len(args) > 1:
        # Joining several arguments into one path would only produce a
        # confusing "file not found" error later on.
        print("Error: expected exactly one wordpress export file, got %d."
              % len(args))
        usage(argv[0])
        sys.exit(3)
    infile = args[0]
    if outdir == "":
        # Use the current directory
        outdir = os.getcwd()
    convert(infile, outdir, authors, categories)


if __name__ == "__main__":
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment