alx/wordpress-export-to-textile.py

## wordpress-export-to-textile.py
#!/usr/bin/env python

import sgmllib, re, time, string, os, sys, getopt, logging
from xml.dom import minidom

#
# Script to transform a wordpress export file into multiple textile files
#
# Usage:
# python wordpress-export-to-textile.py -o ~/output-dir/ ~/path-to-file/wordpress-export.xml
#
# The script will crash on unexpected unicode characters; when it does,
# clean up the last post whose title was shown in the debug log output.
#
# If you've got new ideas, edits are welcomed on this gist
#


# Module metadata.
__author__ = 'Luis Rei <luis.rei@gmail.com>'
__homepage__ = 'http://luisrei.com'
__version__ = '1.0'
__date__ = '2008/03/23'

# Log everything (DEBUG and above); with no handler arguments basicConfig
# sends the output to stderr.
logging.basicConfig(level=logging.DEBUG)

# Matches any run of one or more whitespace characters.
whitespace_re = re.compile(r"\s+")


def normalise_space(s):
    """Normalise space in the same manner as HTML.

    Every run of consecutive whitespace characters (spaces, tabs, newlines)
    in ``s`` is replaced with a single space character.  ``s`` is passed
    through ``str()`` first, so non-string values are accepted.  Leading and
    trailing whitespace is collapsed to one space, not removed.

    """
    return whitespace_re.sub(" ", str(s))


def make_block_start_end_pair(tag):
    """Build the start/end handler pair for a Textile block element.

    The two returned functions are meant to be bound as ``start_*`` and
    ``end_*`` methods of the parser.  The start handler writes the block
    signature (``"<tag>. "``) and opens a capture for ``tag``; the end
    handler closes the capture and writes a blank line.

    """
    signature = "%s. " % tag

    def open_block(parser, attrs):
        parser._write(signature)
        parser._start_capture(tag)

    def close_block(parser):
        parser._stop_capture_and_write()
        parser._write("\n\n")

    return open_block, close_block


def make_quicktag_start_end_pair(tag, wrapchar):
    """Build the start/end handler pair for an inline Textile element.

    The start handler writes a space followed by ``wrapchar`` and opens a
    capture for ``tag``; the end handler closes the capture and writes
    ``wrapchar`` followed by a space.

    """
    def open_inline(parser, attrs):
        opening = [" ", wrapchar]
        parser._write(opening)
        parser._start_capture(tag)

    def close_inline(parser):
        parser._stop_capture_and_write()
        closing = [wrapchar, " "]
        parser._write(closing)

    return open_inline, close_inline


class HtmlToTextileConvertingParser(sgmllib.SGMLParser):
    """An SGML parser class which converts HTML tags into Textile markup.

    Feed HTML with ``feed()``, call ``close()`` and read ``result``.  Tags
    without a ``start_*`` handler are dropped (their text is kept) unless
    they are listed in ``valid_tags``.

    Output is collected as byte strings.  Character and entity references
    are written UTF-8 encoded, which matches the UTF-8 encoded text that
    ``convert()`` feeds to the parser.

    """
    # Tags re-emitted as literal HTML by unknown_starttag/unknown_endtag.
    valid_tags = ()
    # Attributes kept on those re-emitted tags.
    valid_attrs = ()
    block_tags = ("h1", "h2", "h3", "h4", "h5", "h6", "h7", "p", "bq")

    # Imported in the class body so both tables are reachable as attributes
    # of the parser; name2codepoint is what handle_entityref() uses.
    from htmlentitydefs import entitydefs, name2codepoint

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self._result = []       # finished output fragments
        self._data_stack = []   # one list per currently open capture
        self._in_block = self._in_ul = self._in_ol = False
        self.a_href = None      # href of the link being captured, if any

    def handle_data(self, data):
        """Write text content with whitespace collapsed and edges stripped."""
        if data:
            self._write(normalise_space(data).strip())

    def handle_charref(self, tag):
        """Write the character for a numeric reference such as ``&#8217;``.

        References that do not name a valid character are dropped.
        """
        try:
            char = unichr(int(tag))
        except (ValueError, OverflowError):
            return
        # Encode explicitly: _write() calls str(), which raises
        # UnicodeEncodeError for a non-ASCII unicode character.
        self._write(char.encode("utf-8"))

    def handle_entityref(self, tag):
        """Write the character for a named reference such as ``&amp;``.

        Unknown entity names are dropped.  The character is written UTF-8
        encoded rather than as the Latin-1 byte or ``&#NNNN;`` text stored
        in ``entitydefs``, so it fits the rest of the output.
        """
        codepoint = self.name2codepoint.get(tag)
        if codepoint is not None:
            self._write(unichr(codepoint).encode("utf-8"))

    def handle_starttag(self, tag, method, attrs):
        """Call the ``start_*`` handler with the attributes as a dict."""
        method(dict(attrs))

    def _write(self, d):
        """Append ``d`` (a string, or a list/tuple of strings) to the output.

        While fewer than two captures are open the text goes straight to
        the result; from the second nested capture on it is buffered in the
        innermost capture.
        """
        if len(self._data_stack) < 2:
            target = self._result
        else:
            target = self._data_stack[-1]
        if isinstance(d, (list, tuple)):
            target += d
        else:
            target.append(str(d))

    def _start_capture(self, tag):
        """Open a new capture buffer on behalf of ``tag``."""
        self._in_block = tag
        self._data_stack.append([])

    def _stop_capture_and_write(self):
        """Close the innermost capture and write out what it collected."""
        self._in_block = False
        # pop() runs before _write() chooses its target, so the captured
        # text lands one level up (or in the result).
        self._write(self._data_stack.pop())

    start_h1, end_h1 = make_block_start_end_pair("h1")
    start_h2, end_h2 = make_block_start_end_pair("h2")
    start_h3, end_h3 = make_block_start_end_pair("h3")
    start_h4, end_h4 = make_block_start_end_pair("h4")
    start_h5, end_h5 = make_block_start_end_pair("h5")
    start_h6, end_h6 = make_block_start_end_pair("h6")
    start_h7, end_h7 = make_block_start_end_pair("h7")
    start_blockquote, end_blockquote = make_block_start_end_pair("bq")

    start_b, end_b = make_quicktag_start_end_pair("b", "*")
    start_strong, end_strong = make_quicktag_start_end_pair("strong", "*")
    start_i, end_i = make_quicktag_start_end_pair("i", "_")
    start_em, end_em = make_quicktag_start_end_pair("em", "_")
    start_cite, end_cite = make_quicktag_start_end_pair("cite", "??")
    start_s, end_s = make_quicktag_start_end_pair("s", "-")
    start_sup, end_sup = make_quicktag_start_end_pair("sup", "^")
    start_sub, end_sub = make_quicktag_start_end_pair("sub", "~")

    # Paragraphs are written without a "p. " signature, only followed by a
    # blank line.
    def start_p(self, attrs):
        self._start_capture("p")

    def end_p(self):
        self._stop_capture_and_write()
        self._write("\n\n")

    def start_ol(self, attrs):
        self._in_ol = True

    def end_ol(self):
        self._in_ol = False
        self._write("\n")

    def start_ul(self, attrs):
        self._in_ul = True

    def end_ul(self):
        self._in_ul = False
        self._write("\n")

    def start_li(self, attrs):
        """Write the list marker: ``#`` inside <ol>, ``*`` otherwise."""
        if self._in_ol:
            self._write("# ")
        else:
            self._write("* ")
        self._start_capture("li")

    def end_li(self):
        self._stop_capture_and_write()
        self._write("\n")

    def start_a(self, attrs):
        """Start a Textile link; anchors without an href are ignored."""
        self.a_href = attrs.get("href")
        if self.a_href:
            self._write(" \"")
            self._start_capture("a")

    def end_a(self):
        """Finish the link as ``"text":href`` if start_a() opened one."""
        if self.a_href:
            self._stop_capture_and_write()
            self._write(["\":", self.a_href, " "])
            self.a_href = False

    def start_img(self, attrs):
        """Write ``!src!`` for images that have a src attribute."""
        if attrs.get("src"):
            self._write([" !", attrs["src"], "! "])

    def end_img(self):
        pass

    def start_tr(self, attrs):
        pass

    def end_tr(self):
        self._write("|\n")

    def start_td(self, attrs):
        self._write("|")
        self._start_capture("td")

    def end_td(self):
        self._stop_capture_and_write()
        self._write("|")

    def start_br(self, attrs):
        self._write("\n")

    def unknown_starttag(self, tag, attrs):
        """Delete all other tags except for those specified in valid_tags.

        Allowed tags are re-emitted as HTML, keeping only the attributes
        listed in ``valid_attrs``.
        """
        if tag in self.valid_tags:
            self._write(["<", tag])
            for k, v in attrs:
                if k in self.valid_attrs:
                    self._write([" ", k, "=\"", v, "\""])
            self._write(">")

    def unknown_endtag(self, tag):
        """Re-emit the end tag of an element listed in ``valid_tags``."""
        if tag in self.valid_tags:
            self._write(["</", tag, ">"])

    def _get_result(self):
        """Return everything written so far as one stripped string."""
        return "".join(self._result).strip()

    result = property(_get_result)


def html2textile(s):
    """Convert a snippet of HTML to Textile, a simple markup language. See
    http://www.textism.com/tools/textile/ for Textile's rules.

    Inline markup (bold, links, ...) is padded with a space on each side
    and paragraphs are written without a ``p.`` signature, as the examples
    below show.

    >>> html2textile("<h1>Hello world!</h1>")
    'h1. Hello world!'

    >>> html2textile("<h1>Hello <strong>world</strong>!</h1>")
    'h1. Hello *world* !'

    >>> html2textile('<h1>Hello <a href="http://www.google.com/">world</a>!</h1>')
    'h1. Hello "world":http://www.google.com/ !'

    >>> html2textile('<img src="http://www.google.com/intl/en/images/logo.gif"'
    ...              ' width="276" height="110" alt="Google logo">')
    '!http://www.google.com/intl/en/images/logo.gif!'

    >>> html2textile('<h1>Hello world!</h1><p>Welcome to my home page.</p>')
    'h1. Hello world!\\n\\nWelcome to my home page.'
    """
    parser = HtmlToTextileConvertingParser()
    parser.feed(s)
    parser.close()
    return parser.result


def _element_text(node, tag_name, default=""):
    """Return the text of the first ``tag_name`` element below ``node``.

    ``default`` is returned when the element is missing or empty, so that a
    post with e.g. an empty title does not abort the whole conversion.
    """
    elements = node.getElementsByTagName(tag_name)
    if not elements or elements[0].firstChild is None:
        return default
    return elements[0].firstChild.data


def _parse_posts(infile):
    """Parse the export file into a list with one dictionary per <item>."""
    blog = []
    for node in minidom.parse(infile).getElementsByTagName('item'):
        nicenames = [subnode.getAttribute('nicename')
                     for subnode in node.getElementsByTagName('category')]
        # wp:attachment_url could be used to download attachments
        blog.append({
            "title": _element_text(node, 'title'),
            "date": _element_text(node, 'pubDate'),
            "author": _element_text(node, 'dc:creator'),
            "id": _element_text(node, 'wp:post_id'),
            "text": _element_text(node, 'content:encoded'),
            # <category> elements without a nicename are skipped.
            "categories": [name for name in nicenames if name != ''],
        })
    return blog


def convert(infile, outdir, authorDirs, categoryDirs):
    """Convert Wordpress Export File to multiple textile files.

    Keyword arguments:
    infile -- the location of the Wordpress Export File
    outdir -- the directory where the files will be created; they go into
              a "wordpress" sub-directory, which is created if missing
    authorDirs -- if true, create different directories for each author
    categoryDirs -- if true, create directories for each category

    Side effect: the working directory of the process is changed to the
    "wordpress" output directory.

    """
    # First we parse the XML file into a list of posts.
    blog = _parse_posts(infile)

    # Then we create the directories and textile files from that list.

    # The "base" directory.  It is made absolute before chdir() so that a
    # relative outdir is not resolved a second time, against the new
    # working directory, when the file names are built.
    outdir = os.path.join(os.path.abspath(outdir), "wordpress")
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    os.chdir(outdir)

    for post in blog:
        # Sub-directory of this post, relative to the base directory.
        path = ""
        if authorDirs:
            path += post["author"].encode('utf-8') + "/"

        # This creates a path for the file in the format
        # category1/category2/category3/file, with the categories in
        # document order (they are not sorted).  The names are encoded so
        # the path stays a byte string like the UTF-8 encoded title.
        if categoryDirs:
            path += "/".join(
                [name.encode('utf-8') for name in post["categories"]])

        target_dir = os.path.join(outdir, path)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        date = time.strptime(post["date"], "%a, %d %b %Y %H:%M:%S +0000")
        file_date = time.strftime("%Y-%m-%d", date)
        post_date = time.strftime("%d %b %Y", date)

        # And finally the file itself.  The title is logged first so the
        # last title in the debug output names the post being processed
        # if the conversion fails.
        title = post["title"].encode('utf-8')
        logging.debug("title: " + title)
        # "/" would be read as a directory separator in the file name.
        slug = title.replace(" ", "-").replace("/", "-").lower()
        filename = os.path.join(
            target_dir, file_date + ' - ' + slug + '.textile')

        # Front matter and heading written before the post body.
        start = ("---\ntemplate: post\ntitle: " + title + "\n---\n\n"
                 "h1. {{ page.title }}\n\np(meta). " + post_date + "\n\n")

        # Convert the unicode object to a string that can be written to a
        # file with the proper encoding (UTF-8)
        text = post["text"].encode('utf-8').replace("\n", "\n\n")
        body = html2textile(text)

        # "with" closes the file even if a write fails.
        with open(filename, 'w') as f:
            f.write(start)
            f.write(body)

def usage(pname):
    """Displays usage information on standard output.

    keyword arguments:
    pname -- program name (e.g. obtained as argv[0])

    """
    # A single parenthesised argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("""python %s [-hac] [-o outdir] infile
    Converts a Wordpress Export File to multiple textile files.

    Options:
        -h,--help\tDisplays this information.
        -a,--authors\tCreate different directories for each author.
        -c,--categories\tCreate directory structure from post categories.
        -o,--outdir\tSpecify a directory for the output.

    Example:
    python %s -c -o ~/TEMP ~/wordpress.2008-03-20.xml
        """ % (pname, pname))


def main(argv):
    """Parse the command line in ``argv`` and run the conversion.

    keyword arguments:
    argv -- full argument list, program name first (e.g. sys.argv)

    Exits with status 2 on an invalid option and with status 3 when no
    export file is given; -h/--help prints the usage text and exits.

    """
    outdir = ""
    authors = False
    categories = False

    try:
        # Only -o/--outdir takes a value ("o:" and "outdir="); -h, -a and
        # -c are plain flags.
        opts, args = getopt.getopt(
            argv[1:], "hao:c", ["help", "authors", "outdir=", "categories"])
    except getopt.GetoptError as err:
        print(str(err))
        usage(argv[0])
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage(argv[0])
            sys.exit()
        elif opt in ("-a", "--authors"):
            authors = True
        elif opt in ("-c", "--categories"):
            categories = True
        elif opt in ("-o", "--outdir"):
            outdir = arg

    infile = "".join(args)

    if infile == "":
        print("Error: Missing Argument: missing wordpress export file.")
        usage(argv[0])
        sys.exit(3)

    if outdir == "":
        # Use the current directory
        outdir = os.getcwd()

    convert(infile, outdir, authors, categories)


# Run the conversion only when the file is executed as a script.
if __name__ == "__main__":
	main(sys.argv)
	#!/usr/bin/env python

	import sgmllib, re, time, string, os, sys, getopt, logging
	from xml.dom import minidom

	#
	# Script to transform a wordpress export file into multiple textile files
	#
	# Usage:
	# python wordpress-export-to-textile.py -o ~/output-dir/ ~/path-to-file/wordpress-export.xml
	#
	# The script will crash on unexpected unicode characters; when it does,
	# clean up the last post whose title was shown in the debug log output.
	#
	# If you've got new ideas, edits are welcomed on this gist
	#


	# Module metadata.
	__author__ = 'Luis Rei <luis.rei@gmail.com>'
	__homepage__ = 'http://luisrei.com'
	__version__ = '1.0'
	__date__ = '2008/03/23'

	# Log everything (DEBUG and above); with no handler arguments basicConfig
	# sends the output to stderr.
	logging.basicConfig(level=logging.DEBUG)

	# Matches any run of one or more whitespace characters.
	whitespace_re = re.compile(r"\s+")


	def normalise_space(s):
	    """Normalise space in the same manner as HTML.

	    Every run of consecutive whitespace characters (spaces, tabs, newlines)
	    in ``s`` is replaced with a single space character.  ``s`` is passed
	    through ``str()`` first, so non-string values are accepted.  Leading and
	    trailing whitespace is collapsed to one space, not removed.

	    """
	    return whitespace_re.sub(" ", str(s))


	def make_block_start_end_pair(tag):
	    """Build the start/end handler pair for a Textile block element.

	    The two returned functions are meant to be bound as ``start_*`` and
	    ``end_*`` methods of the parser.  The start handler writes the block
	    signature (``"<tag>. "``) and opens a capture for ``tag``; the end
	    handler closes the capture and writes a blank line.

	    """
	    def start_t(self, attrs):
	        self._write("%s. " % tag)
	        self._start_capture(tag)

	    def end_t(self):
	        self._stop_capture_and_write()
	        self._write("\n\n")

	    return start_t, end_t


	def make_quicktag_start_end_pair(tag, wrapchar):
	    """Build the start/end handler pair for an inline Textile element.

	    The start handler writes a space followed by ``wrapchar`` and opens a
	    capture for ``tag``; the end handler closes the capture and writes
	    ``wrapchar`` followed by a space.

	    """
	    def start_t(self, attrs):
	        self._write([" ", wrapchar])
	        self._start_capture(tag)

	    def end_t(self):
	        self._stop_capture_and_write()
	        self._write([wrapchar, " "])

	    return start_t, end_t


	class HtmlToTextileConvertingParser(sgmllib.SGMLParser):
	    """An SGML parser class which converts HTML tags into Textile markup.

	    Feed HTML with ``feed()``, call ``close()`` and read ``result``.  Tags
	    without a ``start_*`` handler are dropped (their text is kept) unless
	    they are listed in ``valid_tags``.

	    Output is collected as byte strings.  Character and entity references
	    are written UTF-8 encoded, which matches the UTF-8 encoded text that
	    ``convert()`` feeds to the parser.

	    """
	    # Tags re-emitted as literal HTML by unknown_starttag/unknown_endtag.
	    valid_tags = ()
	    # Attributes kept on those re-emitted tags.
	    valid_attrs = ()
	    block_tags = ("h1", "h2", "h3", "h4", "h5", "h6", "h7", "p", "bq")

	    # Imported in the class body so both tables are reachable as attributes
	    # of the parser; name2codepoint is what handle_entityref() uses.
	    from htmlentitydefs import entitydefs, name2codepoint

	    def __init__(self):
	        sgmllib.SGMLParser.__init__(self)
	        self._result = []       # finished output fragments
	        self._data_stack = []   # one list per currently open capture
	        self._in_block = self._in_ul = self._in_ol = False
	        self.a_href = None      # href of the link being captured, if any

	    def handle_data(self, data):
	        """Write text content with whitespace collapsed and edges stripped."""
	        if data:
	            self._write(normalise_space(data).strip())

	    def handle_charref(self, tag):
	        """Write the character for a numeric reference such as ``&#8217;``.

	        References that do not name a valid character are dropped.
	        """
	        try:
	            char = unichr(int(tag))
	        except (ValueError, OverflowError):
	            return
	        # Encode explicitly: _write() calls str(), which raises
	        # UnicodeEncodeError for a non-ASCII unicode character.
	        self._write(char.encode("utf-8"))

	    def handle_entityref(self, tag):
	        """Write the character for a named reference such as ``&amp;``.

	        Unknown entity names are dropped.  The character is written UTF-8
	        encoded rather than as the Latin-1 byte or ``&#NNNN;`` text stored
	        in ``entitydefs``, so it fits the rest of the output.
	        """
	        codepoint = self.name2codepoint.get(tag)
	        if codepoint is not None:
	            self._write(unichr(codepoint).encode("utf-8"))

	    def handle_starttag(self, tag, method, attrs):
	        """Call the ``start_*`` handler with the attributes as a dict."""
	        method(dict(attrs))

	    def _write(self, d):
	        """Append ``d`` (a string, or a list/tuple of strings) to the output.

	        While fewer than two captures are open the text goes straight to
	        the result; from the second nested capture on it is buffered in the
	        innermost capture.
	        """
	        if len(self._data_stack) < 2:
	            target = self._result
	        else:
	            target = self._data_stack[-1]
	        if isinstance(d, (list, tuple)):
	            target += d
	        else:
	            target.append(str(d))

	    def _start_capture(self, tag):
	        """Open a new capture buffer on behalf of ``tag``."""
	        self._in_block = tag
	        self._data_stack.append([])

	    def _stop_capture_and_write(self):
	        """Close the innermost capture and write out what it collected."""
	        self._in_block = False
	        # pop() runs before _write() chooses its target, so the captured
	        # text lands one level up (or in the result).
	        self._write(self._data_stack.pop())

	    start_h1, end_h1 = make_block_start_end_pair("h1")
	    start_h2, end_h2 = make_block_start_end_pair("h2")
	    start_h3, end_h3 = make_block_start_end_pair("h3")
	    start_h4, end_h4 = make_block_start_end_pair("h4")
	    start_h5, end_h5 = make_block_start_end_pair("h5")
	    start_h6, end_h6 = make_block_start_end_pair("h6")
	    start_h7, end_h7 = make_block_start_end_pair("h7")
	    start_blockquote, end_blockquote = make_block_start_end_pair("bq")

	    start_b, end_b = make_quicktag_start_end_pair("b", "*")
	    start_strong, end_strong = make_quicktag_start_end_pair("strong", "*")
	    start_i, end_i = make_quicktag_start_end_pair("i", "_")
	    start_em, end_em = make_quicktag_start_end_pair("em", "_")
	    start_cite, end_cite = make_quicktag_start_end_pair("cite", "??")
	    start_s, end_s = make_quicktag_start_end_pair("s", "-")
	    start_sup, end_sup = make_quicktag_start_end_pair("sup", "^")
	    start_sub, end_sub = make_quicktag_start_end_pair("sub", "~")

	    # Paragraphs are written without a "p. " signature, only followed by a
	    # blank line.
	    def start_p(self, attrs):
	        self._start_capture("p")

	    def end_p(self):
	        self._stop_capture_and_write()
	        self._write("\n\n")

	    def start_ol(self, attrs):
	        self._in_ol = True

	    def end_ol(self):
	        self._in_ol = False
	        self._write("\n")

	    def start_ul(self, attrs):
	        self._in_ul = True

	    def end_ul(self):
	        self._in_ul = False
	        self._write("\n")

	    def start_li(self, attrs):
	        """Write the list marker: ``#`` inside <ol>, ``*`` otherwise."""
	        if self._in_ol:
	            self._write("# ")
	        else:
	            self._write("* ")
	        self._start_capture("li")

	    def end_li(self):
	        self._stop_capture_and_write()
	        self._write("\n")

	    def start_a(self, attrs):
	        """Start a Textile link; anchors without an href are ignored."""
	        self.a_href = attrs.get("href")
	        if self.a_href:
	            self._write(" \"")
	            self._start_capture("a")

	    def end_a(self):
	        """Finish the link as ``"text":href`` if start_a() opened one."""
	        if self.a_href:
	            self._stop_capture_and_write()
	            self._write(["\":", self.a_href, " "])
	            self.a_href = False

	    def start_img(self, attrs):
	        """Write ``!src!`` for images that have a src attribute."""
	        if attrs.get("src"):
	            self._write([" !", attrs["src"], "! "])

	    def end_img(self):
	        pass

	    def start_tr(self, attrs):
	        pass

	    def end_tr(self):
	        self._write("|\n")

	    def start_td(self, attrs):
	        self._write("|")
	        self._start_capture("td")

	    def end_td(self):
	        self._stop_capture_and_write()
	        self._write("|")

	    def start_br(self, attrs):
	        self._write("\n")

	    def unknown_starttag(self, tag, attrs):
	        """Delete all other tags except for those specified in valid_tags.

	        Allowed tags are re-emitted as HTML, keeping only the attributes
	        listed in ``valid_attrs``.
	        """
	        if tag in self.valid_tags:
	            self._write(["<", tag])
	            for k, v in attrs:
	                if k in self.valid_attrs:
	                    self._write([" ", k, "=\"", v, "\""])
	            self._write(">")

	    def unknown_endtag(self, tag):
	        """Re-emit the end tag of an element listed in ``valid_tags``."""
	        if tag in self.valid_tags:
	            self._write(["</", tag, ">"])

	    def _get_result(self):
	        """Return everything written so far as one stripped string."""
	        return "".join(self._result).strip()

	    result = property(_get_result)


	def html2textile(s):
	    """Convert a snippet of HTML to Textile, a simple markup language. See
	    http://www.textism.com/tools/textile/ for Textile's rules.

	    Inline markup (bold, links, ...) is padded with a space on each side
	    and paragraphs are written without a ``p.`` signature, as the examples
	    below show.

	    >>> html2textile("<h1>Hello world!</h1>")
	    'h1. Hello world!'

	    >>> html2textile("<h1>Hello <strong>world</strong>!</h1>")
	    'h1. Hello *world* !'

	    >>> html2textile('<h1>Hello <a href="http://www.google.com/">world</a>!</h1>')
	    'h1. Hello "world":http://www.google.com/ !'

	    >>> html2textile('<img src="http://www.google.com/intl/en/images/logo.gif"'
	    ...              ' width="276" height="110" alt="Google logo">')
	    '!http://www.google.com/intl/en/images/logo.gif!'

	    >>> html2textile('<h1>Hello world!</h1><p>Welcome to my home page.</p>')
	    'h1. Hello world!\\n\\nWelcome to my home page.'
	    """
	    parser = HtmlToTextileConvertingParser()
	    parser.feed(s)
	    parser.close()
	    return parser.result


	def _element_text(node, tag_name, default=""):
	    """Return the text of the first ``tag_name`` element below ``node``.

	    ``default`` is returned when the element is missing or empty, so that a
	    post with e.g. an empty title does not abort the whole conversion.
	    """
	    elements = node.getElementsByTagName(tag_name)
	    if not elements or elements[0].firstChild is None:
	        return default
	    return elements[0].firstChild.data


	def _parse_posts(infile):
	    """Parse the export file into a list with one dictionary per <item>."""
	    blog = []
	    for node in minidom.parse(infile).getElementsByTagName('item'):
	        nicenames = [subnode.getAttribute('nicename')
	                     for subnode in node.getElementsByTagName('category')]
	        # wp:attachment_url could be used to download attachments
	        blog.append({
	            "title": _element_text(node, 'title'),
	            "date": _element_text(node, 'pubDate'),
	            "author": _element_text(node, 'dc:creator'),
	            "id": _element_text(node, 'wp:post_id'),
	            "text": _element_text(node, 'content:encoded'),
	            # <category> elements without a nicename are skipped.
	            "categories": [name for name in nicenames if name != ''],
	        })
	    return blog


	def convert(infile, outdir, authorDirs, categoryDirs):
	    """Convert Wordpress Export File to multiple textile files.

	    Keyword arguments:
	    infile -- the location of the Wordpress Export File
	    outdir -- the directory where the files will be created; they go into
	              a "wordpress" sub-directory, which is created if missing
	    authorDirs -- if true, create different directories for each author
	    categoryDirs -- if true, create directories for each category

	    Side effect: the working directory of the process is changed to the
	    "wordpress" output directory.

	    """
	    # First we parse the XML file into a list of posts.
	    blog = _parse_posts(infile)

	    # Then we create the directories and textile files from that list.

	    # The "base" directory.  It is made absolute before chdir() so that a
	    # relative outdir is not resolved a second time, against the new
	    # working directory, when the file names are built.
	    outdir = os.path.join(os.path.abspath(outdir), "wordpress")
	    if not os.path.exists(outdir):
	        os.makedirs(outdir)
	    os.chdir(outdir)

	    for post in blog:
	        # Sub-directory of this post, relative to the base directory.
	        path = ""
	        if authorDirs:
	            path += post["author"].encode('utf-8') + "/"

	        # This creates a path for the file in the format
	        # category1/category2/category3/file, with the categories in
	        # document order (they are not sorted).  The names are encoded so
	        # the path stays a byte string like the UTF-8 encoded title.
	        if categoryDirs:
	            path += "/".join(
	                [name.encode('utf-8') for name in post["categories"]])

	        target_dir = os.path.join(outdir, path)
	        if not os.path.exists(target_dir):
	            os.makedirs(target_dir)

	        date = time.strptime(post["date"], "%a, %d %b %Y %H:%M:%S +0000")
	        file_date = time.strftime("%Y-%m-%d", date)
	        post_date = time.strftime("%d %b %Y", date)

	        # And finally the file itself.  The title is logged first so the
	        # last title in the debug output names the post being processed
	        # if the conversion fails.
	        title = post["title"].encode('utf-8')
	        logging.debug("title: " + title)
	        # "/" would be read as a directory separator in the file name.
	        slug = title.replace(" ", "-").replace("/", "-").lower()
	        filename = os.path.join(
	            target_dir, file_date + ' - ' + slug + '.textile')

	        # Front matter and heading written before the post body.
	        start = ("---\ntemplate: post\ntitle: " + title + "\n---\n\n"
	                 "h1. {{ page.title }}\n\np(meta). " + post_date + "\n\n")

	        # Convert the unicode object to a string that can be written to a
	        # file with the proper encoding (UTF-8)
	        text = post["text"].encode('utf-8').replace("\n", "\n\n")
	        body = html2textile(text)

	        # "with" closes the file even if a write fails.
	        with open(filename, 'w') as f:
	            f.write(start)
	            f.write(body)

	def usage(pname):
	    """Displays usage information on standard output.

	    keyword arguments:
	    pname -- program name (e.g. obtained as argv[0])

	    """
	    # A single parenthesised argument prints the same text under the
	    # Python 2 print statement and the Python 3 print function.
	    print("""python %s [-hac] [-o outdir] infile
	    Converts a Wordpress Export File to multiple textile files.

	    Options:
	        -h,--help\tDisplays this information.
	        -a,--authors\tCreate different directories for each author.
	        -c,--categories\tCreate directory structure from post categories.
	        -o,--outdir\tSpecify a directory for the output.

	    Example:
	    python %s -c -o ~/TEMP ~/wordpress.2008-03-20.xml
	        """ % (pname, pname))


	def main(argv):
	    """Parse the command line in ``argv`` and run the conversion.

	    keyword arguments:
	    argv -- full argument list, program name first (e.g. sys.argv)

	    Exits with status 2 on an invalid option and with status 3 when no
	    export file is given; -h/--help prints the usage text and exits.

	    """
	    outdir = ""
	    authors = False
	    categories = False

	    try:
	        # Only -o/--outdir takes a value ("o:" and "outdir="); -h, -a and
	        # -c are plain flags.
	        opts, args = getopt.getopt(
	            argv[1:], "hao:c", ["help", "authors", "outdir=", "categories"])
	    except getopt.GetoptError as err:
	        print(str(err))
	        usage(argv[0])
	        sys.exit(2)

	    for opt, arg in opts:
	        if opt in ("-h", "--help"):
	            usage(argv[0])
	            sys.exit()
	        elif opt in ("-a", "--authors"):
	            authors = True
	        elif opt in ("-c", "--categories"):
	            categories = True
	        elif opt in ("-o", "--outdir"):
	            outdir = arg

	    infile = "".join(args)

	    if infile == "":
	        print("Error: Missing Argument: missing wordpress export file.")
	        usage(argv[0])
	        sys.exit(3)

	    if outdir == "":
	        # Use the current directory
	        outdir = os.getcwd()

	    convert(infile, outdir, authors, categories)


	# NOTE(review): this section repeats the definitions and the entry point
	# that already appear earlier in the file, so the conversion would run a
	# second time here; consider removing the duplicated section.
	if __name__ == "__main__":
	    main(sys.argv)