robgjansen/blogger2octopress.py

## blogger2octopress.py
#!/usr/bin/env python

import sys, os
from lxml import etree

'''
USAGE: python blogger2octopress.py blogger_dump.xml

Then check the _drafts and _posts folders.

Converts an XML dump from blogger into posts for Octopress.
Title, published date, updated date, and content are kept;
everything else, including comments, is ignored.
'''


def main():
    """Convert a Blogger XML export into Octopress post files.

    The export path is taken from ``sys.argv[1]``.  Every Atom ``entry``
    whose category terms mark it as ``kind#post`` is written to
    ``_posts/`` (or ``_drafts/`` when its ``app:draft`` element is ``yes``)
    as ``<published date>-<up to six title words>.html``: a front-matter
    header (layout, title, date, updated, blog-id, post-id) followed by
    the entry content, encoded as UTF-8.  Entries that are not posts
    (comments, pages, settings, templates) produce no output.

    Raises:
        SystemExit: if no input file was given on the command line.
        ValueError: if an entry has a ``kind#`` term that is not one of
            post, comment, page, settings or template.
    """
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit("USAGE: python blogger2octopress.py blogger_dump.xml")

    ns = {"atom": "http://www.w3.org/2005/Atom", "app": "http://purl.org/atom/app#"}
    skipped_kinds = ('kind#comment', 'kind#page', 'kind#settings', 'kind#template')

    def first_text(entry, tag):
        """Return the text of the first ``atom:<tag>`` descendant of *entry*."""
        return entry.xpath('.//atom:' + tag, namespaces=ns)[0].text

    tree = etree.parse(sys.argv[1])
    for entry in tree.xpath('//atom:entry', namespaces=ns):
        is_post = False
        for term in entry.xpath('.//atom:category/@term', namespaces=ns):
            if 'kind#post' in term:
                is_post = True
            elif any(kind in term for kind in skipped_kinds):
                continue
            elif 'kind#' in term:
                raise ValueError("Whooops! unkown type of entry: " + term)
            # Any other term is a plain label; labels are not exported.
        if not is_post:
            continue

        entry_id = first_text(entry, 'id')
        print(entry_id)

        # The id is split as "<x>,<y>:<blog>.<post>" -- presumably Blogger's
        # "tag:blogger.com,1999:blog-N.post-M" form; verify against a real dump.
        ids = entry_id.split(',')[1].split(':')[1].split('.')
        blogid, postid = ids[0], ids[1]

        # Keep only characters that are safe both inside the single-quoted
        # front-matter title and inside a file name.
        raw_title = first_text(entry, 'title') or u"Untitled"
        title = u''.join(
            c for c in raw_title if c == '-' or c.isspace() or c.isalnum())
        if not title.split():
            # A title made only of punctuation would give an empty slug.
            title = u"Untitled"

        published = first_text(entry, 'published')
        updated = first_text(entry, 'updated')
        # An entry with an empty body has no text node at all.
        content = first_text(entry, 'content') or u""

        drafts = entry.xpath(".//app:control/app:draft", namespaces=ns)
        is_draft = bool(drafts) and drafts[0].text == 'yes'

        prefix = "_drafts" if is_draft else "_posts"
        slug = u'-'.join(title.split()[:6])
        date_part = published.split('T')[0]
        filename = u"{0}/{1}-{2}.html".format(prefix, date_part, slug)

        # Build the whole file as text and encode once; formatting with
        # unicode templates avoids implicit ASCII encoding of non-ASCII titles.
        document = u"\n".join([
            u"---",
            u"layout: post",
            u"title: '{0}'".format(title),
            u"date: {0}".format(published),
            u"updated: {0}".format(updated),
            u"blog-id: {0}".format(blogid),
            u"post-id: {0}".format(postid),
            u"---",
            content,
        ]) + u"\n"

        if not os.path.exists(prefix):
            os.mkdir(prefix)
        with open(filename, 'wb') as out:
            out.write(document.encode('utf-8'))

# Run the conversion only when executed as a script; whatever main()
# returns is handed to sys.exit() as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
	#!/usr/bin/env python

	import sys, os
	from lxml import etree

	'''
	USAGE: python blogger2octopress.py blogger_dump.xml

	Then check the _drafts and _posts folders.

	Converts an XML dump from blogger into posts for Octopress.
	Title, published date, updated date, and content are kept;
	everything else, including comments, is ignored.
	'''


	def main():
	    """Convert a Blogger XML export into Octopress post files.

	    The export path is taken from ``sys.argv[1]``.  Every Atom ``entry``
	    whose category terms mark it as ``kind#post`` is written to
	    ``_posts/`` (or ``_drafts/`` when its ``app:draft`` element is ``yes``)
	    as ``<published date>-<up to six title words>.html``: a front-matter
	    header (layout, title, date, updated, blog-id, post-id) followed by
	    the entry content, encoded as UTF-8.  Entries that are not posts
	    (comments, pages, settings, templates) produce no output.

	    Raises:
	        SystemExit: if no input file was given on the command line.
	        ValueError: if an entry has a ``kind#`` term that is not one of
	            post, comment, page, settings or template.
	    """
	    if len(sys.argv) < 2:
	        # Exit with a usage hint instead of an IndexError traceback.
	        sys.exit("USAGE: python blogger2octopress.py blogger_dump.xml")

	    ns = {"atom": "http://www.w3.org/2005/Atom", "app": "http://purl.org/atom/app#"}
	    skipped_kinds = ('kind#comment', 'kind#page', 'kind#settings', 'kind#template')

	    def first_text(entry, tag):
	        """Return the text of the first ``atom:<tag>`` descendant of *entry*."""
	        return entry.xpath('.//atom:' + tag, namespaces=ns)[0].text

	    tree = etree.parse(sys.argv[1])
	    for entry in tree.xpath('//atom:entry', namespaces=ns):
	        is_post = False
	        for term in entry.xpath('.//atom:category/@term', namespaces=ns):
	            if 'kind#post' in term:
	                is_post = True
	            elif any(kind in term for kind in skipped_kinds):
	                continue
	            elif 'kind#' in term:
	                raise ValueError("Whooops! unkown type of entry: " + term)
	            # Any other term is a plain label; labels are not exported.
	        if not is_post:
	            continue

	        entry_id = first_text(entry, 'id')
	        print(entry_id)

	        # The id is split as "<x>,<y>:<blog>.<post>" -- presumably Blogger's
	        # "tag:blogger.com,1999:blog-N.post-M" form; verify against a real dump.
	        ids = entry_id.split(',')[1].split(':')[1].split('.')
	        blogid, postid = ids[0], ids[1]

	        # Keep only characters that are safe both inside the single-quoted
	        # front-matter title and inside a file name.
	        raw_title = first_text(entry, 'title') or u"Untitled"
	        title = u''.join(
	            c for c in raw_title if c == '-' or c.isspace() or c.isalnum())
	        if not title.split():
	            # A title made only of punctuation would give an empty slug.
	            title = u"Untitled"

	        published = first_text(entry, 'published')
	        updated = first_text(entry, 'updated')
	        # An entry with an empty body has no text node at all.
	        content = first_text(entry, 'content') or u""

	        drafts = entry.xpath(".//app:control/app:draft", namespaces=ns)
	        is_draft = bool(drafts) and drafts[0].text == 'yes'

	        prefix = "_drafts" if is_draft else "_posts"
	        slug = u'-'.join(title.split()[:6])
	        date_part = published.split('T')[0]
	        filename = u"{0}/{1}-{2}.html".format(prefix, date_part, slug)

	        # Build the whole file as text and encode once; formatting with
	        # unicode templates avoids implicit ASCII encoding of non-ASCII titles.
	        document = u"\n".join([
	            u"---",
	            u"layout: post",
	            u"title: '{0}'".format(title),
	            u"date: {0}".format(published),
	            u"updated: {0}".format(updated),
	            u"blog-id: {0}".format(blogid),
	            u"post-id: {0}".format(postid),
	            u"---",
	            content,
	        ]) + u"\n"

	        if not os.path.exists(prefix):
	            os.mkdir(prefix)
	        with open(filename, 'wb') as out:
	            out.write(document.encode('utf-8'))

	# Run the conversion only when executed as a script; whatever main()
	# returns is handed to sys.exit() as the process exit status.
	if __name__ == "__main__":
	    sys.exit(main())