Skip to content

Instantly share code, notes, and snippets.

@robgjansen
Created November 5, 2015 02:46
Show Gist options
  • Save robgjansen/d1d0bfce08b221a74b37 to your computer and use it in GitHub Desktop.
Save robgjansen/d1d0bfce08b221a74b37 to your computer and use it in GitHub Desktop.
Converts an XML dump from Blogger into posts for Octopress. Title, published date, updated date, and content are kept; everything else, including comments, is ignored.
#!/usr/bin/env python
import sys, os
from lxml import etree
'''
USAGE: python blogger2octopress.py blogger_dump.xml
Then check the _drafts and _posts folders.
Converts an XML dump from blogger into posts for Octopress.
Title, published date, updated date, and content are kept;
everything else, including comments, is ignored.
'''
# Entry kinds that are recognised but deliberately not converted.
_IGNORED_KINDS = ('kind#comment', 'kind#page', 'kind#settings', 'kind#template')


def _is_post(entry, ns):
    """Return True when *entry* has a category term containing ``kind#post``.

    Every category term is inspected (no early exit) so that an unrecognised
    ``kind#`` term is always reported, even after a post marker was seen.
    Terms without ``kind#`` are ordinary labels and are ignored.

    Raises:
        Exception: if a term contains ``kind#`` but is not a known kind.
    """
    found_post = False
    for category in entry.xpath('.//atom:category', namespaces=ns):
        for term in category.xpath('@term', namespaces=ns):
            if 'kind#post' in term:
                found_post = True
            elif any(kind in term for kind in _IGNORED_KINDS):
                continue
            elif 'kind#' in term:
                raise Exception("Whooops! unkown type of entry: " + term)
    return found_post


def _clean_title(raw_title):
    """Reduce a post title to letters, digits, hyphens and single spaces.

    A missing title, or one that has nothing left after filtering, becomes
    "Untitled" so the output file name never ends in a bare "-.html".
    """
    if raw_title is None:
        raw_title = "Untitled"
    kept = ''.join(ch for ch in raw_title
                   if ch == '-' or ch.isspace() or ch.isalnum())
    # Collapse whitespace runs so a newline or tab inside the title cannot
    # spread the quoted YAML value over several lines.
    return ' '.join(kept.split()) or "Untitled"


def main():
    """Convert the Blogger XML dump named by sys.argv[1] into post files.

    For every Atom entry marked ``kind#post`` one HTML file with a YAML front
    matter header (layout, title, date, updated, blog-id, post-id) followed by
    the post content is written to ``_drafts/`` (when the entry's
    ``app:draft`` is "yes") or ``_posts/`` otherwise. The file name is the
    published date plus up to six words of the cleaned title. Each converted
    entry id is printed to stdout.

    Returns:
        0 on success, 2 when no input file was given on the command line.
    """
    if len(sys.argv) < 2:
        sys.stderr.write("USAGE: python blogger2octopress.py blogger_dump.xml\n")
        return 2

    tree = etree.parse(sys.argv[1])
    ns = {"atom": "http://www.w3.org/2005/Atom", "app": "http://purl.org/atom/app#"}

    for entry in tree.xpath('//atom:entry', namespaces=ns):
        if not _is_post(entry, ns):
            continue

        entry_id = entry.xpath('.//atom:id', namespaces=ns)[0].text
        print(entry_id)
        title = _clean_title(entry.xpath('.//atom:title', namespaces=ns)[0].text)
        published = entry.xpath('.//atom:published', namespaces=ns)[0].text
        updated = entry.xpath('.//atom:updated', namespaces=ns)[0].text
        # An empty <content/> element has text None; write an empty body then.
        content = entry.xpath('.//atom:content', namespaces=ns)[0].text or u""

        draft = entry.xpath(".//app:control/app:draft", namespaces=ns)
        is_draft = bool(draft) and draft[0].text == 'yes'

        # The id is split on ',', ':' and '.'; the last two pieces are taken
        # as the blog and post identifiers.
        ids = entry_id.split(',')[1].split(':')[1].split('.')
        blogid, postid = ids[0], ids[1]

        prefix = "_drafts" if is_draft else "_posts"
        date_part = published.split('T')[0]
        name_parts = title.split()[:6]
        # Unicode templates keep non-ASCII titles from failing in format().
        filename = u"{0}/{1}-{2}.html".format(prefix, date_part, '-'.join(name_parts))
        if not os.path.exists(prefix):
            os.mkdir(prefix)

        document = u"\n".join([
            u"---",
            u"layout: post",
            u"title: '{0}'".format(title),
            u"date: {0}".format(published),
            u"updated: {0}".format(updated),
            u"blog-id: {0}".format(blogid),
            u"post-id: {0}".format(postid),
            u"---",
            content,
            u"",
        ])
        # Encode once at the I/O boundary; the file is opened in binary mode
        # so the same code runs under Python 2 and Python 3.
        with open(filename, 'wb') as out:
            out.write(document.encode('utf-8'))
    return 0
# Run the conversion only when executed as a script; whatever main() returns
# is handed to sys.exit() as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment