Created
November 5, 2015 02:46
-
-
Save robgjansen/d1d0bfce08b221a74b37 to your computer and use it in GitHub Desktop.
Converts an XML dump from Blogger into posts for Octopress. Title, published date, updated date, and content are kept; everything else, including comments, is ignored.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys, os | |
from lxml import etree | |
'''
USAGE: python blogger2octopress.py blogger_dump.xml
Then check the _drafts and _posts folders.
Converts an XML dump from Blogger into posts for Octopress.
Title, published date, updated date, and content are kept;
everything else, including comments, is ignored.
'''
def _clean_title(raw_title):
    """Return *raw_title* reduced to letters, digits, '-' and single spaces.

    Falls back to "Untitled" when the title is missing or when nothing
    survives the filtering, so the result is never empty.
    """
    kept = u''.join(
        ch for ch in (raw_title or u'')
        if ch == u'-' or ch.isspace() or ch.isalnum()
    )
    # Collapse whitespace runs (tabs/newlines included) to single spaces so
    # the title stays on one line inside the YAML front matter.
    return u' '.join(kept.split()) or u'Untitled'


def main():
    """Convert a Blogger XML export into Octopress post files.

    Parses the Atom file named by ``sys.argv[1]``.  For every entry that has
    a category term containing ``kind#post`` an HTML file with YAML front
    matter (layout, title, date, updated, blog-id, post-id) followed by the
    post content is written to ``_posts``, or to ``_drafts`` when the entry's
    ``app:control/app:draft`` element reads ``yes``.  Comment, page, settings
    and template entries are skipped, and label terms are not exported.

    Raises:
        SystemExit: if no input file was given on the command line.
        Exception: if an entry carries an unrecognised ``kind#`` term.
        IndexError: if a post entry lacks an id, title, published, updated
            or content element, or its id cannot be split as expected.
    """
    if len(sys.argv) < 2:
        sys.exit("USAGE: python blogger2octopress.py blogger_dump.xml")

    tree = etree.parse(sys.argv[1])
    ns = {"atom": "http://www.w3.org/2005/Atom", "app": "http://purl.org/atom/app#"}
    ignored_kinds = ('kind#comment', 'kind#page', 'kind#settings', 'kind#template')
    # File names produced during this run; used to keep two posts that map
    # to the same name from silently overwriting each other.
    written = set()

    for entry in tree.xpath('//atom:entry', namespaces=ns):
        is_post = False
        for term in entry.xpath('.//atom:category/@term', namespaces=ns):
            if 'kind#post' in term:
                is_post = True
            elif any(kind in term for kind in ignored_kinds):
                continue
            elif 'kind#' in term:
                raise Exception("Whooops! unkown type of entry: " + term)
            # Any other term is a plain label; labels are not exported.
        if not is_post:
            continue

        entry_id = entry.xpath('.//atom:id', namespaces=ns)[0].text
        print(entry_id)
        raw_title = entry.xpath('.//atom:title', namespaces=ns)[0].text
        published = entry.xpath('.//atom:published', namespaces=ns)[0].text
        updated = entry.xpath('.//atom:updated', namespaces=ns)[0].text
        # An empty <content/> element has text None; write an empty body.
        content = entry.xpath('.//atom:content', namespaces=ns)[0].text or u''

        draft = entry.xpath('.//app:control/app:draft', namespaces=ns)
        is_draft = bool(draft) and draft[0].text == 'yes'

        # The split chain expects an id shaped "<a>,<b>:<blogid>.<postid>"
        # (presumably "tag:blogger.com,1999:blog-N.post-M" -- confirm
        # against a real export).
        ids = entry_id.split(',')[1].split(':')[1].split('.')
        blogid, postid = ids[0], ids[1]

        title = _clean_title(raw_title)
        prefix = "_drafts" if is_draft else "_posts"
        slug = u'-'.join(title.split()[:6])  # at most six words in the name
        date_part = published.split('T')[0]
        filename = u"{0}/{1}-{2}.html".format(prefix, date_part, slug)
        if filename in written:
            # Same date and same leading title words as an earlier post:
            # disambiguate with the post id instead of overwriting.
            filename = u"{0}/{1}-{2}-{3}.html".format(prefix, date_part, slug, postid)
        written.add(filename)

        if not os.path.isdir(prefix):
            os.makedirs(prefix)

        lines = [
            u"---",
            u"layout: post",
            u"title: '{0}'".format(title),
            u"date: {0}".format(published),
            u"updated: {0}".format(updated),
            u"blog-id: {0}".format(blogid),
            u"post-id: {0}".format(postid),
            u"---",
            content,
        ]
        # Build the whole file as text and encode once at the I/O boundary,
        # so non-ASCII titles and content are written as UTF-8 instead of
        # tripping an implicit ASCII conversion.
        with open(filename, 'wb') as out:
            out.write((u"\n".join(lines) + u"\n").encode('utf-8'))
# Run the conversion only when this file is executed as a script; the value
# returned by main() is passed to sys.exit() as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment