Skip to content

Instantly share code, notes, and snippets.

@ronjouch
Forked from Reiot/wordpress2octopress.py
Created April 15, 2012 19:02
Show Gist options
  • Save ronjouch/2394347 to your computer and use it in GitHub Desktop.
Save ronjouch/2394347 to your computer and use it in GitHub Desktop.
Blogger XML export to Chisel Markdown migration script
# -*- coding: utf-8 -*-
#!/usr/bin/env python
import sys
import os
import re
import yaml
import urllib
import codecs
from datetime import datetime
from BeautifulSoup import BeautifulStoneSoup, Comment
# --- Script configuration -------------------------------------------------

# When True, parse_item appends every post to the single LOGFILE instead of
# writing one Markdown file per post.
DEBUG = False

# Export file that the __main__ section opens and parses.
XML = "blog-pretty6.xml"

# NOTE(review): EXPORT_ROOT and MARKDOWN_FORMAT are not referenced by the code
# visible in this file (parse_item spells out an identical filename pattern
# inline); they are kept in case other code relies on them.
EXPORT_ROOT = 'source/'
MARKDOWN_FORMAT = '%04d-%02d-%02d-%s.md'

# Destination of the combined output in DEBUG mode.
LOGFILE = "log.md"

# Post-meta keys that parse_item leaves out of the generated header.
EXCLUDE_METAS = [u'_edit_last', u'superawesome', u'delicious', u'_wp_page_template']

# Category names that parse_item leaves out of the generated header.
EXCLUDE_CATEGORIES = [u'Uncategorized']

# Tag names that parse_item leaves out of the generated header (none so far).
EXCLUDE_TAGS = []
def to_markdown(txt):
    """Convert an HTML post body into Markdown-style text.

    The conversion is a fixed list of regular-expression substitutions applied
    in three passes, in this order:

    1. decode a handful of HTML entities and escape literal asterisks,
    2. turn ``<img>`` tags into Markdown image syntax,
    3. rewrite or strip the remaining tags, Blogger-specific wrappers and
       ``[code]`` markers.

    Rule order matters inside each pass: later rules see the output of the
    earlier ones.

    Args:
        txt: the HTML fragment to convert.

    Returns:
        The converted text. Markup that no rule recognises is left untouched.
    """
    entity_rules = [
        (r'&lt;', '<'),
        (r'&gt;', '>'),
        (r'&nbsp;', ' '),
        (r'&quot;', '"'),
        (r'&#\d+;', ''),  # numeric entities are dropped, not decoded
        # Decoded after every other entity so that a doubly escaped
        # "&amp;lt;" ends up as the literal text "&lt;" rather than "<".
        (r'&amp;', '&'),
        # Escape asterisks of the source text before the tag pass introduces
        # its own ``*`` / ``**`` emphasis markers.
        (r'\*', r'\\*'),
    ]
    image_rules = [
        # Image with alt text; whitespace between the src and alt attributes
        # is optional.
        (r'<img(.+?)src="([^"]+)"\s*alt="([^"]+)"[^>]*?/>', r'![\3](\2)'),
        # Any other self-closing image.
        (r'<img(.+?)src="([^"]+)"[^>]*?/>', r'![](\2)'),
    ]
    tag_rules = [
        (r'</?strong>', '**'),
        (r'</?b>', '**'),
        (r'</?em>', '*'),
        (r'</?i>', '*'),
        (r'<h1>', '# '),
        (r'<h2>', '## '),
        (r'<h3>', '### '),
        (r'<h4>', '#### '),
        (r'<h5>', '##### '),
        (r'<h6>', '###### '),
        (r'</h\d>', '\n'),
        # <p ...> and </p> only: the tag name must be followed by whitespace
        # or ">", so <pre> is left for the code-fence rules further down.
        (r'</?p(?:\s[^>]*)?>', '\n'),
        (r'</?span[^>]*>', ''),  # <span class=...>..</span>
        (r'<br\s*/?>', ' \n'),
        (r' {3,}', ' '),
        (r'<a.+?href="([^"]+)"[^>]*>([^<]+)</a>', r'[\2](\1)'),
        (r'<li>', '\n- '),
        (r'</li>', ''),
        (r'</?ul>', '\n'),
        (r'</?ol>', '\n'),
        (r'\n{3,}', '\n\n'),  # collapse any run of 3+ newlines
        (r'</?u>', ''),
        # Remove custom tags found in the Blogger XML.
        (r'<a href="http://picasa.google.com/blogger/".*?</a>', ''),
        (r'<div class="blogger-post-footer">.+</div>', ''),
        (r'<div class="separator".*>(.*)</div>', r'\1'),
        # [\s\S] matches across newlines, like the alternation "(.|\n)" would,
        # without needing a nested group.
        (r'<div xmlns.*>([\s\S]*)</div>', r'\1'),
        # Liquid tag conflict: wrap "{% ... %}" so it is emitted literally
        # ("{{ }}" itself still has to be fixed by hand).
        (r'{%(.+?)%}', r'{{ "{% \1 "}} %}'),
        # Code blocks become fenced blocks. The [sourcecode] shortcode, bare
        # <code> and <iframe> are not converted.
        (r'\[code\]', r'```\n'),
        (r'\[/code\]', r'```'),
        (r'<pre><code>', r'```\n'),
        (r'</code></pre>', r'```'),
        (r'<pre>', r'```\n'),
        (r'</?pre>', r'```'),
        (r'<blockquote.*?>', r'\n<quote>'),
        (r'</blockquote>', r'</quote>\n\n'),
    ]
    for rules in (entity_rules, image_rules, tag_rules):
        for pattern, replacement in rules:
            txt = re.sub(pattern, replacement, txt)
    return txt
def slugify(title):
    """Build a simple slug from a post title.

    The title is trimmed and lower-cased, commas and full stops are removed,
    and every space becomes a hyphen. No other character is touched, so the
    result may differ from a permalink produced by the original blog engine.
    """
    lowered = title.strip().lower()
    without_punctuation = re.sub(r"[,.]", '', lowered)
    return re.sub(r" ", '-', without_punctuation)
# Ordered (old, new) substitutions that turn a post title into the slug part
# of a filename. They are applied one after another, so order matters (for
# example "..." has to be removed before "." is turned into "-").
_FILENAME_REPLACEMENTS = (
    (" - ", "-"),
    (" ", "-"),
    (",", ""),
    ("...", ""),
    (".", "-"),
    ("?", ""),
    ("!", ""),
    ("'", ""),
    ("\"", ""),
    ("+", "-"),
    (":", "-"),
    ("@", "a"),
    ("[", ""),
    ("]", ""),
    ("(", ""),
    (")", ""),
    (u"ê", "e"),
    (u"ô", "o"),
    (u"é", "e"),
    (u"à", "a"),
    (u"ç", "c"),
    (u"ë", "e"),
    ("&amp;", "-"),
    # A path separator inside the name would make the file impossible to open.
    ("/", "-"),
)


def _node_text(node):
    """Return the text of a parsed node as unicode, or u'' when it has none.

    A surrounding ``<![CDATA[ ... ]]>`` wrapper is stripped.
    """
    if not node or not node.string:
        return u''
    text = unicode(node.string)
    if text.startswith(u'<![CDATA['):
        text = text[9:-3]  # drop the leading '<![CDATA[' and trailing ']]>'
    return text


def _filename_slug(slug):
    """Lower-case *slug* and apply _FILENAME_REPLACEMENTS in order."""
    name = slug.lower()
    for old, new in _FILENAME_REPLACEMENTS:
        name = name.replace(old, new)
    return name


def _write_list(out, heading, values):
    """Write ``heading:`` followed by one ``- value`` line per value, if any."""
    if not values:
        return
    out.write(u'%s:\n' % heading)
    for value in values:
        out.write(u'- %s\n' % value)


def parse_item(item):
    """Convert one feed item into a Markdown post and write it to disk.

    The output consists of the lower-cased title, optional ``tags:``,
    ``categories:`` and ``meta:`` lists, a blank line, and the body converted
    by ``to_markdown``. No ``---`` YAML delimiters, layout, date, link,
    status or comment fields are written.

    When ``DEBUG`` is true the post is appended to ``LOGFILE``; otherwise it
    is written to ``../posts/<yyyy>-<mm>-<dd>-<slug>.md`` (the directory is
    created when missing).

    Args:
        item: a parsed ``<item>`` element offering ``find`` / ``findAll``
            (the script's main loop passes BeautifulSoup tags).

    Raises:
        ValueError: if the ``published`` text is not in
            ``%Y-%m-%dT%H:%M:%S`` format.
    """
    # Every item is treated as a published post; the type is only used in the
    # DEBUG log header.
    wp_post_type = "post"

    raw_title = _node_text(item.find("title"))
    title = raw_title.replace("\\", "")  # a backslash raises an error in a YAML string
    published = _node_text(item.find("published"))
    post_date = datetime.strptime(published, "%Y-%m-%dT%H:%M:%S")

    # The slug comes from the title and may already be URL-quoted (CJK titles).
    if raw_title:
        slug = urllib.unquote(raw_title.encode('utf-8')).decode('utf-8')
    else:
        slug = slugify(title)
    assert isinstance(slug, unicode), 'slug should be unicode'

    filename = u'%04d-%02d-%02d-%s.md' % (
        post_date.year, post_date.month, post_date.day, _filename_slug(slug))

    if DEBUG:
        out = codecs.open(LOGFILE, "a", "utf-8")
    else:
        path = os.path.join(u"../posts")
        if not os.access(path, os.F_OK):
            os.mkdir(path)
        try:
            out = codecs.open(os.path.join(path, filename), "w", "utf-8")
        except UnicodeDecodeError as e:
            # Report the post and skip it instead of aborting the whole run.
            print('UnicodeDecodeError: %s in %s %s' % (str(e), post_date, raw_title))
            print('slug %s filename %s path %s'
                  % (type(slug), type(filename), type(path)))
            return

    try:
        if DEBUG:
            out.write(u'\n_%ss/%s\n' % (wp_post_type, filename))

        # The title may still contain HTML entities such as &amp; &lt; &gt;.
        out.write(u'%s\n' % title.lower())

        # Duplicates are removed; sorting keeps the output stable across runs.
        tags = sorted(set(
            text for text in
            (_node_text(tag) for tag in item.findAll("category", {"domain": "tag"}))
            if text not in EXCLUDE_TAGS))
        _write_list(out, u'tags', tags)

        categories = sorted(set(
            text for text in
            (_node_text(cat) for cat in item.findAll("category", {"domain": "category"}))
            if text not in EXCLUDE_CATEGORIES))
        _write_list(out, u'categories', categories)

        # Keep every post-meta entry whose key is not excluded.
        metas = {}
        for meta in item.findAll("wp:post_meta"):
            meta_key = _node_text(meta.find("wp:meta_key"))
            if meta_key not in EXCLUDE_METAS:
                metas[meta_key] = _node_text(meta.find("wp:meta_value"))
        if metas:
            out.write(u'meta:\n')
            for key, value in sorted(metas.items()):
                out.write(u' %s: %s\n' % (key, value))

        # A blank line separates the header from the body.
        out.write(u'\n')
        content = _node_text(item.find("description"))
        out.write(to_markdown(content.strip()))
    finally:
        # Close the file even when a write or the conversion fails.
        out.close()
if __name__ == '__main__':
    # Script entry point: parse the export named by XML and convert every
    # <item> element with parse_item.
    if DEBUG and os.access(LOGFILE, os.F_OK):
        # parse_item appends to LOGFILE in DEBUG mode, so start from a clean
        # file on every run.
        os.remove(LOGFILE)
    # To take the input path from the command line instead:
    # if len(sys.argv) > 1:
    #     XML = sys.argv[1]
    print('loading...')
    # The parser reads the whole file while the soup is being constructed, so
    # the handle can be closed as soon as the constructor returns.
    with open(XML) as xml_file:
        soup = BeautifulStoneSoup(xml_file)
    print('parsing...')
    for item in soup.findAll("item"):
        parse_item(item)
    print('done')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment