-
-
Save ronjouch/2394347 to your computer and use it in GitHub Desktop.
Blogger XML ==> Chisel Markdown migration script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
#!/usr/bin/env python | |
import sys | |
import os | |
import re | |
import yaml | |
import urllib | |
import codecs | |
from datetime import datetime | |
from BeautifulSoup import BeautifulStoneSoup, Comment | |
# Debug switch: when True, all converted posts are appended to the single
# LOGFILE instead of being written as one file per post.
DEBUG = False

# Blogger XML export that the script reads.
XML = "blog-pretty6.xml"

# NOTE(review): EXPORT_ROOT is not referenced by the code visible in this file.
EXPORT_ROOT = 'source/'

# File name pattern for a post: year, month, day, slug.
MARKDOWN_FORMAT = '%04d-%02d-%02d-%s.md'

# Output file used in DEBUG mode.
LOGFILE = "log.md"

# Post meta keys that must not be copied into the generated header.
EXCLUDE_METAS = [u'_edit_last', u'superawesome', u'delicious', u'_wp_page_template']

# Category names that must not be copied into the generated header.
EXCLUDE_CATEGORIES = [u'Uncategorized']

# Tag names that must not be copied into the generated header (none so far).
EXCLUDE_TAGS = []
# Pass 1: undo HTML escaping so that the tag rules below can see real markup.
# '&amp;' is decoded LAST: decoding it first would turn '&amp;lt;' into '&lt;'
# and then into '<', i.e. unescape the text twice.
_ENTITY_RULES = (
    (r'&lt;', '<'),
    (r'&gt;', '>'),
    (r'&nbsp;', ' '),
    (r'&quot;', '"'),
    (r'&#\d+;', ''),       # &#NNNN; numeric entities are dropped
    (r'\*', r'\\*'),       # escape literal asterisks (emits backslash + '*')
    (r'&amp;', '&'),
)

# Pass 2: images.  The rule with an alt text must run before the generic one.
_IMAGE_RULES = (
    (r'<img(.+?)src="([^"]+)"\s+alt="([^"]+)"[^>]*?/>', r'![\3](\2)'),
    (r'<img(.+?)src="([^"]+)"[^>]*?/>', r'![](\2)'),
)

# Pass 3: remaining tags.  The rules are order dependent.
_MARKUP_RULES = (
    (r'</?strong>', '**'),
    (r'</?b>', '**'),
    (r'</?em>', '*'),
    (r'</?i>', '*'),
    (r'<h1>', '# '),
    (r'<h2>', '## '),
    (r'<h3>', '### '),
    (r'<h4>', '#### '),
    (r'<h5>', '##### '),
    (r'<h6>', '###### '),
    (r'</h\d>', '\n'),
    # <p class=blahblah>..</p>; the tag name must end right after 'p' so that
    # <pre> is left alone for the code-fence rules further down.
    (r'</?p(?:\s[^>]*)?>', '\n'),
    (r'</?span[^>]*>', ''),    # <span class=blahblah>..</span>
    # Collapse long runs of spaces before emitting hard line breaks, so the
    # two trailing spaces produced for <br> are never collapsed away.
    (r' {3,}', ' '),
    # Markdown needs two trailing spaces for a hard line break.
    (r'<br\s*/?>', '  \n'),
    (r'<a.+?href="([^"]+)"[^>]*>([^<]+)</a>', r'[\2](\1)'),
    (r'<li>', '\n- '),
    (r'</li>', ''),
    (r'</?ul>', '\n'),
    (r'</?ol>', '\n'),
    (r'\n{3,}', '\n\n'),
    (r'</?u>', ''),
    # remove custom tags found in xml
    (r'<a href="http://picasa.google.com/blogger/".*?</a>', ''),
    (r'<div class="blogger-post-footer">.+</div>', ''),
    (r'<div class="separator".*>(.*)</div>', r'\1'),
    (r'<div xmlns.*>((.|\n)*)</div>', r'\1'),
    # liquid tag conflict. but {{}} is manually fixed :P
    (r'\{%(.+?)%\}', r'{{ "{% \1 "}} %}'),
    (r'\[code\]', r'```\n'),
    (r'\[/code\]', r'```'),
    (r'<pre><code>', r'```\n'),
    (r'</code></pre>', r'```'),
    (r'<pre>', r'```\n'),
    (r'</?pre>', r'```'),
    (r'<blockquote.*?>', r'\n<quote>'),
    (r'</blockquote>', r'</quote>\n\n'),
)


def to_markdown(txt):
    """Convert a fragment of (possibly entity-escaped) HTML to Markdown.

    The conversion is a fixed sequence of regular-expression substitutions
    applied in three passes: entity decoding, image tags, then all other
    tags.  It is a best-effort textual rewrite, not an HTML parser; markup
    that no rule recognises is passed through unchanged.

    Args:
        txt: the HTML text to convert.

    Returns:
        The text after every substitution has been applied.
    """
    for rules in (_ENTITY_RULES, _IMAGE_RULES, _MARKUP_RULES):
        for pattern, replacement in rules:
            txt = re.sub(pattern, replacement, txt)
    return txt
def slugify(title):
    """Build a simple URL slug from a post title.

    The title is stripped and lower-cased, commas and full stops are removed
    and every space becomes a hyphen.  Note that dropping characters may
    change existing permalinks.
    """
    slug = title.strip().lower()
    slug = slug.replace(',', '').replace('.', '')
    return slug.replace(' ', '-')
# Ordered replacements applied to the lower-cased title to build the slug part
# of the output file name.  Order matters: u" - " must be handled before u" ",
# and u"..." before u".".
_FILENAME_SLUG_REPLACEMENTS = (
    (u" - ", u"-"),
    (u" ", u"-"),
    (u",", u""),
    (u"...", u""),
    (u".", u"-"),
    (u"?", u""),
    (u"!", u""),
    (u"'", u""),
    (u'"', u""),
    (u"+", u"-"),
    (u":", u"-"),
    (u"@", u"a"),
    (u"[", u""),
    (u"]", u""),
    (u"(", u""),
    (u")", u""),
    (u"ê", u"e"),
    (u"ô", u"o"),
    (u"é", u"e"),
    (u"à", u"a"),
    (u"ç", u"c"),
    (u"ë", u"e"),
    (u"&", u"-"),
    # A path separator in the title would otherwise point into a
    # non-existent sub-directory when the file is opened.
    (u"/", u"-"),
)


def _node_text(node):
    """Return the text of a parsed XML node as unicode, without CDATA wrapper.

    A missing node, or a node without a single string child, yields u''.
    """
    if not node or not node.string:
        return u''
    text = unicode(node.string)
    if text.startswith(u'<![CDATA['):
        # Strip the leading '<![CDATA[' (9 chars) and the trailing ']]>' (3).
        text = text[9:-3]
    return text


def _filename_slug(slug):
    """Turn a title into the slug used in the output file name."""
    slug = slug.lower()
    for needle, replacement in _FILENAME_SLUG_REPLACEMENTS:
        slug = slug.replace(needle, replacement)
    return slug


def _write_name_list(out, heading, nodes, excluded):
    """Write a 'heading:' list of the distinct node texts not in excluded."""
    names = sorted(set(_node_text(node) for node in nodes) - set(excluded))
    if not names:
        return
    out.write(u'%s:\n' % heading)
    for name in names:
        out.write(u'- %s\n' % name)


def _write_post(out, item, title):
    """Write title line, tags, categories, metas and Markdown body to out."""
    # sometimes title contains html entities like & < > ...
    out.write(u'%s\n' % title.lower())

    _write_name_list(
        out, u'tags', item.findAll("category", {"domain": "tag"}), EXCLUDE_TAGS)
    _write_name_list(
        out, u'categories', item.findAll("category", {"domain": "category"}),
        EXCLUDE_CATEGORIES)

    # some metas are useless...
    metas = {}
    for meta in item.findAll("wp:post_meta"):
        meta_key = _node_text(meta.find("wp:meta_key"))
        meta_value = _node_text(meta.find("wp:meta_value"))
        if meta_key not in EXCLUDE_METAS:
            metas[meta_key] = meta_value
    if metas:
        out.write(u'meta:\n')
        for meta_key, meta_value in sorted(metas.items()):
            out.write(u' %s: %s\n' % (meta_key, meta_value))

    # end of header
    out.write(u'\n')
    content = _node_text(item.find("description"))
    out.write(to_markdown(content.strip()))


def parse_item(item):
    """Convert one parsed ``item`` element into a Markdown post file.

    The post title, tags, categories and metas are written as a small header,
    followed by the ``description`` element converted with ``to_markdown``.
    When ``DEBUG`` is true the post is appended to ``LOGFILE``; otherwise it
    is written to ``../posts/<yyyy>-<mm>-<dd>-<slug>.md`` (the directory is
    created if missing).

    Args:
        item: a parsed XML element as yielded by ``soup.findAll("item")``.

    Returns:
        None.  If the output file cannot be opened because of a
        ``UnicodeDecodeError`` a diagnostic is printed and the item is skipped.

    Raises:
        ValueError: if the ``published`` element does not start with a
            ``YYYY-MM-DDTHH:MM:SS`` timestamp.
    """
    # ex> post, page, attachment, custom_dns(for dns service)
    post_type = "post"

    title = _node_text(item.find("title"))
    title = title.replace("\\", "")  # backslash raise error on yaml string

    # Only the first 19 characters are parsed so that fractional seconds or a
    # UTC offset after the seconds do not make strptime fail.
    published = _node_text(item.find("published"))
    post_date = datetime.strptime(published[:19], "%Y-%m-%dT%H:%M:%S")

    # slug can be null or quoted already (if cjk title)
    slug = _node_text(item.find("title"))
    if not slug:
        slug = slugify(title)
    else:
        slug = urllib.unquote(slug.encode('utf-8')).decode('utf-8')
    assert isinstance(slug, unicode), 'slug should be unicode'

    filename = MARKDOWN_FORMAT % (
        post_date.year, post_date.month, post_date.day, _filename_slug(slug))

    if DEBUG:
        out = codecs.open(LOGFILE, "a", "utf-8")
        banner = u'\n_%ss/%s\n' % (post_type, filename)
    else:
        path = u"../posts"
        if not os.access(path, os.F_OK):
            os.mkdir(path)
        try:
            out = codecs.open(os.path.join(path, filename), "w", "utf-8")
        except UnicodeDecodeError as e:
            print('UnicodeDecodeError: %s in %s %r' % (str(e), post_date, title))
            print('slug %s filename %s path %s'
                  % (type(slug), type(filename), type(path)))
            return
        banner = u''

    # Close the file even if a write or the Markdown conversion raises.
    try:
        out.write(banner)
        _write_post(out, item, title)
    finally:
        out.close()
if __name__ == '__main__':
    # In DEBUG mode every post is appended to LOGFILE, so start from a clean
    # file on each run.
    if DEBUG and os.access(LOGFILE, os.F_OK):
        os.remove(LOGFILE)

    print('loading...')
    # Read the export up front so the file handle is closed before parsing.
    with open(XML) as xml_file:
        markup = xml_file.read()
    soup = BeautifulStoneSoup(markup)

    print('parsing...')
    for item in soup.findAll("item"):
        parse_item(item)
    print('done')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment