"""Migrate blog posts from Dotclear to Octopress and build an rss.xml file
used to import the comments into Disqus."""
import sys | |
import csv | |
import os | |
import re | |
import phpserialize | |
import xml.sax.saxutils | |
def split_export(source):
    """Split a Dotclear export file into one CSV file per table.

    A section starts with a ``[table col1,col2,...]`` header line and ends at
    a blank line or at a line starting with ``///``.  Each section is written
    to ``<source>.d/<table>.csv``; the column list from the header becomes the
    first line of that file.

    :param source: path of the export file to read.
    :returns: path of the output directory (``source + '.d'``).
    :raises OSError: if the output directory does not exist and cannot be
        created.
    """
    dest = source + '.d'
    try:
        os.mkdir(dest)
    except OSError:
        # An already existing directory is fine; anything else is a real error.
        if not os.path.isdir(dest):
            raise
    out = None
    try:
        with open(source, 'r') as f:
            for line in f:
                if len(line) <= 1 or line.startswith('///'):
                    # Section separator: finish the current table, if any.
                    if out is not None:
                        out.close()
                        out = None
                elif line[0] == '[':
                    # A new header may follow without a separator; do not
                    # leak the previous table's handle.
                    if out is not None:
                        out.close()
                    space = line.find(' ')
                    table = line[1:space]
                    out = open(os.path.join(dest, table + '.csv'), 'w')
                    # Keep only the column list: drop the "[table " prefix
                    # and the trailing "]" plus newline.
                    out.write(line[space + 1:-2] + '\n')
                elif out is not None:
                    # Double the backslash of \n and \r sequences so that a
                    # csv reader using '\\' as escapechar yields them as the
                    # literal two-character sequences.
                    out.write(line.replace('\\n', '\\\\n').replace('\\r', '\\\\r'))
    finally:
        # The export may end without a trailing separator line.
        if out is not None:
            out.close()
    return dest
def load_categories(source):
    """Load the category table into a ``{cat_id: cat_title}`` dict.

    :param source: path of a comma-separated CSV file with a header row that
        contains at least the ``cat_id`` and ``cat_title`` columns; backslash
        is the escape character.
    :returns: dict mapping each ``cat_id`` string to its ``cat_title``.  When
        an id appears more than once the last row wins.
    """
    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        f = open(source, 'rb')
    else:
        f = open(source, 'r', newline='')
    with f:
        reader = csv.DictReader(f, delimiter=',', escapechar='\\')
        return dict((row['cat_id'], row['cat_title']) for row in reader)
# Wiki link: [label|target]
url_RE = re.compile(r'\[([^|\]]+)\|([^\]]+)\]')
# Image used as the label of a link to a file under /public/ (optionally
# prefixed by /mind): [((src|alt|...))|/public/target]
imglink_RE = re.compile(r'\[\(\([^|]+\|([^|]+)(?:\|[^)]+)?\)\)\|(?:/mind)?/public/([^\]]+)\]')
# Stand-alone image stored under /public/: ((/public/src|alt|...))
img_RE = re.compile(r'\(\((?:/mind)?/public/([^|]+)\|([^|]+)(?:\|[^)]+)?\)\)')
# Superscript: ^^text^^
exp_RE = re.compile(r'\^\^(.*?)\^\^')


def markdownify_line(line):
    """Convert the inline wiki markup of a single line to Markdown.

    ``''`` becomes ``_``, ``%%%`` becomes a space, ``@@`` becomes a backtick,
    ``^^x^^`` becomes ``<sup>x</sup>`` and ``[label|target]`` becomes
    ``[label](target)``.  Image markup matched by ``imglink_RE`` / ``img_RE``
    is replaced by an empty string.

    NOTE(review): the groups captured by the two image patterns are never
    used, so images are dropped from the output - confirm this is intended.

    :param line: one line of wiki text.
    :returns: the converted line.
    """
    # Plain token substitutions, applied in this order.
    for wiki_token, md_token in (("''", "_"), (r'%%%', ' '), ('@@', '`')):
        line = line.replace(wiki_token, md_token)
    # Pattern substitutions; images are handled before links because their
    # markup would otherwise be picked up by the generic link pattern.
    rewrites = (
        (exp_RE, r'<sup>\1</sup>'),
        (imglink_RE, r''),
        (img_RE, r''),
        (url_RE, r'[\1](\2)'),
    )
    for pattern, replacement in rewrites:
        line = pattern.sub(replacement, line)
    return line
def pygment_lang(name):
    """Translate a code-block language tag to the name used in the output.

    ``c_mac`` maps to ``objective-c`` and ``html4strict`` maps to ``html``;
    any other name is returned unchanged.

    :param name: language tag found in the source text.
    :returns: the language name to put in the ``highlight`` tag.
    """
    renamed = (
        ('c_mac', 'objective-c'),
        ('html4strict', 'html'),
    )
    for source_name, target_name in renamed:
        if name == source_name:
            return target_name
    return name
def markdownify(text):
    """Convert a wiki-formatted text to Markdown.

    The text is expected to carry its line breaks as literal ``\\n`` (and
    optional ``\\r``) two-character sequences.  The following block
    constructs are handled; every other line goes through
    ``markdownify_line``:

    * ``///`` ... ``///`` code blocks.  When the first line inside is
      ``[lang]`` the block becomes a ``{% highlight lang %}`` block,
      otherwise its lines are indented by four spaces (Markdown code block).
    * ``///html`` ... ``///`` blocks, copied through unchanged.
    * ``*`` / ``**`` / ``***`` bullet items and ``#`` numbered items.
    * ``>`` quoted lines.
    * ``!!!!`` and ``!!!`` titles, emitted as ``=`` / ``-`` underlined
      headings.

    :param text: wiki source of an excerpt or a post body.
    :returns: the Markdown text, lines joined with real newlines.
    """
    lines = text.replace('\\r', '').split('\\n')
    out = []
    in_list = False
    in_quote = False
    # Code-block state:
    #   0 - outside any block
    #   1 - just after the opening '///', language tag not looked at yet
    #   2 - inside a highlighted block (a '[lang]' tag was found)
    #   3 - inside a plain preformatted block
    #   4 - inside a raw '///html' block
    in_code = 0
    for line in lines:
        # Handle code blocks
        if in_code:
            if line.startswith('///'):
                if in_code == 2:
                    out.append(r'{% endhighlight %}')
                in_code = 0
                continue
            if in_code == 1:
                if line.startswith('['):
                    out.append('{%% highlight %s %%}' % pygment_lang(line[1:-1]))
                    in_code = 2
                    continue
                else:
                    in_code = 3
            if in_code == 3:
                # Markdown only treats lines indented by four spaces (or a
                # tab) as a code block.
                out.append('    ' + line)
            else:
                out.append(line)
            continue
        elif line == '///html':
            in_code = 4
            continue
        elif line.startswith('///'):
            out.append('')
            in_code = 1
            continue
        # Handle lists
        if line.startswith('*') or line.startswith('#'):
            if not in_list:
                # A list must be separated from the preceding paragraph.
                out.append('')
                in_list = True
            # Nested items are indented by four spaces per level so that
            # Markdown renders them as sub-lists rather than siblings.
            if line.startswith('***'):
                out.append('        * ' + markdownify_line(line[3:]))
            elif line.startswith('**'):
                out.append('    * ' + markdownify_line(line[2:]))
            elif line.startswith('*'):
                out.append('* ' + markdownify_line(line[1:]))
            elif line.startswith('#'):
                # An ordered-list marker needs the period after the number.
                out.append('1. ' + markdownify_line(line[1:]))
            continue
        elif not line.startswith(' '):
            in_list = False
        # Handle quotes
        if line.startswith('>'):
            if not in_quote:
                out.append('')
                in_quote = True
            out.append('> ' + line[1:].rstrip())
            continue
        else:
            in_quote = False
        # Handle everything else
        if line.startswith('!!!!'):
            line = markdownify_line(line[4:])
            out.append(line)
            out.append(len(line) * '=')
        elif line.startswith('!!!'):
            line = markdownify_line(line[3:])
            out.append(line)
            out.append(len(line) * '-')
        else:
            out.append(markdownify_line(line))
    return '\n'.join(out)
def cdata_escape(text):
    """Wrap *text* in an XML CDATA section.

    Any ``]]>`` inside the text is split over two adjacent CDATA sections so
    it cannot terminate the section early, and literal backslash-n sequences
    are turned into real newlines.

    :param text: raw text to embed.
    :returns: the text enclosed in ``<![CDATA[`` ... ``]]>``.
    """
    # Break up the terminator first ...
    payload = text.replace(']]>', ']]]]><![CDATA[>')
    # ... then expand the two-character \n sequences.
    payload = payload.replace(r'\n', '\n')
    return '<![CDATA[' + payload + ']]>'
def export_comments(rss, comments, post_id): | |
with open(comments, 'rb') as f: | |
reader = csv.DictReader(f, delimiter=',', escapechar='\\') | |
for comment in reader: | |
if comment['post_id'] != post_id and comment['comment_status'] == '1': | |
continue | |
rss.write('<wp:comment>\n') | |
rss.write('<wp:comment_id>0_' + comment['comment_id'] + '</wp:comment_id>\n') | |
rss.write('<wp:comment_author>' + xml.sax.saxutils.escape(comment['comment_author']) + | |
'</wp:comment_author>\n') | |
rss.write('<wp:comment_author_email>' + comment['comment_email'] + '</wp:comment_author_email>\n') | |
if len(comment['comment_site']): | |
rss.write('<wp:comment_author_url>' + comment['comment_site'] + '</wp:comment_author_url>\n') | |
rss.write('<wp:comment_author_IP>' + comment['comment_ip'] + '</wp:comment_author_IP>\n') | |
rss.write('<wp:comment_date_gmt>' + comment['comment_dt'] + '</wp:comment_date_gmt>\n') | |
rss.write('<wp:comment_content>' + cdata_escape(comment['comment_content']) + '</wp:comment_content>\n') | |
rss.write('<wp:comment_approved>1</wp:comment_approved>\n') | |
rss.write('<wp:comment_parent>0</wp:comment_parent>\n') | |
rss.write('</wp:comment>\n') | |
def export_posts(source, comments, dest, categories, base_url='http://blog.mymind.fr/blog/'):
    """Export every post as a Markdown file and build the comment feed.

    For each row of *source* whose ``post_type`` is ``post``:

    * an ``<item>`` (title, link, XHTML content, thread identifier, date and
      the post's comments) is appended to ``<dest>/rss.xml``;
    * ``<dest>/_posts/<date>-<slug>.markdown`` is written with a YAML front
      matter followed by the excerpt and the content converted by
      ``markdownify``.

    NOTE(review): ``export_comments`` re-reads the whole comment file for
    every post; fine for a one-shot migration, slow for very large exports.
    NOTE(review): the file handles are used with Python 2 semantics (``str``
    written to a ``'wb'`` handle, csv read from ``'rb'``).

    :param source: path of the post CSV file.
    :param comments: path of the comment CSV file, handed to
        ``export_comments``.
    :param dest: directory receiving ``rss.xml`` and the ``_posts`` folder.
    :param categories: dict mapping ``cat_id`` to the category title.
    :param base_url: prefix of the post links written to the feed; the
        ``yyyy/mm/dd/slug/`` part is appended to it.
    :raises OSError: if the ``_posts`` directory does not exist and cannot be
        created.
    """
    escape = xml.sax.saxutils.escape
    rss_path = os.path.join(dest, 'rss.xml')
    posts_dir = os.path.join(dest, '_posts')
    try:
        os.mkdir(posts_dir)
    except OSError:
        # An already existing directory is fine; anything else is a real error.
        if not os.path.isdir(posts_dir):
            raise
    with open(rss_path, 'wb') as rss:
        rss.write("""<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:dsq="http://www.disqus.com/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:wp="http://wordpress.org/export/1.0/"
>\n<channel>\n""")
        with open(source, 'rb') as f:
            reader = csv.DictReader(f, delimiter=',', escapechar='\\')
            for post in reader:
                if post['post_type'] != 'post':
                    continue
                # The slug is the last path component of the post URL and the
                # date is the day part of the timestamp.
                slug = post['post_url'].split('/')[-1].lower()
                date = post['post_dt'].split(' ')[0]
                md = os.path.join(posts_dir, '%s-%s.markdown' % (date, slug))

                # Feed entry
                rss.write('<item>\n')
                rss.write('<title>' + escape(post['post_title']) + '</title>\n')
                rss.write('<link>' + escape(base_url + date.replace('-', '/') + '/' + slug) +
                          '/</link>\n')
                rss.write('<content:encoded>' +
                          cdata_escape(post['post_excerpt_xhtml'] + post['post_content_xhtml']) +
                          '</content:encoded>\n')
                rss.write('<dsq:thread_identifier>' + escape(date + '-' + slug) +
                          '</dsq:thread_identifier>\n')
                rss.write('<wp:post_date_gmt>' + escape(post['post_dt']) + '</wp:post_date_gmt>\n')
                rss.write('<wp:comment_status>open</wp:comment_status>\n')
                export_comments(rss, comments, post['post_id'])
                rss.write('</item>\n')

                # Tags are stored in the PHP-serialized post_meta column.
                meta = post['post_meta']
                tags = None
                if meta:
                    meta = phpserialize.loads(meta)
                    if 'tag' in meta:
                        tags = meta['tag'].values()

                # YAML front matter.  The title sits inside double quotes, so
                # backslashes and double quotes must be escaped.
                title = post['post_title'].replace('\\', '\\\\').replace('"', '\\"')
                front_matter = [
                    '---',
                    'layout: post',
                    'title: "%s"' % title,
                    'date: %s' % post['post_dt'],
                    'comments: true',
                    # A post whose cat_id is not in the map gets an empty
                    # category instead of aborting the whole export.
                    'categories: %s' % categories.get(post['cat_id'], ''),
                ]
                if post['post_password'] or post['post_status'] != '1':
                    front_matter.append('published: false')
                if tags is not None:
                    front_matter.append('tags: [%s]' % ', '.join(tags))
                front_matter.append('---')

                content = ''
                if post['post_excerpt']:
                    content += markdownify(post['post_excerpt'])
                    content += '\n\n<!-- more -->\n\n'
                content += markdownify(post['post_content'])

                with open(md, 'w') as of:
                    of.write('\n'.join(front_matter) + '\n')
                    of.write(content + '\n')
        rss.write('</channel></rss>')
if __name__ == '__main__':
    # The path of the export file is the only required argument.
    if len(sys.argv) < 2:
        sys.exit('usage: %s <export-file>' % sys.argv[0])
    # 1. split the export into one CSV file per table (in <export-file>.d),
    # 2. load the category titles,
    # 3. write the Markdown posts and rss.xml into that same directory.
    dest = split_export(sys.argv[1])
    categories = load_categories(os.path.join(dest, 'category.csv'))
    export_posts(os.path.join(dest, 'post.csv'),
                 os.path.join(dest, 'comment.csv'),
                 dest, categories)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment