Skip to content

Instantly share code, notes, and snippets.

@Fruneau
Last active January 1, 2016 17:08
Show Gist options
  • Save Fruneau/8174826 to your computer and use it in GitHub Desktop.
Save Fruneau/8174826 to your computer and use it in GitHub Desktop.
Script to migrate blog posts from Dotclear to Octopress and build an rss.xml file for importing comments into Disqus
import sys
import csv
import os
import re
import phpserialize
import xml.sax.saxutils
def split_export(source):
    """Split a Dotclear export file into one CSV file per table.

    The export is read line by line:

    * an empty line or a line starting with ``///`` ends the current table;
    * a line starting with ``[`` opens a new table: the text between ``[``
      and the first space is the table name, the rest (minus the closing
      ``]``) is written as the CSV header of ``<table>.csv``;
    * any other line is copied to the current table file, with the escaped
      sequences ``\\n`` and ``\\r`` protected by an extra backslash so that a
      CSV reader using backslash as escape character gives them back intact.

    Args:
        source: path of the export file.

    Returns:
        The path of the output directory (``source + '.d'``).

    Raises:
        OSError: if the output directory cannot be created.
    """
    dest = source + '.d'
    try:
        os.mkdir(dest)
    except OSError:
        # Re-running over an existing output directory is fine; any other
        # failure (permissions, missing parent, ...) must not be hidden.
        if not os.path.isdir(dest):
            raise

    out = None
    try:
        with open(source, 'r') as f:
            for line in f:
                is_blank = len(line) <= 1 or not line.rstrip('\r\n')
                if is_blank or line.startswith('///'):
                    if out is not None:
                        out.close()
                        out = None
                elif line[0] == '[':
                    # A header directly following data (no blank separator)
                    # must not leak the previous table's file handle.
                    if out is not None:
                        out.close()
                        out = None
                    space = line.find(' ')
                    table = line[1:space]
                    header = line[space + 1:].rstrip('\r\n')
                    if header.endswith(']'):
                        header = header[:-1]
                    out = open(os.path.join(dest, table + '.csv'), 'w')
                    out.write(header + '\n')
                elif out is not None:
                    out.write(line.replace('\\n', '\\\\n').replace('\\r', '\\\\r'))
    finally:
        # The last table is usually not followed by a separator line.
        if out is not None:
            out.close()
    return dest
def load_categories(source):
    """Load the category table produced by ``split_export``.

    Args:
        source: path of the category CSV file; it must have ``cat_id`` and
            ``cat_title`` columns.

    Returns:
        A dict mapping each ``cat_id`` to its ``cat_title`` (both strings).
    """
    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        f = open(source, 'r', newline='')
    else:
        f = open(source, 'rb')
    with f:
        reader = csv.DictReader(f, delimiter=',', escapechar='\\')
        return dict((row['cat_id'], row['cat_title']) for row in reader)
# Wiki link: "[label|target]" (label has no "|" or "]", target has no "]").
url_RE = re.compile(r'\[([^|\]]+)\|([^\]]+)\]')
# Image wrapped in a link to a file under /public (optionally prefixed with
# /mind): "[((src|alt[|options]))|/public/path]".
imglink_RE = re.compile(r'\[\(\([^|]+\|([^|]+)(?:\|[^)]+)?\)\)\|(?:/mind)?/public/([^\]]+)\]')
# Inline image stored under /public: "((/public/path|alt[|options]))".
img_RE = re.compile(r'\(\((?:/mind)?/public/([^|]+)\|([^|]+)(?:\|[^)]+)?\)\)')
# Text enclosed in a pair of "^^" markers.
exp_RE = re.compile(r'\^\^(.*?)\^\^')


def markdownify_line(line):
    """Convert the inline wiki markup of a single line to Markdown.

    Plain token replacements run first ('' -> _, %%% -> space, @@ -> `),
    then the regex rewrites, in this order: ^^...^^ to <sup>, linked images,
    inline images, and finally plain links. Images are rewritten to paths
    under ``{{ site.url }}/assets/``.
    """
    for wiki_token, markdown_token in (("''", "_"), ('%%%', ' '), ('@@', '`')):
        line = line.replace(wiki_token, markdown_token)

    # Order matters: images must be rewritten before the generic link rule.
    rewrites = (
        (exp_RE, r'<sup>\1</sup>'),
        (imglink_RE, r'![\1]({{ site.url }}/assets/\2)'),
        (img_RE, r'![\2]({{ site.url }}/assets/\1)'),
        (url_RE, r'[\1](\2)'),
    )
    for pattern, template in rewrites:
        line = pattern.sub(template, line)
    return line
def pygment_lang(name):
    """Translate a source language tag into the name used for highlighting.

    Only 'c_mac' and 'html4strict' are renamed; any other tag is returned
    unchanged.
    """
    renamed = {
        'c_mac': 'objective-c',
        'html4strict': 'html',
    }
    return renamed.get(name, name)
def markdownify(text):
    """Convert a wiki-formatted post body to Markdown.

    Args:
        text: the post body, with line breaks stored as the two-character
            escape sequences ``\\n`` / ``\\r`` (backslash + letter).

    Returns:
        The Markdown text, lines joined with real newlines.

    Block-level constructs handled here (inline markup is delegated to
    ``markdownify_line``):

    * ``///`` ... ``///`` code blocks. If the first line of the block is
      ``[lang]`` the block becomes a ``{% highlight lang %}`` section,
      otherwise a four-space indented code block.
    * ``///html`` ... ``///`` blocks, copied through untouched.
    * ``*``, ``**``, ``***`` bullet lists and ``#`` ordered lists.
    * ``>`` quotes.
    * ``!!!!`` and ``!!!`` titles, rendered as ``=`` and ``-`` underlined
      headings.
    """
    lines = text.replace('\\r', '').split('\\n')
    out = []
    in_list = False
    in_quote = False
    # Code block state:
    #   0: not in a code block
    #   1: opening '///' seen, kind of block not known yet
    #   2: highlighted block ('[lang]' header seen)
    #   3: plain block, emitted as an indented code block
    #   4: raw '///html' block, emitted verbatim
    in_code = 0
    for line in lines:
        # Handle code blocks
        if in_code:
            if line.startswith('///'):
                if in_code == 2:
                    out.append('{% endhighlight %}')
                in_code = 0
                continue
            if in_code == 1:
                if line.startswith('['):
                    out.append('{%% highlight %s %%}' % pygment_lang(line[1:-1]))
                    in_code = 2
                    continue
                in_code = 3
            if in_code == 3:
                # Markdown only treats lines indented by four spaces (or a
                # tab) as a code block.
                out.append('    ' + line)
            else:
                out.append(line)
            continue
        if line == '///html':
            in_code = 4
            continue
        if line.startswith('///'):
            out.append('')
            in_code = 1
            continue

        # Handle lists
        if line.startswith(('*', '#')):
            if not in_list:
                # Markdown needs a blank line before the first item.
                out.append('')
                in_list = True
            # Nested items are indented by four spaces per level so the
            # nesting survives the conversion.
            if line.startswith('***'):
                out.append('        * ' + markdownify_line(line[3:]))
            elif line.startswith('**'):
                out.append('    * ' + markdownify_line(line[2:]))
            elif line.startswith('*'):
                out.append('* ' + markdownify_line(line[1:]))
            else:
                # Ordered list markers need the trailing period.
                out.append('1. ' + markdownify_line(line[1:]))
            continue
        elif not line.startswith(' '):
            # Lines starting with a space do not end the current list.
            in_list = False

        # Handle quotes
        if line.startswith('>'):
            if not in_quote:
                out.append('')
                in_quote = True
            out.append('> ' + line[1:].rstrip())
            continue
        in_quote = False

        # Handle everything else
        if line.startswith('!!!!'):
            line = markdownify_line(line[4:])
            out.append(line)
            out.append(len(line) * '=')
        elif line.startswith('!!!'):
            line = markdownify_line(line[3:])
            out.append(line)
            out.append(len(line) * '-')
        else:
            out.append(markdownify_line(line))

    # Do not leave a highlight section open if the closing '///' is missing.
    if in_code == 2:
        out.append('{% endhighlight %}')
    return '\n'.join(out)
def cdata_escape(text):
    """Return *text* wrapped in an XML CDATA section.

    Escaped ``\\n`` sequences (backslash followed by ``n``) are turned into
    real newlines.
    """
    # A literal ']]>' would close the section early, so it is split across
    # two adjacent CDATA sections.
    body = text.replace(']]>', ']]]]><![CDATA[>')
    body = body.replace(r'\n', '\n')
    return ''.join(('<![CDATA[', body, ']]>'))
def export_comments(rss, comments, post_id):
    """Write the approved comments of one post as ``<wp:comment>`` elements.

    Args:
        rss: writable file-like object receiving the XML text.
        comments: path of the comment CSV file; the whole file is scanned
            on every call.
        post_id: id of the post whose comments are exported (a string, as
            read from the post CSV file).

    Only rows whose ``post_id`` matches and whose ``comment_status`` is
    ``'1'`` are written. Author, e-mail, site, IP and date are XML-escaped;
    the comment body goes through ``cdata_escape``.
    """
    escape = xml.sax.saxutils.escape
    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        f = open(comments, 'r', newline='')
    else:
        f = open(comments, 'rb')
    with f:
        reader = csv.DictReader(f, delimiter=',', escapechar='\\')
        for comment in reader:
            # Skip comments of other posts as well as unapproved ones.
            if comment['post_id'] != post_id or comment['comment_status'] != '1':
                continue
            rss.write('<wp:comment>\n')
            rss.write('<wp:comment_id>0_' + comment['comment_id'] + '</wp:comment_id>\n')
            rss.write('<wp:comment_author>' + escape(comment['comment_author']) +
                      '</wp:comment_author>\n')
            rss.write('<wp:comment_author_email>' + escape(comment['comment_email']) +
                      '</wp:comment_author_email>\n')
            if comment['comment_site']:
                # Site URLs commonly contain '&', which must be escaped.
                rss.write('<wp:comment_author_url>' + escape(comment['comment_site']) +
                          '</wp:comment_author_url>\n')
            rss.write('<wp:comment_author_IP>' + escape(comment['comment_ip']) +
                      '</wp:comment_author_IP>\n')
            rss.write('<wp:comment_date_gmt>' + escape(comment['comment_dt']) +
                      '</wp:comment_date_gmt>\n')
            rss.write('<wp:comment_content>' + cdata_escape(comment['comment_content']) +
                      '</wp:comment_content>\n')
            rss.write('<wp:comment_approved>1</wp:comment_approved>\n')
            rss.write('<wp:comment_parent>0</wp:comment_parent>\n')
            rss.write('</wp:comment>\n')
def export_posts(source, comments, dest, categories,
                 base_url='http://blog.mymind.fr/blog/'):
    """Export posts as Markdown files plus an ``rss.xml`` comment feed.

    For every row of *source* whose ``post_type`` is ``'post'``:

    * an ``<item>`` (title, link, XHTML content, thread identifier, date and
      the post's comments) is appended to ``<dest>/rss.xml``;
    * a ``<dest>/_posts/<date>-<slug>.markdown`` file is written with a YAML
      front matter followed by the body converted by ``markdownify``.

    Args:
        source: path of the post CSV file.
        comments: path of the comment CSV file (passed to
            ``export_comments``).
        dest: output directory; ``_posts`` is created inside it if missing.
        categories: dict mapping ``cat_id`` to category title.
        base_url: prefix of the post links written to ``rss.xml``.

    Raises:
        OSError: if the ``_posts`` directory cannot be created.
    """
    escape = xml.sax.saxutils.escape
    py3 = sys.version_info[0] >= 3
    rss_path = os.path.join(dest, 'rss.xml')
    posts_dir = os.path.join(dest, '_posts')
    try:
        os.mkdir(posts_dir)
    except OSError:
        # An already existing directory is fine; anything else is an error.
        if not os.path.isdir(posts_dir):
            raise

    # The original binary modes only work on Python 2; on Python 3 use text
    # mode without newline translation (which is also what csv expects).
    rss = open(rss_path, 'w', newline='') if py3 else open(rss_path, 'wb')
    with rss:
        rss.write("""<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:dsq="http://www.disqus.com/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:wp="http://wordpress.org/export/1.0/"
>\n<channel>\n""")
        f = open(source, 'r', newline='') if py3 else open(source, 'rb')
        with f:
            reader = csv.DictReader(f, delimiter=',', escapechar='\\')
            for post in reader:
                if post['post_type'] != 'post':
                    continue
                slug = post['post_url'].split('/')[-1].lower()
                date = post['post_dt'].split(' ')[0]
                md = os.path.join(posts_dir, '%s-%s.markdown' % (date, slug))

                link = base_url + date.replace('-', '/') + '/' + slug + '/'
                rss.write('<item>\n')
                rss.write('<title>' + escape(post['post_title']) + '</title>\n')
                rss.write('<link>' + escape(link) + '</link>\n')
                rss.write('<content:encoded>' +
                          cdata_escape(post['post_excerpt_xhtml'] + post['post_content_xhtml']) +
                          '</content:encoded>\n')
                rss.write('<dsq:thread_identifier>' + escape(date + '-' + slug) +
                          '</dsq:thread_identifier>\n')
                rss.write('<wp:post_date_gmt>' + post['post_dt'] + '</wp:post_date_gmt>\n')
                rss.write('<wp:comment_status>open</wp:comment_status>\n')
                export_comments(rss, comments, post['post_id'])
                rss.write('</item>\n')

                # Tags live in the PHP-serialized post_meta column.
                tags = None
                meta = post['post_meta']
                if meta:
                    if py3:
                        # phpserialize works on bytes; assumes the export is
                        # UTF-8 encoded -- TODO confirm.
                        meta = phpserialize.loads(meta.encode('utf-8'),
                                                  decode_strings=True)
                    else:
                        meta = phpserialize.loads(meta)
                    if 'tag' in meta:
                        tags = list(meta['tag'].values())

                # Backslashes and double quotes would otherwise break the
                # double-quoted YAML scalar.
                title = post['post_title'].replace('\\', '\\\\').replace('"', '\\"')
                front = [
                    '---',
                    'layout: post',
                    'title: "%s"' % title,
                    'date: %s' % post['post_dt'],
                    'comments: true',
                ]
                # A post without a known category gets no categories entry
                # instead of aborting the whole export with a KeyError.
                category = categories.get(post['cat_id'])
                if category is not None:
                    front.append('categories: %s' % category)
                if post['post_password'] or post['post_status'] != '1':
                    front.append('published: false')
                if tags is not None:
                    front.append('tags: [%s]' % ', '.join(tags))
                front.append('---')

                content = ''
                if post['post_excerpt']:
                    content += markdownify(post['post_excerpt'])
                    content += '\n\n<!-- more -->\n\n'
                content += markdownify(post['post_content'])

                with open(md, 'w') as of:
                    of.write('\n'.join(front) + '\n')
                    of.write(content + '\n')
        rss.write('</channel></rss>')
if __name__ == '__main__':
    # Usage: <script> DOTCLEAR_EXPORT_FILE
    # The export is split into '<export>.d/<table>.csv' files, then the
    # posts and rss.xml are written into that same directory.
    if len(sys.argv) < 2:
        # Print a usage line instead of failing with an IndexError.
        sys.stderr.write('usage: %s DOTCLEAR_EXPORT_FILE\n' % sys.argv[0])
        sys.exit(2)
    dest = split_export(sys.argv[1])
    categories = load_categories(os.path.join(dest, 'category.csv'))
    export_posts(os.path.join(dest, 'post.csv'),
                 os.path.join(dest, 'comment.csv'),
                 dest, categories)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment