Last active
January 1, 2016 17:08
-
-
Save Fruneau/8174826 to your computer and use it in GitHub Desktop.
Script to migrate blog posts from Dotclear to Octopress and build rss.xml file to import comments on Disqus
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import csv | |
import os | |
import re | |
import phpserialize | |
import xml.sax.saxutils | |
def split_export(source):
    """Split a Dotclear flat export into one CSV file per table.

    The export is read line by line:

    * a line of the form ``[table col1,col2,...]`` starts a new table; the
      column list becomes the header row of ``<source>.d/<table>.csv``;
    * an empty line or a line starting with ``///`` ends the current table;
    * every other line is copied to the current table file.

    Args:
        source: path of the export file.

    Returns:
        The path of the directory holding the CSV files (``source + '.d'``).

    Raises:
        OSError: if the destination directory cannot be created.
        IOError/OSError: if a file cannot be read or written.
    """
    dest = source + '.d'
    try:
        os.mkdir(dest)
    except OSError:
        # An already existing directory is fine; anything else is a real
        # error that would only resurface later as a confusing open() failure.
        if not os.path.isdir(dest):
            raise

    out = None
    try:
        with open(source, 'r') as f:
            for line in f:
                if len(line) <= 1 or line.startswith('///'):
                    # Blank line or '///' marker: the current table is over.
                    if out is not None:
                        out.close()
                        out = None
                elif line[0] == '[':
                    # Table header. Close a table that was not terminated by
                    # a blank line before starting the next one.
                    if out is not None:
                        out.close()
                        out = None
                    space = line.find(' ')
                    table = line[1:space]
                    header = line[space + 1:].rstrip('\r\n')
                    if header.endswith(']'):
                        header = header[:-1]
                    out = open(os.path.join(dest, table + '.csv'), 'w')
                    out.write(header + '\n')
                elif out is not None:
                    # Double the backslash of literal \n and \r sequences so
                    # that the csv readers of this file (escapechar='\\')
                    # give them back unchanged instead of eating the backslash.
                    out.write(line.replace('\\n', '\\\\n').replace('\\r', '\\\\r'))
    finally:
        # The last table is usually not followed by a blank line.
        if out is not None:
            out.close()
    return dest
def load_categories(source):
    """Load the category table from a CSV file.

    Args:
        source: path of a CSV file with at least the columns ``cat_id`` and
            ``cat_title`` (backslash is the escape character).

    Returns:
        A dict mapping each ``cat_id`` to its ``cat_title``, both as the
        strings read from the file.
    """
    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        f = open(source, 'rb')
    else:
        f = open(source, 'r', newline='')
    with f:
        reader = csv.DictReader(f, delimiter=',', escapechar='\\')
        return {row['cat_id']: row['cat_title'] for row in reader}
# [text|target], optionally followed by further |-separated parts
# ([text|target|lang|title]); only text and target are captured.
url_RE = re.compile(r'\[([^|\]]+)\|([^|\]]+)(?:\|[^\]]*)?\]')
# [((thumbnail|alt[|options]))|/public/path]: an image used as a link.
# Captures the alt text and the path below /public/ (optionally /mind/public/).
imglink_RE = re.compile(r'\[\(\([^|]+\|([^|]+)(?:\|[^)]+)?\)\)\|(?:/mind)?/public/([^\]]+)\]')
# ((/public/path|alt[|options])): a plain image. Captures path and alt text.
img_RE = re.compile(r'\(\((?:/mind)?/public/([^|]+)\|([^|]+)(?:\|[^)]+)?\)\)')
# ^^text^^: superscript.
exp_RE = re.compile(r'\^\^(.*?)\^\^')


def markdownify_line(line):
    """Convert the inline wiki markup of a single line to Markdown.

    Handles ``''emphasis''``, ``%%%``, ``@@code@@``, ``^^superscript^^``,
    images stored below ``/public/`` and ``[text|url]`` links. Links carrying
    extra ``|lang|title`` parts keep only their text and URL.

    Args:
        line: one line of wiki text, without its line terminator.

    Returns:
        The line with inline markup rewritten.
    """
    line = line.replace("''", "_").replace(r'%%%', ' ').replace('@@', '`')
    line = exp_RE.sub(r'<sup>\1</sup>', line)
    # Images must be rewritten before links: the linked-image form is itself
    # wrapped in [...|...] and would otherwise be caught by url_RE.
    line = imglink_RE.sub(r'![\1]({{ site.url }}/assets/\2)', line)
    line = img_RE.sub(r'![\2]({{ site.url }}/assets/\1)', line)
    line = url_RE.sub(r'[\1](\2)', line)
    return line
def pygment_lang(name):
    """Translate a code-block language name for use in a highlight tag.

    ``c_mac`` becomes ``objective-c`` and ``html4strict`` becomes ``html``;
    any other name is returned unchanged.
    """
    aliases = {
        'c_mac': 'objective-c',
        'html4strict': 'html',
    }
    return aliases.get(name, name)
def _list_item_prefix(markers):
    """Return the Markdown list prefix for a run of ``*``/``#`` markers."""
    # One marker per nesting level (four spaces of indentation each); the
    # last marker decides between a bullet and a numbered item.
    indent = '    ' * (len(markers) - 1)
    if markers[-1] == '#':
        return indent + '1. '
    return indent + '* '


def markdownify(text):
    """Convert a wiki-formatted text to Markdown.

    The input uses the literal two-character sequences ``\\n`` and ``\\r``
    as line separators: ``\\r`` is dropped and the text is split on ``\\n``.

    Block-level handling:

    * ``///`` ... ``///`` code blocks. When the first line of the block is
      ``[lang]`` the block is wrapped in ``{% highlight lang %}`` /
      ``{% endhighlight %}``; otherwise every line is indented by four
      spaces. ``///html`` blocks are copied through untouched.
    * lines starting with ``*`` or ``#`` become (nested) bullet or numbered
      list items, one nesting level per marker;
    * lines starting with ``>`` become block quotes;
    * ``!!!!title`` and ``!!!title`` become ``=`` and ``-`` underlined
      headers;
    * everything else only goes through ``markdownify_line``.

    Args:
        text: the wiki text.

    Returns:
        The Markdown text, lines joined with real newlines.
    """
    lines = text.replace('\\r', '').split('\\n')
    out = []
    in_list = False
    in_quote = False
    # 0: outside any block, 1: block just opened (language line not seen
    # yet), 2: highlighted block, 3: indented block, 4: raw HTML block.
    in_code = 0
    for line in lines:
        # Handle code blocks
        if in_code:
            if line.startswith('///'):
                if in_code == 2:
                    out.append('{% endhighlight %}')
                in_code = 0
                continue
            if in_code == 1:
                if line.startswith('['):
                    out.append('{%% highlight %s %%}' % pygment_lang(line[1:-1]))
                    in_code = 2
                    continue
                in_code = 3
            if in_code == 3:
                # Markdown only treats a line as code when it is indented by
                # at least four spaces.
                out.append('    ' + line)
            else:
                out.append(line)
            continue
        if line == '///html':
            in_code = 4
            continue
        if line.startswith('///'):
            out.append('')
            in_code = 1
            continue

        # Handle lists
        if line.startswith(('*', '#')):
            if not in_list:
                # Separate the list from the preceding paragraph.
                out.append('')
                in_list = True
            item = line.lstrip('*#')
            markers = line[:len(line) - len(item)]
            out.append(_list_item_prefix(markers) + markdownify_line(item))
            continue
        elif not line.startswith(' '):
            in_list = False

        # Handle quotes
        if line.startswith('>'):
            if not in_quote:
                out.append('')
                in_quote = True
            out.append('> ' + line[1:].rstrip())
            continue
        in_quote = False

        # Handle everything else
        if line.startswith('!!!!'):
            line = markdownify_line(line[4:])
            out.append(line)
            out.append(len(line) * '=')
        elif line.startswith('!!!'):
            line = markdownify_line(line[3:])
            out.append(line)
            out.append(len(line) * '-')
        else:
            out.append(markdownify_line(line))

    # A highlighted block left open at the end of the text would otherwise
    # produce an unbalanced {% highlight %} tag.
    if in_code == 2:
        out.append('{% endhighlight %}')
    return '\n'.join(out)
def cdata_escape(text):
    """Wrap *text* in an XML CDATA section.

    The literal two-character sequences ``\\r`` and ``\\n`` found in the
    input are turned into nothing and a real newline respectively, and any
    ``]]>`` is split across two CDATA sections so that it cannot terminate
    the section early.

    Args:
        text: the text to embed.

    Returns:
        The text enclosed in ``<![CDATA[`` ... ``]]>``.
    """
    # Undo the line-ending escapes first: removing a \r may bring "]]" and
    # ">" together, and that terminator must still be split below.
    text = text.replace('\\r', '').replace('\\n', '\n')
    return '<![CDATA[' + text.replace(']]>', ']]]]><![CDATA[>') + ']]>'
def export_comments(rss, comments, post_id):
    """Write the approved comments of one post as ``<wp:comment>`` elements.

    Only comments whose ``post_id`` equals *post_id* and whose
    ``comment_status`` is ``'1'`` are exported; every exported comment is
    marked ``<wp:comment_approved>1``.

    Args:
        rss: writable file-like object receiving the XML.
        comments: path of the comment CSV file (backslash escape character).
        post_id: identifier of the post, as the string found in the CSV.
    """
    escape = xml.sax.saxutils.escape
    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        f = open(comments, 'rb')
    else:
        f = open(comments, 'r', newline='')
    with f:
        reader = csv.DictReader(f, delimiter=',', escapechar='\\')
        for comment in reader:
            # Skip comments of other posts as well as comments that are not
            # approved (status other than '1').
            if comment['post_id'] != post_id or comment['comment_status'] != '1':
                continue
            rss.write('<wp:comment>\n')
            rss.write('<wp:comment_id>0_' + comment['comment_id'] + '</wp:comment_id>\n')
            rss.write('<wp:comment_author>' + escape(comment['comment_author']) +
                      '</wp:comment_author>\n')
            rss.write('<wp:comment_author_email>' + escape(comment['comment_email']) +
                      '</wp:comment_author_email>\n')
            if comment['comment_site']:
                # Site URLs routinely contain '&', which must be escaped.
                rss.write('<wp:comment_author_url>' + escape(comment['comment_site']) +
                          '</wp:comment_author_url>\n')
            rss.write('<wp:comment_author_IP>' + escape(comment['comment_ip']) +
                      '</wp:comment_author_IP>\n')
            rss.write('<wp:comment_date_gmt>' + escape(comment['comment_dt']) +
                      '</wp:comment_date_gmt>\n')
            rss.write('<wp:comment_content>' + cdata_escape(comment['comment_content']) +
                      '</wp:comment_content>\n')
            rss.write('<wp:comment_approved>1</wp:comment_approved>\n')
            rss.write('<wp:comment_parent>0</wp:comment_parent>\n')
            rss.write('</wp:comment>\n')
def export_posts(source, comments, dest, categories,
                 base_url='http://blog.mymind.fr/blog/'):
    """Export every post as a Markdown file and as an item of ``rss.xml``.

    For each row of *source* whose ``post_type`` is ``post``:

    * an ``<item>`` (title, link, XHTML content, thread identifier, date and
      comments) is appended to ``<dest>/rss.xml``;
    * ``<dest>/_posts/<date>-<slug>.markdown`` is written with a YAML front
      matter followed by the excerpt and content converted by
      ``markdownify``.

    Note that ``export_comments`` rescans the whole comment file once per
    post.

    Args:
        source: path of the post CSV file.
        comments: path of the comment CSV file.
        dest: directory receiving ``rss.xml`` and the ``_posts`` directory.
        categories: dict mapping ``cat_id`` to category title.
        base_url: prefix of the post links written to ``rss.xml``; the link
            is ``base_url + 'YYYY/MM/DD/' + slug + '/'``.

    Raises:
        OSError: if the ``_posts`` directory cannot be created.
    """
    escape = xml.sax.saxutils.escape
    py2 = sys.version_info[0] < 3
    rss_path = os.path.join(dest, 'rss.xml')
    posts_dir = os.path.join(dest, '_posts')
    try:
        os.mkdir(posts_dir)
    except OSError:
        # An already existing directory is fine; anything else is an error.
        if not os.path.isdir(posts_dir):
            raise

    with open(rss_path, 'wb' if py2 else 'w') as rss:
        rss.write('<rss version="2.0"\n'
                  'xmlns:content="http://purl.org/rss/1.0/modules/content/"\n'
                  'xmlns:dsq="http://www.disqus.com/"\n'
                  'xmlns:dc="http://purl.org/dc/elements/1.1/"\n'
                  'xmlns:wp="http://wordpress.org/export/1.0/"\n'
                  '>\n<channel>\n')
        # The csv module wants a binary file on Python 2 but a text file
        # opened with newline='' on Python 3.
        if py2:
            f = open(source, 'rb')
        else:
            f = open(source, 'r', newline='')
        with f:
            reader = csv.DictReader(f, delimiter=',', escapechar='\\')
            for post in reader:
                if post['post_type'] != 'post':
                    continue
                slug = post['post_url'].split('/')[-1].lower()
                date = post['post_dt'].split(' ')[0]
                md = os.path.join(posts_dir, '%s-%s.markdown' % (date, slug))

                # --- rss.xml item ---
                link = base_url + date.replace('-', '/') + '/' + slug + '/'
                rss.write('<item>\n')
                rss.write('<title>' + escape(post['post_title']) + '</title>\n')
                rss.write('<link>' + escape(link) + '</link>\n')
                rss.write('<content:encoded>' +
                          cdata_escape(post['post_excerpt_xhtml'] + post['post_content_xhtml']) +
                          '</content:encoded>\n')
                rss.write('<dsq:thread_identifier>' + escape(date + '-' + slug) +
                          '</dsq:thread_identifier>\n')
                rss.write('<wp:post_date_gmt>' + escape(post['post_dt']) + '</wp:post_date_gmt>\n')
                rss.write('<wp:comment_status>open</wp:comment_status>\n')
                export_comments(rss, comments, post['post_id'])
                rss.write('</item>\n')

                # --- tags stored in the PHP-serialized post_meta column ---
                # NOTE(review): how phpserialize.loads treats text input and
                # whether it returns str or bytes keys on Python 3 is not
                # verified here - confirm before relying on tags there.
                tags = None
                meta = post['post_meta']
                if meta:
                    meta = phpserialize.loads(meta)
                    if 'tag' in meta:
                        tags = meta['tag'].values()

                # --- Markdown file ---
                # Backslashes and double quotes would break the
                # double-quoted YAML scalar used for the title.
                title = post['post_title'].replace('\\', '\\\\').replace('"', '\\"')
                front = [
                    '---',
                    'layout: post',
                    'title: "%s"' % title,
                    'date: %s' % post['post_dt'],
                    'comments: true',
                ]
                # Posts without a (known) category simply get no
                # 'categories' entry instead of aborting the export.
                category = categories.get(post['cat_id'])
                if category is not None:
                    front.append('categories: %s' % category)
                if post['post_password'] or post['post_status'] != '1':
                    front.append('published: false')
                if tags is not None:
                    front.append('tags: [%s]' % ', '.join(tags))
                front.append('---')

                content = ''
                if post['post_excerpt']:
                    content += markdownify(post['post_excerpt'])
                    content += '\n\n<!-- more -->\n\n'
                content += markdownify(post['post_content'])

                with open(md, 'w') as of:
                    of.write('\n'.join(front) + '\n')
                    of.write(content + '\n')
        rss.write('</channel></rss>')
if __name__ == '__main__':
    # Usage: <script> EXPORT_FILE
    # Splits the export into per-table CSV files in EXPORT_FILE.d, then
    # writes rss.xml and the _posts directory into that same directory.
    if len(sys.argv) < 2:
        sys.exit('usage: %s <dotclear-export-file>' % sys.argv[0])
    dest = split_export(sys.argv[1])
    categories = load_categories(os.path.join(dest, 'category.csv'))
    export_posts(os.path.join(dest, 'post.csv'),
                 os.path.join(dest, 'comment.csv'),
                 dest, categories)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment