# Fruneau/dotclear-octopress.py

## dotclear-octopress.py
import sys
import csv
import os
import re
import phpserialize
import xml.sax.saxutils

def split_export(source):
    r"""Split a Dotclear flat export into one CSV file per table.

    The export is read line by line:

    * a line starting with ``[`` opens a table; the text between ``[`` and
      the first space is the table name, the remainder (minus the closing
      ``]``) becomes the header row of ``<source>.d/<table>.csv``;
    * an empty line or a line starting with ``///`` ends the current table;
    * any other line is copied as a data row, with the literal two-character
      sequences ``\n`` and ``\r`` doubled so that they survive a later read
      by ``csv`` with ``escapechar='\\'``.

    Returns the output directory path (``source + '.d'``).
    Raises OSError if the output directory cannot be created.
    """
    dest = source + '.d'

    try:
        os.mkdir(dest)
    except OSError:
        # An already existing directory is fine; anything else is a real error.
        if not os.path.isdir(dest):
            raise

    out = None
    try:
        with open(source, 'r') as f:
            for line in f:
                if len(line) <= 1 or line.startswith('///'):
                    if out is not None:
                        out.close()
                    out = None
                elif line[0] == '[':
                    # Tables that follow each other without a blank line
                    # must not leak the previous handle.
                    if out is not None:
                        out.close()
                    space = line.find(' ')
                    table = line[1:space]
                    header = line[space + 1:].rstrip('\r\n')
                    if header.endswith(']'):
                        header = header[:-1]
                    out = open(os.path.join(dest, table + '.csv'), 'w')
                    out.write(header + '\n')
                elif out is not None:
                    out.write(line.replace('\\n', '\\\\n').replace('\\r', '\\\\r'))
    finally:
        # The export may end without a blank line after the last table.
        if out is not None:
            out.close()
    return dest

def load_categories(source):
    """Read the category CSV and return a ``{cat_id: cat_title}`` dict.

    ``source`` is the path of a comma-separated file with a header row that
    contains at least the ``cat_id`` and ``cat_title`` columns; backslash is
    the escape character.  Keys and values are the strings read from the file.
    """
    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        f = open(source, 'r', newline='')
    else:
        f = open(source, 'rb')
    with f:
        reader = csv.DictReader(f, delimiter=',', escapechar='\\')
        return dict((row['cat_id'], row['cat_title']) for row in reader)

# Wiki link: [label|target]
url_RE = re.compile(r'\[([^|\]]+)\|([^\]]+)\]')
# Image wrapped in a link to a file under /public/ (optionally /mind/public/)
imglink_RE = re.compile(r'\[\(\([^|]+\|([^|]+)(?:\|[^)]+)?\)\)\|(?:/mind)?/public/([^\]]+)\]')
# Inline image under /public/: ((path|alt)) with an optional third field
img_RE = re.compile(r'\(\((?:/mind)?/public/([^|]+)\|([^|]+)(?:\|[^)]+)?\)\)')
# Superscript: ^^text^^
exp_RE = re.compile(r'\^\^(.*?)\^\^')


def markdownify_line(line):
    """Translate the inline wiki markup of a single line to Markdown.

    Plain token substitutions run first (``''`` -> ``_``, ``%%%`` -> two
    spaces, ``@@`` -> backtick), then the regex rewrites in a fixed order:
    superscripts, linked images, images, links.  Links go last so that the
    image forms are consumed before the generic ``[a|b]`` pattern sees them.
    """
    for wiki_token, md_token in (("''", "_"), (r'%%%', '  '), ('@@', '`')):
        line = line.replace(wiki_token, md_token)

    rewrites = (
        (exp_RE, r'<sup>\1</sup>'),
        (imglink_RE, r'![\1]({{ site.url }}/assets/\2)'),
        (img_RE, r'![\2]({{ site.url }}/assets/\1)'),
        (url_RE, r'[\1](\2)'),
    )
    for pattern, template in rewrites:
        line = pattern.sub(template, line)
    return line

def pygment_lang(name):
    """Map a source language tag to the name used in ``{% highlight %}``.

    Only ``c_mac`` and ``html4strict`` are renamed; any other value is
    returned unchanged.
    """
    for alias, lexer in (('c_mac', 'objective-c'), ('html4strict', 'html')):
        if name == alias:
            return lexer
    return name

def markdownify(text):
    r"""Convert a wiki-formatted text to Markdown.

    ``text`` uses the literal two-character sequences ``\n`` and ``\r`` as
    line separators (``\r`` is dropped).  Each line is handled as, in order
    of precedence: part of a ``///`` code block, a list item (``*`` or
    ``#``), a ``>`` quote, a ``!!!!`` / ``!!!`` heading, or ordinary text
    passed through ``markdownify_line``.

    Returns the converted lines joined with real newlines.
    """
    lines = text.replace('\\r', '').split('\\n')
    out = []
    in_list = False
    in_quote = False
    # Code-block state: 0 = outside, 1 = just after an opening '///',
    # 2 = highlighted block ('[lang]' line seen), 3 = plain block rendered
    # as indented code, 4 = '///html' block copied verbatim.
    in_code = 0
    for line in lines:
        # Handle code blocks
        if in_code:
            if line.startswith('///'):
                if in_code == 2:
                    out.append(r'{% endhighlight %}')
                in_code = 0
                continue
            if in_code == 1:
                if line.startswith('['):
                    out.append('{%% highlight %s %%}' % pygment_lang(line[1:-1]))
                    in_code = 2
                    continue
                in_code = 3
            if in_code == 3:
                out.append('    ' + line)
            else:
                out.append(line)
            continue
        elif line == '///html':
            in_code = 4
            continue
        elif line.startswith('///'):
            out.append('')
            in_code = 1
            continue

        # Handle lists
        if line.startswith('*') or line.startswith('#'):
            if not in_list:
                # Markdown needs a blank line before the first item.
                out.append('')
                in_list = True
            # Nesting depth is the number of leading markers; the last one
            # decides whether this item is ordered or not.
            depth = len(line) - len(line.lstrip('*#'))
            # An ordered item needs the trailing period to be a list marker.
            marker = '1. ' if line[depth - 1] == '#' else '*  '
            out.append('    ' * (depth - 1) + marker + markdownify_line(line[depth:]))
            continue
        elif not line.startswith(' '):
            in_list = False

        # Handle quotes
        if line.startswith('>'):
            if not in_quote:
                out.append('')
                in_quote = True
            out.append('> ' + line[1:].rstrip())
            continue
        else:
            in_quote = False

        # Handle everything else: setext headings, then plain text
        if line.startswith('!!!!'):
            title = markdownify_line(line[4:])
            out.append(title)
            out.append(len(title) * '=')
        elif line.startswith('!!!'):
            title = markdownify_line(line[3:])
            out.append(title)
            out.append(len(title) * '-')
        else:
            out.append(markdownify_line(line))

    # A highlighted block left open at the end of the text would otherwise
    # produce an unbalanced Liquid tag.
    if in_code == 2:
        out.append(r'{% endhighlight %}')
    return '\n'.join(out)

def cdata_escape(text):
    r"""Wrap ``text`` in an XML CDATA section.

    Every ``]]>`` inside the text is split across two CDATA sections so it
    cannot close the section early, and the literal two-character sequence
    ``\n`` is replaced by a real newline.
    """
    body = text.replace(']]>', ']]]]><![CDATA[>')
    body = body.replace(r'\n', '\n')
    return ''.join(('<![CDATA[', body, ']]>'))

def export_comments(rss, comments, post_id):
    """Write the published comments of one post as ``<wp:comment>`` elements.

    rss      -- writable file-like object receiving the XML text
    comments -- path of the comment CSV file (backslash-escaped, with header)
    post_id  -- value compared with the ``post_id`` column (a string, as
                read from CSV)

    Only rows whose ``post_id`` matches and whose ``comment_status`` is
    ``'1'`` are written.  Author, e-mail and site are XML-escaped; the
    comment body goes through ``cdata_escape``.
    """
    escape = xml.sax.saxutils.escape

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        f = open(comments, 'r', newline='')
    else:
        f = open(comments, 'rb')
    with f:
        reader = csv.DictReader(f, delimiter=',', escapechar='\\')

        for comment in reader:
            # Skip anything that is not both attached to this post and
            # published; every exported comment is declared approved below.
            if comment['post_id'] != post_id or comment['comment_status'] != '1':
                continue

            rss.write('<wp:comment>\n')
            rss.write('<wp:comment_id>0_' + comment['comment_id'] + '</wp:comment_id>\n')
            rss.write('<wp:comment_author>' + escape(comment['comment_author']) +
                      '</wp:comment_author>\n')
            rss.write('<wp:comment_author_email>' + escape(comment['comment_email']) +
                      '</wp:comment_author_email>\n')
            if comment['comment_site']:
                # URLs routinely contain '&', which must not reach the XML raw.
                rss.write('<wp:comment_author_url>' + escape(comment['comment_site']) +
                          '</wp:comment_author_url>\n')
            rss.write('<wp:comment_author_IP>' + comment['comment_ip'] + '</wp:comment_author_IP>\n')
            rss.write('<wp:comment_date_gmt>' + comment['comment_dt'] + '</wp:comment_date_gmt>\n')
            rss.write('<wp:comment_content>' + cdata_escape(comment['comment_content']) +
                      '</wp:comment_content>\n')
            rss.write('<wp:comment_approved>1</wp:comment_approved>\n')
            rss.write('<wp:comment_parent>0</wp:comment_parent>\n')
            rss.write('</wp:comment>\n')

def export_posts(source, comments, dest, categories,
                 base_url='http://blog.mymind.fr/blog/'):
    """Export every row of type ``post`` to Markdown and to an RSS file.

    source     -- path of the post CSV file (backslash-escaped, with header)
    comments   -- path of the comment CSV file, handed to ``export_comments``
    dest       -- output directory; receives ``rss.xml`` and ``_posts/``
    categories -- dict mapping ``cat_id`` to the category title
    base_url   -- prefix of the ``<link>`` written for each post; the
                  date (``YYYY/MM/DD``) and the slug are appended to it

    For each post one ``<item>`` (with its comments) is appended to
    ``rss.xml`` and one ``_posts/<date>-<slug>.markdown`` file is written
    with a YAML front matter followed by the converted excerpt and content.
    Raises OSError if ``_posts`` cannot be created.
    """
    rss_path = os.path.join(dest, 'rss.xml')
    posts_dir = os.path.join(dest, '_posts')
    try:
        os.mkdir(posts_dir)
    except OSError:
        # An already existing directory is fine; anything else is a real error.
        if not os.path.isdir(posts_dir):
            raise

    escape = xml.sax.saxutils.escape
    py3 = sys.version_info[0] >= 3
    rss_header = (
        '<rss version="2.0"\n'
        '  xmlns:content="http://purl.org/rss/1.0/modules/content/"\n'
        '  xmlns:dsq="http://www.disqus.com/"\n'
        '  xmlns:dc="http://purl.org/dc/elements/1.1/"\n'
        '  xmlns:wp="http://wordpress.org/export/1.0/"\n'
        '>\n<channel>\n')

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3; the RSS handle follows the same split.
    if py3:
        posts_file = open(source, 'r', newline='')
    else:
        posts_file = open(source, 'rb')

    with posts_file, open(rss_path, 'w' if py3 else 'wb') as rss:
        rss.write(rss_header)

        reader = csv.DictReader(posts_file, delimiter=',', escapechar='\\')
        for post in reader:
            if post['post_type'] != 'post':
                continue
            slug = post['post_url'].split('/')[-1].lower()
            date = post['post_dt'].split(' ')[0]
            md = os.path.join(posts_dir, '%s-%s.markdown' % (date, slug))

            rss.write('<item>\n')
            rss.write('<title>' + escape(post['post_title']) + '</title>\n')
            rss.write('<link>' + escape(base_url + date.replace('-', '/') + '/' + slug) +
                      '/</link>\n')
            rss.write('<content:encoded>' +
                      cdata_escape(post['post_excerpt_xhtml'] + post['post_content_xhtml']) +
                      '</content:encoded>\n')
            rss.write('<dsq:thread_identifier>' + date + '-' + slug + '</dsq:thread_identifier>\n')
            rss.write('<wp:post_date_gmt>' + post['post_dt'] + '</wp:post_date_gmt>\n')
            rss.write('<wp:comment_status>open</wp:comment_status>\n')

            export_comments(rss, comments, post['post_id'])

            rss.write('</item>\n')

            meta = post['post_meta']
            tags = None
            if meta:
                # NOTE(review): phpserialize.loads may require bytes on
                # Python 3 -- confirm before running there.
                meta = phpserialize.loads(meta)
                if 'tag' in meta:
                    tags = meta['tag'].values()

            # The title sits inside a double-quoted YAML scalar, so
            # backslashes and quotes have to be escaped.
            title = post['post_title'].replace('\\', '\\\\').replace('"', '\\"')
            front = [
                '---',
                'layout: post',
                'title: "%s"' % title,
                'date: %s' % post['post_dt'],
                'comments: true',
            ]
            # Posts whose cat_id is not in the table simply get no
            # 'categories' entry instead of aborting the whole export.
            category = categories.get(post['cat_id'])
            if category is not None:
                front.append('categories: %s' % category)
            if post['post_password'] or post['post_status'] != '1':
                front.append('published: false')
            if tags is not None:
                front.append('tags: [%s]' % ', '.join(tags))
            front.append('---')

            content = ''
            if post['post_excerpt']:
                content += markdownify(post['post_excerpt'])
                content += '\n\n<!-- more -->\n\n'
            content += markdownify(post['post_content'])

            with open(md, 'w') as of:
                of.write('\n'.join(front) + '\n')
                of.write(content + '\n')
        rss.write('</channel></rss>')

if __name__ == '__main__':
    # Usage: dotclear-octopress.py <export-file>
    # Splits the export into <export-file>.d/*.csv, then writes the
    # Markdown posts and rss.xml into that same directory.
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s <dotclear-export-file>\n' % sys.argv[0])
        sys.exit(1)

    dest = split_export(sys.argv[1])

    categories = load_categories(os.path.join(dest, 'category.csv'))

    export_posts(os.path.join(dest, 'post.csv'),
                 os.path.join(dest, 'comment.csv'),
                 dest, categories)
	import sys
	import csv
	import os
	import re
	import phpserialize
	import xml.sax.saxutils

	def split_export(source):
	    r"""Split a Dotclear flat export into one CSV file per table.

	    The export is read line by line:

	    * a line starting with ``[`` opens a table; the text between ``[`` and
	      the first space is the table name, the remainder (minus the closing
	      ``]``) becomes the header row of ``<source>.d/<table>.csv``;
	    * an empty line or a line starting with ``///`` ends the current table;
	    * any other line is copied as a data row, with the literal two-character
	      sequences ``\n`` and ``\r`` doubled so that they survive a later read
	      by ``csv`` with ``escapechar='\\'``.

	    Returns the output directory path (``source + '.d'``).
	    Raises OSError if the output directory cannot be created.
	    """
	    dest = source + '.d'

	    try:
	        os.mkdir(dest)
	    except OSError:
	        # An already existing directory is fine; anything else is a real error.
	        if not os.path.isdir(dest):
	            raise

	    out = None
	    try:
	        with open(source, 'r') as f:
	            for line in f:
	                if len(line) <= 1 or line.startswith('///'):
	                    if out is not None:
	                        out.close()
	                    out = None
	                elif line[0] == '[':
	                    # Tables that follow each other without a blank line
	                    # must not leak the previous handle.
	                    if out is not None:
	                        out.close()
	                    space = line.find(' ')
	                    table = line[1:space]
	                    header = line[space + 1:].rstrip('\r\n')
	                    if header.endswith(']'):
	                        header = header[:-1]
	                    out = open(os.path.join(dest, table + '.csv'), 'w')
	                    out.write(header + '\n')
	                elif out is not None:
	                    out.write(line.replace('\\n', '\\\\n').replace('\\r', '\\\\r'))
	    finally:
	        # The export may end without a blank line after the last table.
	        if out is not None:
	            out.close()
	    return dest

	def load_categories(source):
	    """Read the category CSV and return a ``{cat_id: cat_title}`` dict.

	    ``source`` is the path of a comma-separated file with a header row that
	    contains at least the ``cat_id`` and ``cat_title`` columns; backslash is
	    the escape character.  Keys and values are the strings read from the file.
	    """
	    # The csv module wants a binary file on Python 2 but a text file opened
	    # with newline='' on Python 3.
	    if sys.version_info[0] >= 3:
	        f = open(source, 'r', newline='')
	    else:
	        f = open(source, 'rb')
	    with f:
	        reader = csv.DictReader(f, delimiter=',', escapechar='\\')
	        return dict((row['cat_id'], row['cat_title']) for row in reader)

	# In these patterns the field separator is an escaped pipe ('\|'); written
	# as '\\|' it would become a literal backslash followed by an alternation.

	# Wiki link: [label|target]
	url_RE = re.compile(r'\[([^|\]]+)\|([^\]]+)\]')
	# Image wrapped in a link to a file under /public/ (optionally /mind/public/)
	imglink_RE = re.compile(r'\[\(\([^|]+\|([^|]+)(?:\|[^)]+)?\)\)\|(?:/mind)?/public/([^\]]+)\]')
	# Inline image under /public/: ((path|alt)) with an optional third field
	img_RE = re.compile(r'\(\((?:/mind)?/public/([^|]+)\|([^|]+)(?:\|[^)]+)?\)\)')
	# Superscript: ^^text^^
	exp_RE = re.compile(r'\^\^(.*?)\^\^')


	def markdownify_line(line):
	    """Translate the inline wiki markup of a single line to Markdown.

	    Plain token substitutions run first (``''`` -> ``_``, ``%%%`` -> two
	    spaces, ``@@`` -> backtick), then the regex rewrites in a fixed order:
	    superscripts, linked images, images, links.  Links go last so that the
	    image forms are consumed before the generic ``[a|b]`` pattern sees them.
	    """
	    # '%%%' becomes two spaces: a Markdown hard line break needs both.
	    for wiki_token, md_token in (("''", "_"), (r'%%%', '  '), ('@@', '`')):
	        line = line.replace(wiki_token, md_token)

	    rewrites = (
	        (exp_RE, r'<sup>\1</sup>'),
	        (imglink_RE, r'![\1]({{ site.url }}/assets/\2)'),
	        (img_RE, r'![\2]({{ site.url }}/assets/\1)'),
	        (url_RE, r'[\1](\2)'),
	    )
	    for pattern, template in rewrites:
	        line = pattern.sub(template, line)
	    return line

	def pygment_lang(name):
	    """Map a source language tag to the name used in ``{% highlight %}``.

	    Only ``c_mac`` and ``html4strict`` are renamed; any other value is
	    returned unchanged.
	    """
	    for alias, lexer in (('c_mac', 'objective-c'), ('html4strict', 'html')):
	        if name == alias:
	            return lexer
	    return name

	def markdownify(text):
	    r"""Convert a wiki-formatted text to Markdown.

	    ``text`` uses the literal two-character sequences ``\n`` and ``\r`` as
	    line separators (``\r`` is dropped).  Each line is handled as, in order
	    of precedence: part of a ``///`` code block, a list item (``*`` or
	    ``#``), a ``>`` quote, a ``!!!!`` / ``!!!`` heading, or ordinary text
	    passed through ``markdownify_line``.

	    Returns the converted lines joined with real newlines.
	    """
	    lines = text.replace('\\r', '').split('\\n')
	    out = []
	    in_list = False
	    in_quote = False
	    # Code-block state: 0 = outside, 1 = just after an opening '///',
	    # 2 = highlighted block ('[lang]' line seen), 3 = plain block rendered
	    # as indented code, 4 = '///html' block copied verbatim.
	    in_code = 0
	    for line in lines:
	        # Handle code blocks
	        if in_code:
	            if line.startswith('///'):
	                if in_code == 2:
	                    out.append(r'{% endhighlight %}')
	                in_code = 0
	                continue
	            if in_code == 1:
	                if line.startswith('['):
	                    out.append('{%% highlight %s %%}' % pygment_lang(line[1:-1]))
	                    in_code = 2
	                    continue
	                in_code = 3
	            if in_code == 3:
	                # Four spaces: the Markdown indented-code-block prefix.
	                out.append('    ' + line)
	            else:
	                out.append(line)
	            continue
	        elif line == '///html':
	            in_code = 4
	            continue
	        elif line.startswith('///'):
	            out.append('')
	            in_code = 1
	            continue

	        # Handle lists
	        if line.startswith('*') or line.startswith('#'):
	            if not in_list:
	                # Markdown needs a blank line before the first item.
	                out.append('')
	                in_list = True
	            # Nesting depth is the number of leading markers; the last one
	            # decides whether this item is ordered or not.
	            depth = len(line) - len(line.lstrip('*#'))
	            # An ordered item needs the trailing period to be a list marker.
	            marker = '1. ' if line[depth - 1] == '#' else '*  '
	            out.append('    ' * (depth - 1) + marker + markdownify_line(line[depth:]))
	            continue
	        elif not line.startswith(' '):
	            in_list = False

	        # Handle quotes
	        if line.startswith('>'):
	            if not in_quote:
	                out.append('')
	                in_quote = True
	            out.append('> ' + line[1:].rstrip())
	            continue
	        else:
	            in_quote = False

	        # Handle everything else: setext headings, then plain text
	        if line.startswith('!!!!'):
	            title = markdownify_line(line[4:])
	            out.append(title)
	            out.append(len(title) * '=')
	        elif line.startswith('!!!'):
	            title = markdownify_line(line[3:])
	            out.append(title)
	            out.append(len(title) * '-')
	        else:
	            out.append(markdownify_line(line))

	    # A highlighted block left open at the end of the text would otherwise
	    # produce an unbalanced Liquid tag.
	    if in_code == 2:
	        out.append(r'{% endhighlight %}')
	    return '\n'.join(out)

	def cdata_escape(text):
	    r"""Wrap ``text`` in an XML CDATA section.

	    Every ``]]>`` inside the text is split across two CDATA sections so it
	    cannot close the section early, and the literal two-character sequence
	    ``\n`` is replaced by a real newline.
	    """
	    body = text.replace(']]>', ']]]]><![CDATA[>')
	    body = body.replace(r'\n', '\n')
	    return ''.join(('<![CDATA[', body, ']]>'))

	def export_comments(rss, comments, post_id):
	    """Write the published comments of one post as ``<wp:comment>`` elements.

	    rss      -- writable file-like object receiving the XML text
	    comments -- path of the comment CSV file (backslash-escaped, with header)
	    post_id  -- value compared with the ``post_id`` column (a string, as
	                read from CSV)

	    Only rows whose ``post_id`` matches and whose ``comment_status`` is
	    ``'1'`` are written.  Author, e-mail and site are XML-escaped; the
	    comment body goes through ``cdata_escape``.
	    """
	    escape = xml.sax.saxutils.escape

	    # The csv module wants a binary file on Python 2 but a text file opened
	    # with newline='' on Python 3.
	    if sys.version_info[0] >= 3:
	        f = open(comments, 'r', newline='')
	    else:
	        f = open(comments, 'rb')
	    with f:
	        reader = csv.DictReader(f, delimiter=',', escapechar='\\')

	        for comment in reader:
	            # Skip anything that is not both attached to this post and
	            # published; every exported comment is declared approved below.
	            if comment['post_id'] != post_id or comment['comment_status'] != '1':
	                continue

	            rss.write('<wp:comment>\n')
	            rss.write('<wp:comment_id>0_' + comment['comment_id'] + '</wp:comment_id>\n')
	            rss.write('<wp:comment_author>' + escape(comment['comment_author']) +
	                      '</wp:comment_author>\n')
	            rss.write('<wp:comment_author_email>' + escape(comment['comment_email']) +
	                      '</wp:comment_author_email>\n')
	            if comment['comment_site']:
	                # URLs routinely contain '&', which must not reach the XML raw.
	                rss.write('<wp:comment_author_url>' + escape(comment['comment_site']) +
	                          '</wp:comment_author_url>\n')
	            rss.write('<wp:comment_author_IP>' + comment['comment_ip'] + '</wp:comment_author_IP>\n')
	            rss.write('<wp:comment_date_gmt>' + comment['comment_dt'] + '</wp:comment_date_gmt>\n')
	            rss.write('<wp:comment_content>' + cdata_escape(comment['comment_content']) +
	                      '</wp:comment_content>\n')
	            rss.write('<wp:comment_approved>1</wp:comment_approved>\n')
	            rss.write('<wp:comment_parent>0</wp:comment_parent>\n')
	            rss.write('</wp:comment>\n')

	def export_posts(source, comments, dest, categories,
	                 base_url='http://blog.mymind.fr/blog/'):
	    """Export every row of type ``post`` to Markdown and to an RSS file.

	    source     -- path of the post CSV file (backslash-escaped, with header)
	    comments   -- path of the comment CSV file, handed to ``export_comments``
	    dest       -- output directory; receives ``rss.xml`` and ``_posts/``
	    categories -- dict mapping ``cat_id`` to the category title
	    base_url   -- prefix of the ``<link>`` written for each post; the
	                  date (``YYYY/MM/DD``) and the slug are appended to it

	    For each post one ``<item>`` (with its comments) is appended to
	    ``rss.xml`` and one ``_posts/<date>-<slug>.markdown`` file is written
	    with a YAML front matter followed by the converted excerpt and content.
	    Raises OSError if ``_posts`` cannot be created.
	    """
	    rss_path = os.path.join(dest, 'rss.xml')
	    posts_dir = os.path.join(dest, '_posts')
	    try:
	        os.mkdir(posts_dir)
	    except OSError:
	        # An already existing directory is fine; anything else is a real error.
	        if not os.path.isdir(posts_dir):
	            raise

	    escape = xml.sax.saxutils.escape
	    py3 = sys.version_info[0] >= 3
	    rss_header = (
	        '<rss version="2.0"\n'
	        '  xmlns:content="http://purl.org/rss/1.0/modules/content/"\n'
	        '  xmlns:dsq="http://www.disqus.com/"\n'
	        '  xmlns:dc="http://purl.org/dc/elements/1.1/"\n'
	        '  xmlns:wp="http://wordpress.org/export/1.0/"\n'
	        '>\n<channel>\n')

	    # The csv module wants a binary file on Python 2 but a text file opened
	    # with newline='' on Python 3; the RSS handle follows the same split.
	    if py3:
	        posts_file = open(source, 'r', newline='')
	    else:
	        posts_file = open(source, 'rb')

	    with posts_file, open(rss_path, 'w' if py3 else 'wb') as rss:
	        rss.write(rss_header)

	        reader = csv.DictReader(posts_file, delimiter=',', escapechar='\\')
	        for post in reader:
	            if post['post_type'] != 'post':
	                continue
	            slug = post['post_url'].split('/')[-1].lower()
	            date = post['post_dt'].split(' ')[0]
	            md = os.path.join(posts_dir, '%s-%s.markdown' % (date, slug))

	            rss.write('<item>\n')
	            rss.write('<title>' + escape(post['post_title']) + '</title>\n')
	            rss.write('<link>' + escape(base_url + date.replace('-', '/') + '/' + slug) +
	                      '/</link>\n')
	            rss.write('<content:encoded>' +
	                      cdata_escape(post['post_excerpt_xhtml'] + post['post_content_xhtml']) +
	                      '</content:encoded>\n')
	            rss.write('<dsq:thread_identifier>' + date + '-' + slug + '</dsq:thread_identifier>\n')
	            rss.write('<wp:post_date_gmt>' + post['post_dt'] + '</wp:post_date_gmt>\n')
	            rss.write('<wp:comment_status>open</wp:comment_status>\n')

	            export_comments(rss, comments, post['post_id'])

	            rss.write('</item>\n')

	            meta = post['post_meta']
	            tags = None
	            if meta:
	                # NOTE(review): phpserialize.loads may require bytes on
	                # Python 3 -- confirm before running there.
	                meta = phpserialize.loads(meta)
	                if 'tag' in meta:
	                    tags = meta['tag'].values()

	            # The title sits inside a double-quoted YAML scalar, so
	            # backslashes and quotes have to be escaped.
	            title = post['post_title'].replace('\\', '\\\\').replace('"', '\\"')
	            front = [
	                '---',
	                'layout: post',
	                'title: "%s"' % title,
	                'date: %s' % post['post_dt'],
	                'comments: true',
	            ]
	            # Posts whose cat_id is not in the table simply get no
	            # 'categories' entry instead of aborting the whole export.
	            category = categories.get(post['cat_id'])
	            if category is not None:
	                front.append('categories: %s' % category)
	            if post['post_password'] or post['post_status'] != '1':
	                front.append('published: false')
	            if tags is not None:
	                front.append('tags: [%s]' % ', '.join(tags))
	            front.append('---')

	            content = ''
	            if post['post_excerpt']:
	                content += markdownify(post['post_excerpt'])
	                content += '\n\n<!-- more -->\n\n'
	            content += markdownify(post['post_content'])

	            with open(md, 'w') as of:
	                of.write('\n'.join(front) + '\n')
	                of.write(content + '\n')
	        rss.write('</channel></rss>')

	if __name__ == '__main__':
	    # Usage: dotclear-octopress.py <export-file>
	    # Splits the export into <export-file>.d/*.csv, then writes the
	    # Markdown posts and rss.xml into that same directory.
	    if len(sys.argv) < 2:
	        sys.stderr.write('usage: %s <dotclear-export-file>\n' % sys.argv[0])
	        sys.exit(1)

	    dest = split_export(sys.argv[1])

	    categories = load_categories(os.path.join(dest, 'category.csv'))

	    export_posts(os.path.join(dest, 'post.csv'),
	                 os.path.join(dest, 'comment.csv'),
	                 dest, categories)