tiramiseb/extract_wp_comments_for_pelican.py

## extract_wp_comments_for_pelican.py
#!/usr/bin/env python3
"""
Convert the comments of a WordPress export into Pelican comment files.

This script:

* extracts comments from a WordPress eXtended RSS export file
* creates comment files for the Pelican comment system, in a "content/comments"
  directory in the current path (one sub-directory per post, one ``.rst``
  file per comment)

The script takes the XML file name as its only argument.

It has only been tested with a single-user WordPress full export.
"""

import os
import sys
import xml.etree.ElementTree as ET

# Namespace prefix map for the WordPress-specific elements of the export.
ns = {'wp': 'http://wordpress.org/export/1.2/'}

# Where the comment tree is created, relative to the current directory.
COMMENTS_DIR = os.path.join('content', 'comments')

# Written as the author when the export carries no author name, so the
# header never contains the literal string "None".
DEFAULT_AUTHOR = 'Anonymous'


def _text(element, name):
    """Return the text of child *name* of *element*, or '' if missing/empty."""
    return element.findtext(name, default='', namespaces=ns) or ''


def slug_from_link(link):
    """Return the last non-empty path segment of a post permalink.

    Works whether or not the link ends with a slash; returns '' for an
    empty link.
    """
    return link.strip().rstrip('/').split('/')[-1]


def _inline_code(line):
    """Rewrite every ``<code>...</code>`` pair closed on *line* as ``...``.

    An opening tag without a closing tag on the same line is left untouched
    so the caller can start a multi-line literal block from it.
    """
    pieces = []
    rest = line
    while True:
        before, opened, after = rest.partition('<code>')
        if not opened:
            break
        inner, closed, tail = after.partition('</code>')
        if not closed:
            break
        pieces.append(before)
        inner = inner.strip()  # inline literals may not start/end with spaces
        if inner:
            pieces.append('``{}``'.format(inner))
        rest = tail
    pieces.append(rest)
    return ''.join(pieces)


def convert_content(content):
    """Convert the HTML-ish body of a WordPress comment to reStructuredText.

    *content* may be ``None`` (comment without text); the result is then ''.

    Handled constructs:

    * ``<em>``/``</em>`` become ``*``
    * the two-character sequence backtick + "s" becomes "à" (a garbling
      observed in the export this script was written for)
    * a ``<code>`` section spanning several lines becomes a ``::`` literal
      block indented by four spaces
    * a ``<code>`` section opened and closed on one line becomes an inline
      literal, instead of opening a block that is never closed
    """
    text = (content or '')
    text = text.replace('<em>', '*').replace('</em>', '*').replace('`s', 'à')
    converted = []
    incode = False
    for line in text.split('\n'):
        if not incode:
            line = _inline_code(line)
        if '<code>' in line and not incode:
            incode = True
            converted.append('')
            converted.append('::')
            converted.append(line.replace('<code>', '\n    '))
        elif '</code>' in line and incode:
            incode = False
            converted.append('    {}'.format(line.replace('</code>', '\n\n')))
        elif incode:
            converted.append('    {}'.format(line))
        else:
            converted.append(line)
    return '\n'.join(converted)


def collect_comments(document):
    """Return the comments of one ``<item>`` element as a list of dicts.

    Comments are numbered 1, 2, 3... in document order; that number is the
    comment's 'id' (as a string). 'parent' is the number of the parent
    comment, or ``None`` when the comment has no parent already seen in
    this item.
    """
    comments = []
    numbers = {}  # WordPress comment id -> sequential number in this post
    for number, tag in enumerate(document.findall('wp:comment', ns), start=1):
        numbers[_text(tag, 'wp:comment_id')] = number
        parent = numbers.get(_text(tag, 'wp:comment_parent'))
        author = _text(tag, 'wp:comment_author') or DEFAULT_AUTHOR
        # Here change the author name if needed
        email = _text(tag, 'wp:comment_author_email')
        # Here, change some email addresses if needed
        url = _text(tag, 'wp:comment_author_url')
        # Here, change the URL if needed
        comments.append({
            'id': str(number),
            'author': author,
            'email': email,
            'url': url,
            'date': _text(tag, 'wp:comment_date'),
            'content': convert_content(tag.findtext('wp:comment_content',
                                                    namespaces=ns)),
            'parent': parent,
        })
    return comments


def write_comment(directory, comment):
    """Write one comment dict as ``<directory>/<id>.rst``.

    The file is always UTF-8 so the output does not depend on the locale.
    Optional fields (email, author URL, reply-to) are left out when empty.
    """
    header = [':author: {}\n'.format(comment['author'])]
    if comment['email']:
        header.append(':email: {}\n'.format(comment['email']))
    header.append(':date: {}\n'.format(comment['date']))
    if comment['url']:
        header.append(':authorurl: {}\n'.format(comment['url']))
    if comment['parent']:
        # Parent number immediately followed by "rst", exactly as the
        # original script wrote it.
        # NOTE(review): presumably the comment system's identifier form
        # (file name without the dot) - confirm against the plugin docs.
        header.append(':replyto: {}rst\n'.format(comment['parent']))
    path = os.path.join(directory, '{}.rst'.format(comment['id']))
    with open(path, 'w', encoding='utf-8') as commentfile:
        commentfile.writelines(header)
        commentfile.write('\n')
        commentfile.write(comment['content'])


def export_comments(xml_path, output_dir=COMMENTS_DIR):
    """Write every comment found in *xml_path* below *output_dir*.

    *output_dir* is created if needed, then one sub-directory per post
    (named after the last segment of the post's link) receives one file
    per comment. Posts without comments produce nothing.

    Returns the number of comment files written.
    Raises ``ValueError`` if the document has no ``<channel>`` element.
    """
    os.makedirs(output_dir, exist_ok=True)
    channel = ET.parse(xml_path).getroot().find('channel')
    if channel is None:
        raise ValueError('{}: no <channel> element found'.format(xml_path))
    written = 0
    for document in channel.findall('item'):
        comments = collect_comments(document)
        if not comments:
            continue
        slug = slug_from_link(_text(document, 'link'))
        if not slug:
            # Without a slug the files would land directly in output_dir.
            sys.stderr.write(
                'Skipping {} comment(s): post has no usable link\n'.format(
                    len(comments)))
            continue
        directory = os.path.join(output_dir, slug)
        os.makedirs(directory, exist_ok=True)
        for comment in comments:
            write_comment(directory, comment)
            written += 1
    return written


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.stderr.write('Usage: {} WORDPRESS_EXPORT.xml\n'.format(
            os.path.basename(sys.argv[0])))
        return 2
    export_comments(args[0])
    return 0


if __name__ == '__main__':
    sys.exit(main())
	#!/usr/bin/env python3
"""
Convert the comments of a WordPress export into Pelican comment files.

This script:

* extracts comments from a WordPress eXtended RSS export file
* creates comment files for the Pelican comment system, in a "content/comments"
  directory in the current path (one sub-directory per post, one ``.rst``
  file per comment)

The script takes the XML file name as its only argument.

It has only been tested with a single-user WordPress full export.
"""

import os
import sys
import xml.etree.ElementTree as ET

# Namespace prefix map for the WordPress-specific elements of the export.
ns = {'wp': 'http://wordpress.org/export/1.2/'}

# Where the comment tree is created, relative to the current directory.
COMMENTS_DIR = os.path.join('content', 'comments')

# Written as the author when the export carries no author name, so the
# header never contains the literal string "None".
DEFAULT_AUTHOR = 'Anonymous'


def _text(element, name):
    """Return the text of child *name* of *element*, or '' if missing/empty."""
    return element.findtext(name, default='', namespaces=ns) or ''


def slug_from_link(link):
    """Return the last non-empty path segment of a post permalink.

    Works whether or not the link ends with a slash; returns '' for an
    empty link.
    """
    return link.strip().rstrip('/').split('/')[-1]


def _inline_code(line):
    """Rewrite every ``<code>...</code>`` pair closed on *line* as ``...``.

    An opening tag without a closing tag on the same line is left untouched
    so the caller can start a multi-line literal block from it.
    """
    pieces = []
    rest = line
    while True:
        before, opened, after = rest.partition('<code>')
        if not opened:
            break
        inner, closed, tail = after.partition('</code>')
        if not closed:
            break
        pieces.append(before)
        inner = inner.strip()  # inline literals may not start/end with spaces
        if inner:
            pieces.append('``{}``'.format(inner))
        rest = tail
    pieces.append(rest)
    return ''.join(pieces)


def convert_content(content):
    """Convert the HTML-ish body of a WordPress comment to reStructuredText.

    *content* may be ``None`` (comment without text); the result is then ''.

    Handled constructs:

    * ``<em>``/``</em>`` become ``*``
    * the two-character sequence backtick + "s" becomes "à" (a garbling
      observed in the export this script was written for)
    * a ``<code>`` section spanning several lines becomes a ``::`` literal
      block indented by four spaces
    * a ``<code>`` section opened and closed on one line becomes an inline
      literal, instead of opening a block that is never closed
    """
    text = (content or '')
    text = text.replace('<em>', '*').replace('</em>', '*').replace('`s', 'à')
    converted = []
    incode = False
    for line in text.split('\n'):
        if not incode:
            line = _inline_code(line)
        if '<code>' in line and not incode:
            incode = True
            converted.append('')
            converted.append('::')
            converted.append(line.replace('<code>', '\n    '))
        elif '</code>' in line and incode:
            incode = False
            converted.append('    {}'.format(line.replace('</code>', '\n\n')))
        elif incode:
            converted.append('    {}'.format(line))
        else:
            converted.append(line)
    return '\n'.join(converted)


def collect_comments(document):
    """Return the comments of one ``<item>`` element as a list of dicts.

    Comments are numbered 1, 2, 3... in document order; that number is the
    comment's 'id' (as a string). 'parent' is the number of the parent
    comment, or ``None`` when the comment has no parent already seen in
    this item.
    """
    comments = []
    numbers = {}  # WordPress comment id -> sequential number in this post
    for number, tag in enumerate(document.findall('wp:comment', ns), start=1):
        numbers[_text(tag, 'wp:comment_id')] = number
        parent = numbers.get(_text(tag, 'wp:comment_parent'))
        author = _text(tag, 'wp:comment_author') or DEFAULT_AUTHOR
        # Here change the author name if needed
        email = _text(tag, 'wp:comment_author_email')
        # Here, change some email addresses if needed
        url = _text(tag, 'wp:comment_author_url')
        # Here, change the URL if needed
        comments.append({
            'id': str(number),
            'author': author,
            'email': email,
            'url': url,
            'date': _text(tag, 'wp:comment_date'),
            'content': convert_content(tag.findtext('wp:comment_content',
                                                    namespaces=ns)),
            'parent': parent,
        })
    return comments


def write_comment(directory, comment):
    """Write one comment dict as ``<directory>/<id>.rst``.

    The file is always UTF-8 so the output does not depend on the locale.
    Optional fields (email, author URL, reply-to) are left out when empty.
    """
    header = [':author: {}\n'.format(comment['author'])]
    if comment['email']:
        header.append(':email: {}\n'.format(comment['email']))
    header.append(':date: {}\n'.format(comment['date']))
    if comment['url']:
        header.append(':authorurl: {}\n'.format(comment['url']))
    if comment['parent']:
        # Parent number immediately followed by "rst", exactly as the
        # original script wrote it.
        # NOTE(review): presumably the comment system's identifier form
        # (file name without the dot) - confirm against the plugin docs.
        header.append(':replyto: {}rst\n'.format(comment['parent']))
    path = os.path.join(directory, '{}.rst'.format(comment['id']))
    with open(path, 'w', encoding='utf-8') as commentfile:
        commentfile.writelines(header)
        commentfile.write('\n')
        commentfile.write(comment['content'])


def export_comments(xml_path, output_dir=COMMENTS_DIR):
    """Write every comment found in *xml_path* below *output_dir*.

    *output_dir* is created if needed, then one sub-directory per post
    (named after the last segment of the post's link) receives one file
    per comment. Posts without comments produce nothing.

    Returns the number of comment files written.
    Raises ``ValueError`` if the document has no ``<channel>`` element.
    """
    os.makedirs(output_dir, exist_ok=True)
    channel = ET.parse(xml_path).getroot().find('channel')
    if channel is None:
        raise ValueError('{}: no <channel> element found'.format(xml_path))
    written = 0
    for document in channel.findall('item'):
        comments = collect_comments(document)
        if not comments:
            continue
        slug = slug_from_link(_text(document, 'link'))
        if not slug:
            # Without a slug the files would land directly in output_dir.
            sys.stderr.write(
                'Skipping {} comment(s): post has no usable link\n'.format(
                    len(comments)))
            continue
        directory = os.path.join(output_dir, slug)
        os.makedirs(directory, exist_ok=True)
        for comment in comments:
            write_comment(directory, comment)
            written += 1
    return written


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.stderr.write('Usage: {} WORDPRESS_EXPORT.xml\n'.format(
            os.path.basename(sys.argv[0])))
        return 2
    export_comments(args[0])
    return 0


if __name__ == '__main__':
    sys.exit(main())