Skip to content

Instantly share code, notes, and snippets.

@tiramiseb
Created May 30, 2016 08:15
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tiramiseb/048db9c21c3fac8a44fd76cddaa17834 to your computer and use it in GitHub Desktop.
Save tiramiseb/048db9c21c3fac8a44fd76cddaa17834 to your computer and use it in GitHub Desktop.
Import WordPress comments to Pelican Comment System
#!/usr/bin/env python3
"""
This script :
* extracts comments from a WordPress eXtended RSS export file
* creates comment files for the Pelican comment system, in a "content/comments"
directory in the current path
The script takes the XML file name as its only argument.
It has only been tested with a single-user wordpress full export
"""
import os
import sys
import xml.etree.ElementTree as ET
# XML namespace used by the WordPress eXtended RSS (WXR) export elements.
ns = {'wp': 'http://wordpress.org/export/1.2/'}


def _text(element, tag):
    """Return the text of child *tag* of *element*.

    An empty string is returned when the child is missing or has no text, so
    callers never have to deal with ``None``.
    """
    return element.findtext(tag, default='', namespaces=ns)


def _html_to_rst(content):
    """Convert the HTML fragments found in a comment body to reStructuredText.

    * ``<em>``/``</em>`` become ``*``.
    * The sequence "`s" is replaced with "à" (an artefact observed in the
      original author's export).
    * A ``<code>`` ... ``</code>`` span covering several lines becomes a
      literal block introduced by ``::``.
    * A ``<code>`` ... ``</code>`` pair on one single line becomes an inline
      literal, so it does not leave the converter in "code" state and indent
      the remainder of the comment.
    """
    text = content.replace('<em>', '*')
    text = text.replace('</em>', '*')
    text = text.replace('`s', 'à')

    converted = []
    in_code = False
    for line in text.split('\n'):
        opens = '<code>' in line
        closes = '</code>' in line
        if opens and closes and not in_code:
            # Inline code: opened and closed on the same line.
            converted.append(
                line.replace('<code>', '``').replace('</code>', '``'))
        elif opens and not in_code:
            in_code = True
            converted.append('')
            converted.append('::')
            converted.append(line.replace('<code>', '\n '))
        elif closes and in_code:
            in_code = False
            converted.append(' {}'.format(line.replace('</code>', '\n\n')))
        elif in_code:
            # Lines inside a literal block must be indented.
            converted.append(' {}'.format(line))
        else:
            converted.append(line)
    return '\n'.join(converted)


def _collect_comments(document):
    """Return the comments of one ``<item>`` as a list of dicts.

    Comments are renumbered from 1 in document order; ``parent`` holds the new
    number of the parent comment, or ``None`` when the comment is not a reply
    (or when its parent has not been seen yet).
    """
    comments = []
    slugs = {}  # WordPress comment id -> new sequential number
    numbered = enumerate(document.findall('wp:comment', ns), start=1)
    for idslug, commentag in numbered:
        slugs[_text(commentag, 'wp:comment_id')] = idslug
        # Here change the author name, email addresses or URL if needed
        comments.append({
            'id': str(idslug),
            'author': _text(commentag, 'wp:comment_author'),
            'email': _text(commentag, 'wp:comment_author_email'),
            'url': _text(commentag, 'wp:comment_author_url'),
            'date': _text(commentag, 'wp:comment_date'),
            'content': _html_to_rst(_text(commentag, 'wp:comment_content')),
            'parent': slugs.get(_text(commentag, 'wp:comment_parent')),
        })
    return comments


def _write_comments(slug, comments):
    """Write one ``<number>.rst`` file per comment in content/comments/<slug>."""
    directory = os.path.join('content', 'comments', slug)
    os.makedirs(directory, exist_ok=True)
    for comment in comments:
        path = os.path.join(directory, '{}.rst'.format(comment['id']))
        # Explicit UTF-8 so accented characters do not depend on the locale.
        with open(path, 'w', encoding='utf-8') as commentfile:
            commentfile.write(':author: {}\n'.format(comment['author']))
            commentfile.write(':email: {}\n'.format(comment['email']))
            commentfile.write(':date: {}\n'.format(comment['date']))
            if comment['url']:
                commentfile.write(':authorurl: {}\n'.format(comment['url']))
            if comment['parent']:
                # The parent is referenced as "<number>rst".
                commentfile.write(':replyto: {}rst\n'.format(
                    comment['parent']))
            commentfile.write('\n')
            commentfile.write(comment['content'])


def main(argv=None):
    """Extract the comments of a WordPress export into content/comments.

    *argv* defaults to ``sys.argv``; ``argv[1]`` is the path of the XML export
    file. The output tree is created in the current working directory.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit('usage: {} WORDPRESS_EXPORT.xml'.format(
            argv[0] if argv else 'import_comments'))

    os.makedirs(os.path.join('content', 'comments'), exist_ok=True)

    channel = ET.parse(argv[1]).getroot().find('channel')
    for document in channel.findall('item'):
        comments = _collect_comments(document)
        if not comments:
            continue
        # The slug is the last path component of a link ending with "/".
        slug = document.find('link').text.split('/')[-2]
        _write_comments(slug, comments)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment