Skip to content

Instantly share code, notes, and snippets.

@veeneck
Created May 17, 2018 12:46
Show Gist options
  • Save veeneck/cff15cb978993d40011e6800417ac110 to your computer and use it in GitHub Desktop.
Save veeneck/cff15cb978993d40011e6800417ac110 to your computer and use it in GitHub Desktop.
Import Wordpress comments into Hugo
import xmltodict
import time
import hashlib
import os
import cgi
"""
Desired structure based on my post URLS: /data/comments/2016-07-12-post-slug/comment-uniquehash.yml
Data in comment is structured like this:
_id: bc1e49d0-58fa-11e8-bebb-13146c4ef23d
_parent: 2016-07-12-post-slug
reply_to: ''
name: Ryan
email: 604a2592fdc946151a05706ed0d9978f
body: Hello World
date: '2018-05-16T11:17:35.978Z'
I would recommend commenting out the file system writes and to run this a few times to inspect the data. Then enable them to try a first real run.
Thanks to https://github.com/dancwilliams/wordpress_to_staticman_comments for a starting point.
If you hit errors, make sure you have newlines and that there is no bad data like semi colons or single quotes in your comments that aren't escaped or surrounded by single quotes.
"""
total_count = 0
m = hashlib.md5()
with open('big_data.xml') as fd:
cfg = xmltodict.parse(fd.read())
for item in cfg['rss']['channel']['item']:
print ""
print "Title: " + item['title']
print ""
try:
for comment in item['wp:comment']:
total_count += 1
try:
# Generate random hash for comment file
comment_time = comment['wp:comment_date_gmt']
pattern = '%Y-%m-%d %H:%M:%S'
epoch = int(time.mktime(time.strptime(comment_time, pattern)))
# get Y-m-d for start of my post folder
post_time = item['wp:post_date']
post_time = post_time.split(" ")[0]
# add slug to date to generate directory name, and strip -2 for duplicates in Wordpress export
dirname = post_time + "-" + item["wp:post_name"].rstrip("-2")
# create file name
filename = dirname + "/" + "comment-" + str(epoch) + ".yml"
# create the directory if it doesnt exist
print "opening " + filename
if not os.path.isdir(dirname):
os.mkdir(dirname)
print("creating " + dirname)
# if the comment file already exists (duplicate), just open it in read mode so all writes below fail
if os.path.exists(filename):
fh = open(filename, "r")
else:
fh = open(filename, "w")
fh.write("_parent: " + dirname + "\n")
except Exception as e:
print(e)
pass
try:
print "_id: " + comment['wp:comment_id']
fh.write("_id: " + comment['wp:comment_id'] + "\n")
except:
pass
try:
print "name: " + comment['wp:comment_author']
fh.write("name: " + comment['wp:comment_author'] + "\n")
except:
pass
try:
if comment['wp:comment_parent'] != '0':
print "reply_to: " + comment['wp:comment_parent']
fh.write("reply_to: " + comment['wp:comment_parent'] + "\n")
except:
pass
try:
temp_email = comment['wp:comment_author_email']
temp_email = temp_email.strip()
temp_email = temp_email.lower()
m.update(temp_email)
print "email: " + m.hexdigest()
fh.write("email: " + m.hexdigest() + "\n")
except:
pass
try:
# make it so that comment bodies are surrounded by quotes, and then everything inside the quote is escaped.
comment_body = cgi.escape(comment['wp:comment_content'], True)
print "body: " + comment['wp:comment_content']
fh.write("body: '" + comment_body.replace("'","'") + "'\n")
except Exception as e:
print(e)
pass
try:
print "date: " + comment['wp:comment_date_gmt']
fh.write("date: '" + str(comment['wp:comment_date_gmt']) + "'")
except:
pass
print ""
fh.close
except:
pass
print "Total Count: " + str(total_count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment