Skip to content

Instantly share code, notes, and snippets.

Forked from rzhw/wordpressxml2jekyll.rb
Last active June 23, 2023 15:38
Show Gist options
  • Save brianburridge/d28fd59ecd097c140be2 to your computer and use it in GitHub Desktop.
Save brianburridge/d28fd59ecd097c140be2 to your computer and use it in GitHub Desktop.
Convert Wordpress xml export to Jekyll format using Markdown
#!/usr/bin/env ruby
# Input: WordPress XML export file.
# Outputs: a series of Markdown files ready to be included in a Jekyll site,
# and comments.yml which contains all approved comments with metadata which
# can be used for a Disqus import.
# Changes from the original gist:
# 1. Handles titles containing special characters. Those have to be YAML escaped
# 2. Use the original permalinks in wordpress.
require 'rubygems'
require 'hpricot'
require 'html2markdown'
require 'time'
require 'yaml'
require 'fileutils'
require 'uri'
# Path of the WordPress XML export: the first command-line argument, resolved
# relative to the directory that contains this script. Abort with a usage
# message when the argument is missing instead of failing inside File.join.
WORDPRESS_XML_FILE_PATH = File.join(
  File.dirname(__FILE__),
  ARGV.fetch(0) { abort "usage: #{File.basename($PROGRAM_NAME)} WORDPRESS_EXPORT.xml" }
)

# Directory (next to this script) that receives the generated posts and comments.yml.
OUTPUT_PATH = File.join(File.dirname(__FILE__), "/_posts")

# URL prefix that Post#initialize strips from each item's link to obtain the
# site-relative permalink (for example "http://example.com"). Post references
# this constant, so it has to be defined; set the ORIGINAL_DOMAIN environment
# variable to your blog's address. With the empty default nothing is stripped
# and permalinks keep the full URL.
ORIGINAL_DOMAIN = ENV.fetch("ORIGINAL_DOMAIN", "")
# A WordPress post built from one <item> element of the XML export.
#
# Extracts the fields needed for a Jekyll post, converts the HTML body to
# Markdown, and can write itself and its approved comments to disk.
#
# Relies on names provided elsewhere: the top-level ORIGINAL_DOMAIN constant
# (URL prefix removed from permalinks; must be defined before Post.new runs),
# HTMLPage from html2markdown, Hpricot::Elem, and the Comment class.
class Post
  attr_accessor :title, :post_date, :created_at, :slug, :post_id, :content, :markdown_content
  attr_accessor :hpricot_element, :permalink, :categories

  # Builds a post from a parsed <item>, saves it and its comments under +path+.
  #
  # @param element [Hpricot::Elem] the <item> element
  # @param path [String] output directory
  # @return [Post, nil] the saved post, or nil when +element+ is not an Hpricot::Elem
  def self.parse(element, path)
    return nil unless element.is_a?(Hpricot::Elem)

    post = new(element)
    puts "saving post: #{post.title}"
    post.save(path)
    post.save_comments(path)
    post
  end

  # @param item [#search] the <item> element. The main script renames <link> to
  #   <link2> before parsing, which is why the permalink is read from "link2".
  def initialize(item)
    @hpricot_element = item

    @title = item.search("title").first.inner_text

    # Site-relative permalink: drop the domain prefix (escaped, so "." in the
    # domain is literal) and point directory-style URLs at their index.html.
    link = item.search("link2").inner_text
    relative = link.sub(/\A#{Regexp.escape(ORIGINAL_DOMAIN)}/, "").gsub(%r{/$}, "/index.html")
    @permalink = percent_decode(relative)
    puts "permalink #{@permalink}"

    @post_date = item.search("wp:post_date").first.inner_text
    @created_at = Date.parse(post_date)

    @slug = item.search("wp:post_name").first.inner_text

    @categories = item.search("category").map { |cat| cat.inner_text }
    p @categories

    @post_id = item.search("wp:post_id").inner_text

    @content = item.search("content:encoded").first.inner_text
    # NOTE(review): HTMLPage.new(:contents => ...) is assumed to be the
    # html2markdown entry point; confirm against the installed gem.
    @markdown_content = HTMLPage.new(:contents => content).markdown
  end

  # @return [String] the post as a Jekyll document: YAML front matter, a blank
  #   line, then the Markdown body
  def to_jekyll
    [
      "---",
      "layout: post",
      "title: #{yaml_scalar(title)}", # quoted when the title needs YAML escaping
      "permalink: #{permalink}",
      "post_id: #{post_id}",
      "categories: #{categories.to_yaml.sub(/\A---/, '').chomp}",
      "---",
      "",
      markdown_content
    ].join("\n")
  end

  # Writes the post to "<root_path>/<created_at>-<slug>.md".
  def save(root_path)
    File.open(File.join(root_path, "#{created_at}-#{slug}.md"), "w") { |file| file.write(to_jekyll) }
  end

  # Appends every approved comment (wp:comment_approved == "1") of this post
  # to "<path>/comments.yml".
  def save_comments(path)
    approved = @hpricot_element.search("wp:comment").select do |c|
      c.search("wp:comment_approved").inner_text == "1"
    end

    File.open(File.join(path, "comments.yml"), "a") do |yaml_file|
      approved.map { |el| Comment.new(self, el) }.each { |comment| comment.write_to(yaml_file) }
    end
  end

  private

  # Percent-decodes +str+. URI.unescape was removed in Ruby 3.0; the RFC 2396
  # parser offers the same decoding and, unlike form decoding, keeps "+" as is.
  def percent_decode(str)
    return URI::RFC2396_Parser.new.unescape(str) if defined?(URI::RFC2396_Parser)

    URI.unescape(str) # Rubies that predate URI::RFC2396_Parser
  end

  # Renders +value+ as a YAML scalar that is safe after "key: " in front matter.
  def yaml_scalar(value)
    text = value.to_s
    # The export is read in binary mode, so strings taken from it may be tagged
    # as binary; Psych would dump non-ASCII binary strings as "!binary" base64.
    if text.encoding == Encoding::BINARY
      as_utf8 = text.dup.force_encoding(Encoding::UTF_8)
      text = as_utf8 if as_utf8.valid_encoding?
    end
    # Strip the "--- " document header and, on emitters that add one, the
    # trailing "..." document-end marker.
    text.to_yaml(:line_width => -1).sub(/\A---[ \t]*/, "").sub(/\n\.\.\.\s*\z/, "").chomp
  end
end
# An approved WordPress comment, written to comments.yml for a later Disqus import.
#
# #write_to dumps the whole object with to_yaml, so the instance variables are
# the serialised payload: post_id, post_title, author_name, author_email,
# author_url, content and created_at. Keep their names stable; the Disqus
# import script looks the fields up by these names.
class Comment
  # NOTE(review): :post is exposed but never assigned in this class.
  attr_accessor :author_name, :author_email, :author_url, :content, :post

  # @param post [#post_id, #title] the post the comment belongs to
  # @param element [#search] the <wp:comment> element
  # @raise [ArgumentError] from Time.parse when wp:comment_date_gmt is missing or unparsable
  def initialize(post, element)
    @post_id = post.post_id
    @post_title = post.title
    @author_name = text_of(element, "wp:comment_author")
    @author_email = text_of(element, "wp:comment_author_email")
    @author_url = text_of(element, "wp:comment_author_url")
    @content = text_of(element, "wp:comment_content")
    # The export stores this timestamp in GMT without a zone suffix.
    @created_at = Time.parse("#{text_of(element, 'wp:comment_date_gmt')} GMT")
  end

  # Appends this comment to +file+ as one YAML document; comments with empty
  # content are skipped.
  def write_to(file)
    return if @content.empty?

    file.write(to_yaml + "\n")
  end

  private

  # Text of the first +name+ child of +element+, or "" when there is none
  # (calling inner_text on a missing node would raise NoMethodError).
  def text_of(element, name)
    node = element.search(name).first
    node ? node.inner_text : ""
  end
end
# main
#
# Reads the WordPress export, converts every published <item> into a Jekyll
# post under OUTPUT_PATH, and collects the approved comments in
# OUTPUT_PATH/comments.yml.

# Binary read, as before; the block form closes the handle once the file is read.
contents = File.open(WORDPRESS_XML_FILE_PATH, "rb") { |file| file.read }

# Hpricot destroys the contents of <link> because it expects the HTML form
# <link href="">, so rename the tag to <link2> and Hpricot leaves it alone.
contents.gsub!(/link>/, "link2>")
doc = Hpricot(contents)

FileUtils.mkdir_p OUTPUT_PATH
# Start from an empty comments.yml: Post#save_comments appends to it per post.
File.open("#{OUTPUT_PATH}/comments.yml", "w") { |f| }

(doc / "item").each do |item|
  # Skip drafts and anything else not published, including items that carry
  # no wp:status element at all.
  status = item.search("wp:status").first
  next unless status && status.inner_text == "publish"

  Post.parse(item, OUTPUT_PATH)
end
#!/usr/bin/env ruby
# Takes comments.yml generated by wordpressxml2jekyll.rb and posts them to your Disqus forum.
# uses original WordPress post id as disqus_identifier
# The Disqus user API key must be stored in a file named API-KEY in the current working directory.
# Disqus ruby api
# sudo gem install disqus
require 'rubygems'
require 'disqus'
require 'disqus/api'
require 'yaml'
# comments.yml written by the converter script, looked up relative to the
# shell's working directory.
COMMENTS_YAML_FILE = "#{ENV['PWD']}/export/_posts/comments.yml"

# Yields a Hash (field name => value) for each YAML document in +io+.
#
# The documents are tagged "!ruby/object:Comment", but the Comment class is
# not defined in this script. Syck (Ruby 1.8) exposed the fields of such
# objects through #ivars. Psych has neither YAML.each_document nor #ivars, so
# there the tag is dropped from the parsed node and the mapping is loaded as
# a plain Hash.
# NOTE(review): the Psych branch has not been run against a real comments.yml.
def each_comment_fields(io)
  if YAML.respond_to?(:each_document)
    YAML.each_document(io) { |doc| yield doc.ivars }
  else
    YAML.parse_stream(io).children.each do |document|
      mapping = document.root
      next unless mapping.respond_to?(:tag=)

      mapping.tag = nil
      fields = mapping.to_ruby
      yield fields if fields.is_a?(Hash)
    end
  end
end

# Read the key with the block form so the file is closed, and strip the
# trailing newline an editor usually leaves in the key file.
apikey = File.open("API-KEY", "rb") { |file| file.read }.strip
# NOTE(review): this prints the secret API key to stdout; consider removing it.
p "Using API-KEY: #{apikey}"
Disqus::defaults[:api_key] = apikey
forum_id = Disqus::Api.get_forum_list["message"].first["id"]
fak = Disqus::Api.get_forum_api_key(:forum_id => forum_id)["message"]

File.open(COMMENTS_YAML_FILE) do |yf|
  each_comment_fields(yf) do |fields|
    post_id = fields["post_id"]
    post_title = fields["post_title"]
    puts "post_id: #{post_id}"
    puts "post_title: #{post_title}"

    # The original WordPress post id is used as the Disqus thread identifier.
    thread = Disqus::Api.thread_by_identifier(:forum_api_key => fak, :title => post_title, :identifier => post_id)
    puts "======= thread_by_identifier result BEGIN ====="
    thread.each { |k, v| puts "#{k} => #{v}" }
    puts "======= thread_by_identifier result END ====="
    thread_id = thread["message"]["thread"]["id"]
    p "thread id: #{thread_id}\n\n"

    # YAML normally loads the timestamp as a Time already; only re-parse other
    # representations (Time.parse needs the 'time' library to be loaded).
    created_at = fields["created_at"]
    created_at = Time.parse(created_at.to_s) unless created_at.is_a?(Time)

    comment = Disqus::Api.create_post(:forum_api_key => fak,
                                      :thread_id => thread_id,
                                      :message => fields["content"],
                                      :author_name => fields["author_name"],
                                      :author_email => fields["author_email"],
                                      :author_url => fields["author_url"],
                                      :created_at => created_at.strftime("%Y-%m-%dT%H:%M"))
    puts "======= create_post result BEGIN ====="
    comment.each { |k, v| puts "#{k} => #{v}" }
    puts "======= create_post result END ====="
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment