-
-
Save brianburridge/d28fd59ecd097c140be2 to your computer and use it in GitHub Desktop.
Convert a WordPress XML export to Jekyll format using Markdown
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# Input: WordPress XML export file.
# Outputs: a series of Markdown files ready to be included in a Jekyll site,
# and comments.yml, which contains all approved comments with metadata and
# can be used for a Disqus import.
# Changes from the original gist: http://gist.github.com/268428
# 1. Handles titles containing special characters; those have to be YAML-escaped.
# 2. Uses the original permalinks from WordPress.
require 'rubygems' | |
require 'hpricot' | |
require 'html2markdown' | |
require 'time' | |
require 'yaml' | |
require 'fileutils' | |
require 'uri' | |
# The WordPress export file is given as the first command-line argument.
# Fail with a usage message instead of an opaque TypeError when it is missing.
input_argument = ARGV[0]
if input_argument.nil? || input_argument.empty?
  abort "Usage: #{File.basename($0)} <wordpress-export.xml>"
end

# Relative arguments are resolved against the script's directory (as before);
# expand_path additionally leaves an absolute argument untouched, whereas
# File.join would have glued it onto the script directory.
WORDPRESS_XML_FILE_PATH = File.expand_path(input_argument, File.dirname(__FILE__))

# Directory (next to this script) that receives the generated posts and comments.yml.
OUTPUT_PATH = File.join(File.dirname(__FILE__), "/_posts")

# Scheme + host stripped from each post's <link> to obtain a site-relative permalink.
ORIGINAL_DOMAIN = "http://rubenlaguna.com"
# A single post taken from a WordPress export <item> element. It can render
# itself as a Jekyll Markdown file (YAML front matter + Markdown body) and
# append its approved comments to comments.yml.
class Post
  attr_accessor :title, :post_date, :created_at, :slug, :post_id, :content, :markdown_content
  attr_accessor :hpricot_element, :permalink, :categories

  # Extracts all fields from the given element.
  #
  # item - the parsed <item> element; must respond to #search and #at
  #        (Post.parse only passes Hpricot::Elem instances).
  def initialize(item)
    @hpricot_element = item
    @title = item.search("title").first.inner_text
    # The main script renames <link> to <link2> before parsing, hence "link2".
    # Strip the original domain and map a trailing slash to ".../index.html".
    @permalink = item.at("link2").inner_text.gsub(/^#{ORIGINAL_DOMAIN}/,'').gsub(/\/$/,"/index.html")
    @permalink = unescape_uri(@permalink)
    puts "permalink #{@permalink}"
    @post_date = item.search("wp:post_date").first.inner_text
    @created_at = Date.parse(post_date)
    @slug = item.search("wp:post_name").first.inner_text
    @categories = item.search("category").map { |cat| cat.inner_text }.uniq
    p @categories
    @post_id = item.at("wp:post_id").inner_text
    @content = item.search("content:encoded").first.inner_text
    @markdown_content = HTMLPage.new(:contents => content).markdown
  end

  # Returns the complete Jekyll file contents: YAML front matter followed by
  # a blank line and the Markdown body.
  def to_jekyll
    front_matter = [
      "---",
      "layout: post",
      # Titles may contain ':', '#', quotes, ... which would otherwise break
      # the front matter, so the title is emitted as an escaped YAML scalar.
      "title: #{yaml_scalar(title)}",
      "permalink: #{permalink}",
      "post_id: #{post_id}",
      "categories: #{categories.to_yaml.gsub(/^---/,'').chomp}",
      "---"
    ]
    front_matter.join("\n") + "\n\n" + markdown_content
  end

  # Writes the Jekyll file to "<root_path>/<created_at>-<slug>.md".
  # Returns self so calls can be chained.
  def save(root_path)
    File.open("#{root_path}/#{created_at}-#{slug}.md", "w") { |file| file.write(to_jekyll) }
    self
  end

  # Appends every approved comment (wp:comment_approved == "1") of this post
  # to "<path>/comments.yml".
  def save_comments(path)
    approved = @hpricot_element.search("wp:comment").select do |c|
      c.search("wp:comment_approved").inner_text == "1"
    end
    File.open("#{path}/comments.yml", "a") do |yaml_file|
      approved.map { |el| Comment.new(self, el) }.each { |comment| comment.write_to(yaml_file) }
    end
  end

  class << self
    # Builds a Post from element and saves it below path.
    # Returns the saved Post, or nil when element is not an Hpricot::Elem.
    def parse(element, path)
      return nil unless element.is_a?(Hpricot::Elem)
      post = Post.new(element)
      puts "saving post: #{post.title}"
      post.save(path)
    end
  end

  private

  # Percent-decodes str. URI.unescape was removed in Ruby 3.0, so prefer the
  # parser object when it exists and only fall back to URI.unescape on very
  # old Rubies that lack URI::DEFAULT_PARSER.
  def unescape_uri(str)
    if defined?(URI::DEFAULT_PARSER)
      URI::DEFAULT_PARSER.unescape(str)
    else
      URI.unescape(str)
    end
  end

  # Returns value as a YAML scalar that is safe to place after "key: ".
  # The leading document marker ("--- ") and an optional trailing document
  # end marker ("...") emitted by the YAML library are removed; line folding
  # is disabled so long titles stay on one line.
  def yaml_scalar(value)
    value.to_s.to_yaml(:line_width => -1).sub(/\A---[ \n]?/, '').sub(/\n\.\.\.\s*\z/, '').chomp
  end
end
# One approved WordPress comment, serialised to comments.yml via #to_yaml.
# The instance variables (post_id, post_title, author_name, author_email,
# author_url, content, created_at) form the YAML schema that the Disqus
# import script reads, so their names must not change.
class Comment
  attr_accessor :author_name, :author_email, :author_url, :content, :post

  # post    - the owning post; must respond to #post_id and #title.
  # element - the <wp:comment> element; must respond to #search.
  #
  # Missing text elements yield "" instead of raising NoMethodError.
  # A missing or unparsable wp:comment_date_gmt still raises (from Time.parse).
  def initialize(post, element)
    @post_id = post.post_id
    @post_title = post.title
    @author_name = text_of(element, "wp:comment_author")
    @author_email = text_of(element, "wp:comment_author_email")
    @author_url = text_of(element, "wp:comment_author_url")
    @content = text_of(element, "wp:comment_content")
    comment_date = text_of(element, "wp:comment_date_gmt")
    # The export stores the timestamp in GMT without a zone designator.
    @created_at = Time.parse("#{comment_date} GMT")
  end

  # Appends this comment as a YAML document to file; comments with empty
  # content are skipped.
  def write_to(file)
    file.write(to_yaml + "\n") unless @content.empty?
  end

  private

  # Returns the inner text of the first tag element below element,
  # or "" when no such element exists.
  def text_of(element, tag)
    node = element.search(tag).first
    node ? node.inner_text : ""
  end
end
# main
# Read the whole export in binary mode; the block form closes the handle.
contents = File.open(WORDPRESS_XML_FILE_PATH, "rb") { |file| file.read }

# Hpricot will destroy <link> contents as it is expecting a <link href="">,
# so we change link to link2 and Hpricot will leave it alone.
contents.gsub!(/link>/, "link2>")
doc = Hpricot(contents)

FileUtils.mkdir_p OUTPUT_PATH
# Truncate comments.yml up front: Post#save_comments only ever appends to it.
File.open("#{OUTPUT_PATH}/comments.yml", "w") { |f| }

(doc / "item").each do |item|
  # Only published items are converted; items without a status are skipped.
  status = item.search("wp:status").first
  next unless status && status.inner_text == "publish"

  post = Post.parse(item, OUTPUT_PATH)
  # Post.parse returns nil for anything that is not an Hpricot::Elem.
  next unless post

  post.save_comments(OUTPUT_PATH)
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# Takes the comments.yml generated by wordpressxml2jekyll.rb and posts the comments to your Disqus forum.
# Uses the original WordPress post id as the disqus_identifier.
# The user key for Disqus must be stored in the API-KEY file.
# Disqus Ruby API: http://disqus.rubyforge.org/
# sudo gem install disqus
require 'rubygems' | |
require 'disqus' | |
require 'disqus/api' | |
require 'yaml' | |
require 'time' # Time.parse (used for created_at below) is defined by the stdlib 'time' library

# comments.yml as written by the converter script, relative to the current directory.
COMMENTS_YAML_FILE = "#{ENV['PWD']}/export/_posts/comments.yml"

# Read the user API key. The block form closes the handle, and strip removes
# the trailing newline most editors add, which would otherwise become part of the key.
apikey = File.open("API-KEY", "rb") { |file| file.read }.strip
p "Using API-KEY: #{apikey}"
Disqus::defaults[:api_key] = apikey

# Use the first forum of the account and fetch its forum API key.
forum_id = Disqus::Api.get_forum_list["message"].first["id"]
fak = Disqus::Api.get_forum_api_key(:forum_id => forum_id)["message"]

File.open(COMMENTS_YAML_FILE) do |yf|
  # NOTE(review): YAML.each_document and #ivars look like the legacy Syck API
  # (Ruby 1.8-era) -- confirm the Ruby/YAML engine this is run with.
  YAML.each_document(yf) do |c|
    post_id = c.ivars["post_id"]
    post_title = c.ivars["post_title"]
    puts "post_id: #{post_id}"
    puts "post_title: #{post_title}"

    # Look up the thread for this post, using the WordPress post id as identifier.
    thread = Disqus::Api.thread_by_identifier(:forum_api_key => fak, :title => post_title, :identifier => post_id)
    puts "======= thread_by_identifier result BEGIN ====="
    thread.each { |k, v| puts "#{k} => #{v}" }
    puts "======= thread_by_identifier result END ====="
    thread_id = thread["message"]["thread"]["id"]
    p "thread id: #{thread_id}\n\n"

    comment = Disqus::Api.create_post(:forum_api_key => fak,
                                      :thread_id => thread_id,
                                      :message => c.ivars["content"],
                                      :author_name => c.ivars["author_name"],
                                      :author_email => c.ivars["author_email"],
                                      :author_url => c.ivars["author_url"],
                                      :created_at => Time.parse(c.ivars["created_at"].to_s).strftime("%Y-%m-%dT%H:%M"))
    puts "======= create_post result BEGIN ====="
    comment.each { |k, v| puts "#{k} => #{v}" }
    puts "======= create_post result END ====="
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment