Skip to content

Instantly share code, notes, and snippets.

@hothero
Last active December 16, 2015 02:19
Show Gist options
  • Save hothero/5361782 to your computer and use it in GitHub Desktop.
Save hothero/5361782 to your computer and use it in GitHub Desktop.
Wordpress to Octopress Convertor
require 'fileutils'
require 'date'
require 'yaml'
require 'rexml/document'
require 'ya2yaml'
require 'uri'
include REXML
# WordPress-export-to-Octopress converter.
#
# Reads the WordPress XML export named by the first command-line argument and
# writes one Textile file per published post into `_posts/`. Pages are matched
# by the item query but are not converted.

# SyntaxHighlighter shortcode name => opening Octopress tag that replaces it.
# `shell` maps to a codeblock without a `lang:` option.
CODEBLOCK_OPENERS = {
  'csharp'     => '{% codeblock lang:csharp %}',
  'shell'      => '{% codeblock %}',
  'ruby'       => '{% codeblock lang:ruby %}',
  'html'       => '{% codeblock lang:html %}',
  'python'     => '{% codeblock lang:python %}',
  'php'        => '{% codeblock lang:php %}',
  'javascript' => '{% codeblock lang:javascript %}'
}.freeze

# Tag that replaces every closing shortcode (`[/ruby]`, `[/php]`, ...).
CODEBLOCK_CLOSER = '{% endcodeblock %}'.freeze

# XPath selecting every published post or page item of the export.
PUBLISHED_ITEMS_XPATH =
  "rss/channel/item[wp:status = 'publish' and (wp:post_type = 'post' or wp:post_type = 'page')]".freeze

# Replaces `[lang]...[/lang]` shortcodes with Octopress codeblock tags.
#
# @param content [String] post body
# @return [String] body with every known shortcode converted
def convert_shortcodes(content)
  CODEBLOCK_OPENERS.inject(content) do |text, (name, opener)|
    text.gsub("[#{name}]", opener).gsub("[/#{name}]", CODEBLOCK_CLOSER)
  end
end

# Removes `<div ...>` and `</div>` tags while keeping the text between them.
# The pattern stops at the first `>` so that the rest of the line survives.
#
# @param content [String] post body
# @return [String] body without div tags
def strip_div_tags(content)
  content.gsub(/<div\b[^>]*>/, '').gsub('</div>', '')
end

# Rewrites absolute upload and category URLs of the old site to relative ones.
#
# @param content [String] post body
# @param site_link [String] base URL of the WordPress site
# @return [String] body with relative links
def relativize_links(content, site_link)
  # With an empty base URL the patterns would match unrelated paths.
  return content if site_link.empty?

  content.gsub("#{site_link}/wp-content/uploads", '/wp-content/uploads')
         .gsub("#{site_link}/category", '/blog/categories')
end

# Converts `<h1>`..`<h3>` elements that contain no nested tags to `#` headers.
#
# @param content [String] post body
# @return [String] body with converted headers
def convert_headers(content)
  (1..3).inject(content) do |text, level|
    text.gsub(/<h#{level}>([^<]*)<\/h#{level}>/, "#{'#' * level} \\1")
  end
end

# Runs every body transformation in order: shortcodes, div removal, link
# rewriting, headers. `<pre>` blocks are left untouched.
#
# @param content [String] raw post body
# @param site_link [String] base URL of the WordPress site
# @return [String] converted body
def convert_content(content, site_link)
  convert_headers(relativize_links(strip_div_tags(convert_shortcodes(content)), site_link))
end

# Decodes percent-escapes so that non-ASCII permalinks stay readable.
# `URI.unescape` no longer exists in current Ruby; the parser method performs
# the same decoding and, unlike form decoding, leaves `+` alone.
#
# @param link [String] possibly percent-encoded path
# @return [String] decoded path
def unescape_link(link)
  URI::DEFAULT_PARSER.unescape(link)
end

# Builds the front matter lines of a post.
#
# NOTE: categories, tags and the permalink are written unquoted, so a value
# containing `,`, `:`, `[` or `]` produces front matter that is not valid YAML.
#
# @param meta [Hash] :layout, :title, :date, :wordpress_id, :permalink,
#   :categories (Array), :tags (Array)
# @return [Array<String>] lines including the opening and closing `---`
def front_matter(meta)
  tags = meta[:tags].join(', ')
  [
    '---',
    "layout: #{meta[:layout]}",
    "title: \"#{meta[:title].gsub('"', '&quot;')}\"",
    "date: #{meta[:date]}",
    "wordpress_id: #{meta[:wordpress_id]}",
    "permalink: #{meta[:permalink]}",
    'comments: true',
    "categories: [#{meta[:categories].join(', ')}]",
    "tags: [#{tags}]",
    # for SEO
    "keywords: #{tags}",
    '---'
  ]
end

# Returns the text of the named child element, or '' when the element is
# missing or has no text (REXML reports that as nil).
#
# @param elements [REXML::Elements] children of an item
# @param name [String] element name or XPath
# @return [String]
def child_text(elements, name)
  node = elements[name]
  node ? node.text.to_s : ''
end

# Splits the `<category>` children of an item by their `domain` attribute.
#
# @param elements [REXML::Elements] children of an item
# @return [Array(Array, Array)] categories and tags, in document order
def collect_terms(elements)
  categories = []
  tags = []
  elements.each('category') do |term|
    case term.attribute('domain').to_s.downcase
    when 'post_tag' then tags << term.text
    when 'category' then categories << term.text
    end
  end
  [categories, tags]
end

# Converts every published post of a WordPress export into a Textile file.
#
# @param export_path [String] path of the WordPress XML export
# @param posts_dir [String] output directory, created when missing
# @return [void]
# @raise [SystemCallError] when the export cannot be read or a post cannot be written
def convert_wordpress_export(export_path, posts_dir = '_posts')
  # The document is parsed completely inside the block, so the handle can be
  # closed right away.
  doc = File.open(export_path) { |io| REXML::Document.new(io) }
  # Existing files in the directory are kept; posts with the same name are overwritten.
  FileUtils.mkdir_p(posts_dir)

  link_node = REXML::XPath.first(doc, 'rss/channel/link')
  site_link = link_node ? link_node.text.to_s : ''

  doc.elements.each(PUBLISHED_ITEMS_XPATH) do |item|
    fields = item.elements
    post_type = child_text(fields, 'wp:post_type')
    # Pages are selected by the query but intentionally not converted.
    next unless post_type.casecmp('post') == 0

    wordpress_id = child_text(fields, 'wp:post_id')
    date = Date.parse(child_text(fields, 'wp:post_date'))
    # The WordPress id is used as slug instead of wp:post_name.
    name = format('%04d-%02d-%02d-%s.textile', date.year, date.month, date.day, wordpress_id)
    categories, tags = collect_terms(fields)
    body = convert_content(child_text(fields, 'content:encoded').encode('UTF-8'), site_link)

    puts "Converting: #{name}"

    lines = front_matter(
      :layout       => post_type,
      :title        => child_text(fields, 'title').encode('UTF-8'),
      :date         => "#{date.year}-#{date.month}-#{date.day}",
      :wordpress_id => wordpress_id,
      :permalink    => unescape_link(child_text(fields, 'link').gsub(site_link, '')),
      :categories   => categories,
      :tags         => tags
    )

    File.open(File.join(posts_dir, name), 'w') do |out|
      out.puts lines
      out.puts body
    end
  end
end

if ARGV.empty?
  warn "usage: ruby #{$PROGRAM_NAME} WORDPRESS_EXPORT.xml"
else
  convert_wordpress_export(ARGV[0])
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment