require 'rubygems' | |
require 'nokogiri' | |
require 'fileutils' | |
require 'date' | |
require 'uri' | |
# usage: ruby import.rb my-blog.xml
# my-blog.xml is a file from Settings -> Basic -> Export in blogger.

# Fail early with the usage line instead of letting File.read(nil) raise a
# TypeError when no export file is given.
abort 'usage: ruby import.rb my-blog.xml' if ARGV.empty?

# Read and parse the whole export up front.
data = File.read(ARGV[0])
doc = Nokogiri::XML(data)

# Entry id => Post, filled in by #add; published posts and drafts are kept
# apart so they can be written to different directories.
@posts = {}
@drafts = {}
# Files one <entry> node of the export into the right place.
#
# The entry kind is taken from the part after '#' in the 'term' attribute of
# the entry's first <category> element:
#   'post'    - stored in @posts or @drafts (keyed by entry id), depending on
#               #published?
#   'comment' - attached to the post named by its <in-reply-to ref="...">
#               child; comments whose post was not imported are skipped with
#               a warning instead of crashing on nil
#   'template', 'settings', 'page' - ignored
#
# node - the entry node; must respond to #search and #children.
#
# Raises RuntimeError ("dunno <kind>") for any other kind.
def add(node)
  id = node.search('id').first.content
  type = node.search('category').first.attr('term').split('#').last
  case type
  when 'post'
    bucket = published?(node) ? @posts : @drafts
    bucket[id] = Post.new(node)
  when 'comment'
    reply_to = node.children.find { |c| c.name == 'in-reply-to' }
    post_id = reply_to && reply_to.attr('ref')
    # The parent may be a draft, or may not have been imported at all
    # (e.g. a comment on a page, which is an ignored kind).
    post = @posts[post_id] || @drafts[post_id]
    if post
      post.add_comment(Comment.new(node))
    else
      warn "skipping comment #{id}: no imported post with id #{post_id.inspect}"
    end
  when 'template', 'settings', 'page'
    # deliberately not imported
  else
    raise "dunno #{type}"
  end
end
# True when the entry has no app:draft element inside app:control, i.e. the
# entry is not marked as a draft.
#
# node - the entry node; must respond to #at_css(selector, namespaces).
def published?(node)
  namespaces = { 'app' => 'http://purl.org/atom/app#' }
  draft_marker = node.at_css('app|control app|draft', namespaces)
  draft_marker.nil?
end
# Renders one post to "<path>/<post.file_name>": the post's front-matter
# header, a blank line, the post body wrapped in <div class='post'>, and,
# when the post has comments, a "Comments" section with one
# <div class='comment'> per comment (author, then content).
#
# A short progress report is printed to stdout before the file is written.
#
# post - object responding to #title, #file_name, #header, #content, #comments
# path - target directory (must already exist); defaults to '_posts'
def write(post, path='_posts')
  puts "Post [#{post.title}] has #{post.comments.count} comments"
  puts "writing #{post.file_name}"

  destination = File.join(path, post.file_name)
  File.open(destination, 'w') do |out|
    #out.write "<h1>{{ page.title }}</h1>\n"
    [
      post.header,
      "\n\n",
      "<div class='post'>\n",
      post.content,
      "</div>\n"
    ].each { |chunk| out.write chunk }

    unless post.comments.empty?
      out.write "<h2>Comments</h2>\n"
      out.write "<div class='comments'>\n"
      post.comments.each do |remark|
        [
          "<div class='comment'>\n",
          "<div class='author'>",
          remark.author,
          "</div>\n",
          "<div class='content'>\n",
          remark.content,
          "</div>\n",
          "</div>\n"
        ].each { |chunk| out.write chunk }
      end
      out.write "</div>\n"
    end
  end
end
# One blog post (published or draft) wrapped around its <entry> node. Knows
# how to produce the output file name and the YAML front-matter header for
# the post, and collects the comments attached to it.
class Post
  # Comments attached so far through #add_comment.
  attr_reader :comments

  # node - the entry node; must respond to #at_css and #search.
  def initialize(node)
    @node = node
    @comments = []
  end

  # Prepends comment, so #comments ends up in the reverse of the order in
  # which comments were added.
  def add_comment(comment)
    @comments.unshift comment
  end

  # Text of the entry's <title> element (memoized).
  def title
    @title ||= @node.at_css('title').content
  end

  # Text of the entry's <content> element (memoized).
  def content
    @content ||= @node.at_css('content').content
  end

  # Publication date formatted as YYYY-MM-DD, used as the file name prefix.
  def creation_date
    @creation_date ||= creation_datetime.strftime("%Y-%m-%d")
  end

  # Date parsed from the entry's first <published> element. Date.parse keeps
  # only the calendar date; any time-of-day in the source text is dropped.
  def creation_datetime
    @creation_datetime ||= Date.parse(@node.search('published').first.content)
  end

  # href of the entry's <link rel="alternate">, or nil when there is none.
  def permalink
    return @permalink unless @permalink.nil?
    link_node = @node.at_css('link[rel=alternate]')
    @permalink = link_node && link_node.attr('href')
  end

  # Slug used in the file name: the basename (extension stripped) of the
  # permalink path when a permalink exists, otherwise the lower-cased title
  # with every run of non-alphanumeric characters replaced by a dash.
  def param_name
    if permalink.nil?
      # split yields a leading "" when the title starts with a separator
      # (e.g. a quoted title); drop it so the slug has no leading dash.
      title.split(/[^a-zA-Z0-9]+/).reject(&:empty?).join('-').downcase
    else
      File.basename(URI(permalink).path, '.*')
    end
  end

  # Output file name: "<creation_date>-<param_name>.html".
  def file_name
    %{#{creation_date}-#{param_name}.html}
  end

  # YAML front matter for the post, delimited by '---' lines. The categories
  # section is left out entirely when the post has none.
  def header
    [
      '---',
      %{layout: post},
      %{title: "#{yaml_escape(title)}"},
      %{date: #{creation_datetime}},
      %{comments: false},
      categories,
      '---'
    ].compact.join("\n")
  end

  # YAML list of the 'term' attributes of the entry's categories in the
  # Blogger namespace, or nil when no such category element exists.
  def categories
    terms = @node.search('category[scheme="http://www.blogger.com/atom/ns#"]')
    unless Array(terms).empty?
      [
        'categories:',
        terms.map{ |t| t.attr('term') && " - #{t.attr('term')}" }.compact.join("\n"),
      ].join("\n")
    end
  end

  private

  # Escapes backslashes and double quotes so text can sit inside a
  # double-quoted YAML scalar. The block form of gsub is used so the
  # replacement backslash is taken literally.
  def yaml_escape(text)
    text.gsub(/["\\]/) { |char| "\\#{char}" }
  end
end
# A reader comment wrapped around its <entry> node; exposes the two pieces
# of text that end up in the generated page.
class Comment
  # node - the entry node; must respond to #search.
  def initialize(node)
    @node = node
  end

  # Text of the first element matching 'author name'.
  def author
    first_text('author name')
  end

  # Text of the first <content> element.
  def content
    first_text('content')
  end

  private

  # Content of the first node matching selector.
  def first_text(selector)
    @node.search(selector).first.content
  end
end
# Sort every <entry> of the export into @posts / @drafts; comments are
# attached to their post by #add as they are encountered.
doc.search('entry').each { |entry| add entry }

# Each output directory is wiped and recreated so files from an earlier run
# never linger.
puts "** Writing PUBLISHED posts"
FileUtils.rm_rf('_posts')
FileUtils.mkdir_p('_posts')
@posts.each_value { |post| write post }

puts "\n"
puts "** Writing DRAFT posts"
FileUtils.rm_rf('_drafts')
FileUtils.mkdir_p('_drafts')
@drafts.each_value { |post| write post, '_drafts' }
Thanks for this work! It worked well, except that it doesn't handle double quotes or backslashes in titles, so a "rake generate" will fail with YAML errors. For example, `"` needs to become `\"` and `\` needs to become `\\`.
Thanks a lot. It worked well.
I tried your script to migrate my Blogger blog. I installed Ruby via `pacman -S ruby` and Nokogiri via `gem install nokogiri` on Arch Linux.
But your script fails with the following:
[orschiro@thinkpad Blogger to Github]$ ruby import.rb blog-08-03-2013.xml
WARNING: Nokogiri was built against LibXML version 2.8.0, but has dynamically loaded 2.9.1
import.rb:27:in `add': dunno page (RuntimeError)
from import.rb:119:in `block in <main>'
from /home/orschiro/.gem/ruby/2.0.0/gems/nokogiri-1.6.0/lib/nokogiri/xml/node_set.rb:237:in `block in each'
from /home/orschiro/.gem/ruby/2.0.0/gems/nokogiri-1.6.0/lib/nokogiri/xml/node_set.rb:236:in `upto'
from /home/orschiro/.gem/ruby/2.0.0/gems/nokogiri-1.6.0/lib/nokogiri/xml/node_set.rb:236:in `each'
from import.rb:118:in `<main>'
Can you help me, please?
EDIT: Sorry, this did not happen with your script but with the original script you had forked from.
I'm seeing the same error as @IQAndreas - gisted here: https://gist.github.com/nbieber/a2c469538bdb79d2c520
Thanks for this. It converted almost all of my posts; the last one causes the script to crash Ruby entirely from line 42. I think it's because the post has some non-ascii in the name: "Some Quốc Ngữ History".
I have nokogiri installed, but I can't do it:
import.rb:30:in `add': undefined method `add_comment' for nil:NilClass (NoMethodError)
from import.rb:158:in `block in <main>'
from /var/lib/gems/2.1.0/gems/nokogiri-1.6.6.2/lib/nokogiri/xml/node_set.rb:187:in `block in each'
from /var/lib/gems/2.1.0/gems/nokogiri-1.6.6.2/lib/nokogiri/xml/node_set.rb:186:in `upto'
from /var/lib/gems/2.1.0/gems/nokogiri-1.6.6.2/lib/nokogiri/xml/node_set.rb:186:in `each'
from import.rb:157:in `<main>'
Thank you. Fixed. I'll try to import pages too.