Skip to content

Instantly share code, notes, and snippets.

@maraigue
Last active December 9, 2018 09:52
Show Gist options
  • Save maraigue/d2971e8b3e4165fe4f639defe64aee2f to your computer and use it in GitHub Desktop.
Save maraigue/d2971e8b3e4165fe4f639defe64aee2f to your computer and use it in GitHub Desktop.
はてなグループからエクスポートしたXMLファイルを、MovableType形式に変換する
#!/usr/bin/env ruby
# -*- coding: utf-8 -*-
require 'text/hatena'
require 'rexml/document'
AUTHOR = "BlogAuthor" # Enter the author name here
# Classifies a node encountered while walking the exported XML tree.
#
# node          - the REXML node to inspect.
# expected_name - pattern compared against an element's tag name with ===
#                 (a String is compared for equality, a Regexp is matched).
#
# Returns true for an element whose tag name matches expected_name, and
# false for nodes the caller should skip: whitespace-only text nodes and
# any other node type (the latter are also logged to STDERR).
#
# Terminates the process (SystemExit, status 1) with a message on STDERR
# when an element has an unexpected tag name or a text node contains
# anything other than whitespace.
def check_node(node, expected_name)
  case node
  when REXML::Element
    unless expected_name === node.name
      # abort reports the message on STDERR and exits with a failure status.
      abort "ERROR: tag name \"#{node.name}\" (expected tag name: #{expected_name.inspect})"
    end
    true
  when REXML::Text
    # Whitespace between elements is harmless; real text here is not.
    abort "ERROR: unexpected text node: #{node.value.inspect}" unless node.value =~ /\A\s*\z/
    false
  else
    # Declarations, comments, etc.: report them and let the caller move on.
    STDERR.puts "[XML_INFO #{node.class}] #{node}"
    false
  end
end
# Shared Text::Hatena instance, created on first use by hatena2html.
$parser = nil

# Converts a fragment of Hatena markup to HTML using Text::Hatena.
#
# text - the Hatena markup; it is copied before use, so the caller's
#        string is neither modified nor re-tagged.
#
# Returns the generated HTML as an ASCII-8BIT (binary) string with the
# surrounding <div class="section"> ... </div> wrapper removed.
#
# Terminates the process (SystemExit, status 1) with a message on STDERR
# when the parser output is not enclosed in that wrapper.
def hatena2html(text)
  $parser ||= Text::Hatena.new

  # The parser is handed a binary-tagged copy of the input.
  $parser.parse(text.dup.force_encoding("ASCII-8BIT"))

  # Work on a copy so the parser's own result string is left untouched.
  html = $parser.html.dup.force_encoding("ASCII-8BIT").strip

  # Both wrapper strings are pure ASCII, so they compare safely against a
  # binary string without being re-encoded themselves.
  opening = '<div class="section">'
  closing = '</div>'
  unless html.start_with?(opening) && html.end_with?(closing)
    abort "<div class=\"section\"> expected but not found"
  end

  # Keep everything between the opening and the closing tag.
  html[opening.size...-closing.size]
end
# Reads one Hatena Group XML export and prints every diary section to
# standard output as a MovableType import entry.
#
# fname - path of the exported XML file; it is read as UTF-8.
#
# Progress messages go to STDERR. Malformed input terminates the process
# (SystemExit, status 1) with a message on STDERR.
def main(fname)
  # File.open, unlike Kernel#open, never treats a leading "|" in the name
  # as a command to run.
  xml = File.open(fname, "r:utf-8") { |f| f.read }

  REXML::Document.new(xml).each do |diary|
    next unless check_node(diary, "diary")
    STDERR.puts "Number of diary entries: #{diary.size}"
    diary.each do |day|
      convert_day(day) if check_node(day, "day")
    end
  end
end

# Converts one <day> element: splits its body into "*key*title" sections
# and prints each of them as a MovableType entry.
#
# day - the REXML element of the day; its "date" and "title" attributes
#       and its first <body> child element are read.
def convert_day(day)
  date = day.attributes["date"].to_s.strip
  title = day.attributes["title"].to_s.strip

  body = day.elements["body"]
  abort "ERROR: <body> not found (date: #{date})" if body.nil?
  # The markup is read from the first child node of <body>, so exactly one
  # child node is required.
  if body.size != 1
    abort "ERROR: <body> must contain exactly one node (found #{body.size})"
  end

  STDERR.puts "Processing #{date} #{title} ..."

  # A new section starts at every line beginning with a single "*".
  sections = body[0].value.split(/\n(?=\*[^\*])/)
  # Drop the empty piece in front of the first section; an empty body
  # yields no pieces at all.
  sections.shift if !sections.empty? && sections.first.empty?

  # Sections without a timestamp key are dated 23:59:59 of the day, each
  # one a second earlier than the previous one. Subtracting from a Time
  # keeps this valid for any number of sections.
  backdate = 0

  sections.each do |section|
    header = /\A\*([^\*]+)\*/.match(section)
    abort "Title not given:\n#{section}" unless header
    key = header[1]

    heading, newline, text = header.post_match.strip.partition("\n")
    abort "Body not found: #{heading}" if newline.empty?

    if key =~ /\A\d{9,}\z/
      # An all-digit key is used as seconds since the epoch. Base 10 is
      # forced so a leading zero is not read as octal.
      time = Time.at(Integer(key, 10)).localtime
    elsif (ymd = /\A(\d{4})-0*(\d+)-0*(\d+)\z/.match(date))
      end_of_day = Time.local(Integer(ymd[1], 10), Integer(ymd[2], 10), Integer(ymd[3], 10), 23, 59, 59)
      time = end_of_day - backdate
      backdate += 1
    else
      abort "Invalid date format: #{date}"
    end

    # Leading "[tag]" groups of the heading become categories.
    tags = []
    while (tag = /\A\[([^\]]+)\]/.match(heading))
      tags << tag[1]
      heading = tag.post_match
    end

    # "-----" followed by a newline is emitted below as a field separator,
    # so break it up wherever it occurs inside the text.
    text = text.gsub(/-----\n/, "-----<span></span>\n")

    # A line consisting of "====" or "=====" splits the text into the
    # body and the extended body.
    extended = nil
    if (separator = /^=====?$/.match(text))
      extended = separator.post_match
      text = separator.pre_match
    end

    emit_mt_entry(heading, time, tags, text, extended)
  end
end

# Prints one entry in MovableType import format to standard output.
#
# title    - entry title.
# time     - Time used for the DATE field.
# tags     - Array of category names.
# body     - Hatena markup of the main body.
# extended - Hatena markup of the extended body, or nil when there is none.
def emit_mt_entry(title, time, tags, body, extended)
  puts "AUTHOR: #{AUTHOR}"
  puts "TITLE: #{title}"
  puts "DATE: #{time.strftime("%m/%d/%Y %r")}"
  tags.each { |tag| puts "CATEGORY: #{tag}" }
  puts "-----"
  puts "BODY:"
  puts hatena2html(body)
  puts "-----"
  if extended
    puts "EXTENDED BODY:"
    puts hatena2html(extended)
    puts "-----"
  end
  puts "--------"
end
# Command-line entry point: convert every file named on the command line.
if ARGV.empty?
  # A missing argument is a failure, so exit with a non-zero status.
  abort "Usage: hatenagroup2movabletype.rb FILES..."
end

ARGV.each { |fname| main(fname) }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment