Created
July 9, 2012 05:03
-
-
Save mh61503891/3074296 to your computer and use it in GitHub Desktop.
「Judy先生の英語科学論文の書き方」付属のCD-ROMのデータをDictionary.app用のXMLデータに変換するスクリプト
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/opt/local/bin/ruby1.9
# encoding: utf-8
# Converts the data on the CD-ROM bundled with "Judy先生の英語科学論文の書き方" [1]
# into XML source data for a Dictionary.app dictionary.
# Usage :: ruby1.9 judy-to-dictionary.rb > MyDictionary.xml && make && make install
#
# Author :: Masayuki Higashino
#
# References ::
# [1] Judy先生の英語科学論文の書き方
# http://www.amazon.co.jp/dp/4061539523
###### Configuration #######################################
# Root directory of the CD-ROM contents. The JUDY_HOME environment variable,
# when set, overrides the built-in default path below.
JUDY_HOME = ENV.fetch('JUDY_HOME', '/Users/masayuki/Documents/man/English/Judy/')
############################################################
require 'nokogiri'
require 'rexml/document'
require 'json'
# Dictionary entries collected from the HTML pages, keyed by the value
# returned from en_word_to_key for the English headword.
DICT = {}
# Cleans up a text fragment taken from the CD-ROM HTML.
#
# Steps, in order:
# 1. delete every pair of adjacent whitespace characters,
# 2. remove the space that follows a slash ("/ " becomes "/"),
# 3. strip leading and trailing whitespace.
#
# "Whitespace" here means the ASCII space and the full-width space U+3000.
# NOTE(review): the regex alternatives in the published copy of this script
# were all rendered as plain spaces; they look like flattened full-width
# spaces, so both forms are accepted here. Confirm against the original file.
#
# NOTE: because this is defined at the top level it shadows Kernel#format
# for the rest of the program.
#
# @param word [String] raw text; it is copied, never modified
# @return [String] the cleaned copy
def format(word)
  key = String.new(word)
  key.gsub!(/[ \u3000]{2}/, '')  # remove doubled whitespace
  key.gsub!('/ ', '/')
  key.gsub!(/\A[ \u3000]+/, '')  # remove leading whitespace
  key.gsub!(/[ \u3000]+\z/, '')  # remove trailing whitespace
  key
end
# Builds the identifier used as the DICT key (and XML entry id) for an
# English headword.
#
# Commas, spaces and slashes are turned into underscores, and every run of
# consecutive underscores is then collapsed into a single one. (The previous
# lazy pattern /_{2,}?/ only ever consumed two underscores per match, so a
# run of three still left two behind.)
#
# @param en_word [String] the headword; it is not modified
# @return [String] a new string with the separators normalized
def en_word_to_key(en_word)
  en_word.tr(', /', '_').squeeze('_')
end
# --- part3.html: headword definitions, usage notes and example sentences ---
# Read the page as raw bytes and transcode it from CP932; characters with no
# UTF-8 mapping are replaced by '_'.
html = File.binread(File.join(JUDY_HOME, 'judy-paper/text/part3.html'))
html = html.encode(Encoding::UTF_8, Encoding::CP932,
                   :undef => :replace,
                   :replace => '_',
                   :universal_newline => true)
html.gsub!(/\n/, ' ') # stray line breaks in the markup: treat them as spaces for now

# State carried from one <p> element to the next:
#   :type    - what the next ordinary paragraph is taken to be
#   :key     - DICT key of the entry currently being filled
#   :pos     - part of speech of that entry
#   :context - text captured from the latest example-section heading
#   :example - example hash waiting for its Japanese translation
@event = {}

Nokogiri::HTML(html).xpath('//p').each do |para|
  marker = para.xpath('font[1]').text

  if marker =~ /\A∽{5}/
    # A rule of ∽ characters: the next paragraph is a definition.
    @event[:type] = :definition
  elsif marker =~ /\A…{5}/
    # A rule of … characters: the following paragraphs are descriptions.
    @event[:type] = :description
  elsif marker =~ /基本例文|基本文例/
    # Example-section heading; keep the text between the colon and the closing
    # bracket as the context of the examples that follow.
    # NOTE(review): half- and full-width forms of ':' and '>' are both
    # accepted because the published copy may have flattened them - confirm.
    @event[:context] = marker.scan(/(基本例文|基本文例).*?[:：](.+?)[>＞]/).flatten.last
    @event[:type] = :example_en
  elsif @event[:type] == :definition
    # Non-empty child texts, in order: English, Japanese, "(part of speech)".
    definition = para.children.map { |node| format(node.text) }.reject { |text| text.empty? }
    en, ja, raw_pos = definition
    # '(動詞)' to '動詞'. The pattern used before captured one arbitrary
    # character instead of the text inside the parentheses.
    pos = raw_pos[/[(（](.+?)[)）]/, 1]
    key = en_word_to_key(en)
    DICT[key] ||= {}
    DICT[key][:key] ||= key
    DICT[key][:title] ||= en
    DICT[key][:pos] ||= {}
    DICT[key][:pos][pos] ||= {}
    DICT[key][:pos][pos][:en] ||= en
    DICT[key][:pos][pos][:ja] ||= ja
    @event[:key] = key
    @event[:pos] = pos
  elsif @event[:type] == :description
    description = format(para.text)
    description.sub!(/\A[☆★]/, '')     # drop the leading star bullet
    description.sub!(/\A[ \u3000]/, '') # and the space right after it
    # NOTE(review): kept as published - spaces and ", " both become ",".
    description.gsub!(/, |[ \u3000]/, ',')
    sense = DICT[@event[:key]][:pos][@event[:pos]]
    sense[:description] ||= []
    sense[:description] << description
  elsif @event[:type] == :example_en
    # English half of an example; it is stored once the Japanese half arrives.
    example = {}
    example_en = format(para.text)
    example[:en] = example_en unless example_en.empty?
    example[:context] = @event[:context] unless @event[:context].nil?
    @event[:example] = example
    @event[:type] = :example_ja
  elsif @event[:type] == :example_ja
    example = @event[:example]
    example_ja = format(para.text)
    # Strip one enclosing pair of parentheses. The previous patterns were not
    # valid regular expressions (unbalanced groups).
    example_ja.sub!(/\A[(（]/, '')
    example_ja.sub!(/[)）]\z/, '')
    example_ja = format(example_ja)
    example[:ja] = example_ja unless example_ja.empty?
    sense = DICT[@event[:key]][:pos][@event[:pos]]
    sense[:example] ||= []
    sense[:example] << example
    @event[:example] = example
    @event[:type] = :example_en # the next paragraph starts a new pair
  end
end
#puts JSON.pretty_generate(DICT) | |
#exit | |
############################################################ | |
# --- part2.html: table mapping section labels to their headings ---
# Lower-cased label => heading text built from the table cells.
CONTEXTS = {}

html = File.binread(File.join(JUDY_HOME, 'judy-paper/text/part2.html'))
html = html.encode(Encoding::UTF_8, Encoding::CP932,
                   :undef => :replace,
                   :replace => '_',
                   :universal_newline => true)
html.gsub!(/\n/, ' ') # stray line breaks in the markup: treat them as spaces for now

# Only the first table whose width attribute is '70%' is read.
label_table = Nokogiri::HTML(html).xpath('//table').find { |table| table[:width] == '70%' }

if label_table
  label_table.xpath('tr').each_with_index do |tr, i|
    next if i.zero? # skip the first row

    entry = tr.xpath('td').map { |td| format(td.text) }
    # to_s: a row with fewer than two cells is handled like an unlabeled row
    # instead of raising NoMethodError on nil.
    label = entry[1].to_s.downcase

    if label.empty?
      # Unlabeled row: its first cell, when present, names the current section.
      @section = entry[0] unless entry[0].to_s.empty?
      next
    end

    parts =
      if label =~ /\d\z/ # a label ending in a digit is a subsection
        [@section, entry[0], entry[2]]
      else               # otherwise it is a stand-alone section
        [entry[0], entry[2]]
      end

    context = parts.compact.join('.')
    context = context.delete(' ')          # remove spaces
    context = context.gsub('/FONT>', '定') # repair a mistyped tag in the source data
    # Append a final period when there is none. The former test /.\z/ matched
    # any last character, so the period was never added to a non-empty text.
    context << '.' unless context.end_with?('.', '．', '。')
    CONTEXTS[label] = context
  end
end
#puts JSON.pretty_generate(CONTEXTS) | |
#exit | |
############################################################ | |
# --- ex/abc-all.html: one table row per labeled example sentence ---
html = File.binread(File.join(JUDY_HOME, 'judy-paper/text/ex/abc-all.html'))
html = html.encode(Encoding::UTF_8, Encoding::CP932,
                   :undef => :replace,
                   :replace => '_',
                   :universal_newline => true)
html.gsub!(/\n/, ' ') # stray line breaks in the markup: treat them as spaces for now

Nokogiri::HTML(html).xpath('//tr').each do |row|
  cells = row.xpath('td/font').map { |node| node.text }
  # The five fields below are read by position; a row with fewer cells would
  # previously abort the run with a TypeError inside format(nil).
  next if cells.size < 5

  en, ja, pos, label, example = cells.first(5).map { |text| format(text) }
  key = en_word_to_key(en)

  # Create the entry and the sense on first sight; existing values are kept.
  record = (DICT[key] ||= {})
  record[:key] ||= key
  record[:title] ||= en
  record[:pos] ||= {}
  sense = (record[:pos][pos] ||= {})
  sense[:en] ||= en
  sense[:ja] ||= ja
  sense[:description] ||= []
  sense[:example] ||= []

  usage = {}
  usage[:label] = label
  usage[:context] = CONTEXTS[label] if CONTEXTS.key?(label)
  usage[:en] = example
  sense[:example] << usage
end
#puts JSON.pretty_generate(DICT) | |
#exit | |
############################################################ | |
# --- Build the dictionary XML and print it on standard output ---
xml = REXML::Document.new
xml.context[:attribute_quote] = :quote # write attribute values in double quotes
xml << REXML::XMLDecl.new('1.0', 'UTF-8')
dic = xml.add_element('d:dictionary', {
  'xmlns' => 'http://www.w3.org/1999/xhtml',
  'xmlns:d' => 'http://www.apple.com/DTDs/DictionaryService-1.0.rng'
})

##### Contexts: one entry per section label, titled with its heading
CONTEXTS.each do |label, heading|
  id = label.upcase
  entry = dic.add_element('d:entry', {
    'id' => id,
    'd:title' => heading
  })
  entry.add_element('d:index', {
    'd:value' => id,
    'd:title' => id
  })
  entry.add_element('h1').add_text(id)
  entry.add_element('p').add_text(heading)
end

##### Dictionary
# Layout of each DICT value as read below:
#   record[:key], record[:title]
#   record[:pos]  => { pos => sense }
#     sense[:en], sense[:ja]
#     sense[:description] => [String, ...]          (may be missing)
#     sense[:example]     => [{ :label, :context, :en, :ja }, ...]  (may be missing)
DICT.each_value do |record|
  title = record[:title]
  entry = dic.add_element('d:entry', {
    'id' => record[:key],
    'd:title' => title
  })

  # Index the full title.
  entry.add_element('d:index', {
    'd:value' => title,
    'd:title' => title
  })

  # Index each word of the title as well. The hyphen is escaped: the former
  # class [ ,.-;:] contained the range '.'..';', which split on digits and
  # '/' but not on '-'. '/' is listed explicitly so slash-separated titles
  # are still split; repeated words are indexed once.
  words = title.split(/[ ,.\-\/;:]/).reject { |w| w.empty? }.map { |w| w.downcase }.uniq
  words.each do |word|
    entry.add_element('d:index', {
      'd:value' => word,
      'd:title' => title
    })
  end

  # Title
  entry.add_element('h1').add_text(title)

  # One section per part of speech
  record[:pos].each do |pos, sense|
    # Part-of-speech name
    entry.add_element('span', { 'class' => 'encl' }).add_text(pos)
    ul = entry.add_element('ul')

    # Meaning, followed by '▼' and the descriptions when there are any
    descriptions = sense[:description]
    if descriptions && !descriptions.empty?
      ul.add_element('li').add_text([sense[:ja], '▼', descriptions].flatten.join(''))
    else
      ul.add_element('li').add_text(sense[:ja])
    end

    # Examples. Array() keeps a sense without any example from raising on nil.
    Array(sense[:example]).each do |usage|
      li = ul.add_element('li')
      li.add_element('span').add_text(usage[:en])
      if usage.include?(:ja)
        li.add_element('br')
        li.add_element('span').add_text(usage[:ja])
      end
      # A label takes precedence; the context text is shown only without one.
      if usage.include?(:label)
        li.add_element('br')
        li.add_element('span', { 'class' => 'context' }).add_text(usage[:label].upcase)
      elsif usage.include?(:context)
        li.add_element('br')
        li.add_element('span', { 'class' => 'context' }).add_text(usage[:context])
      end
    end
  end
end

xml.write($stdout, 4)
puts
exit
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment