Skip to content

Instantly share code, notes, and snippets.

@mh61503891
Created July 9, 2012 05:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save mh61503891/3074296 to your computer and use it in GitHub Desktop.
Save mh61503891/3074296 to your computer and use it in GitHub Desktop.
「Judy先生の英語科学論文の書き方」付属のCD-ROMのデータをDictionary.app用のXMLデータに変換するスクリプト
#!/opt/local/bin/ruby1.9
# encoding: utf-8
# 「Judy先生の英語科学論文の書き方」[1]付属のCD-ROMのデータを
# Dictionary.app用のXMLデータに変換するスクリプト。
# Usage :: ruby1.9 judy-to-dictionary.rb > MyDictionary.xml && make && make install
#
# Author :: Masayuki Higashino
#
# References ::
# [1] Judy先生の英語科学論文の書き方
# http://www.amazon.co.jp/dp/4061539523
###### Configuration #######################################
# Root directory of the CD-ROM. Set the JUDY_HOME environment variable to point
# at another location; otherwise the built-in default path below is used.
JUDY_HOME = ENV.fetch('JUDY_HOME', '/Users/masayuki/Documents/man/English/Judy/')
############################################################
require 'nokogiri'
require 'rexml/document'
require 'json'
DICT = {}
# Cleans up the blanks in a piece of text taken from the CD-ROM HTML.
#
# "Blank" means an ASCII space or U+3000 (ideographic space). The ideographic
# space is written as an escape so the pattern stays unambiguous even if this
# file is re-encoded or normalized.
#
# Steps, in order:
# 1. every pair of adjacent blanks is deleted (single blanks are kept),
# 2. a space directly after a slash is dropped,
# 3. leading and trailing blanks are trimmed.
#
# Note: this top-level definition shadows Kernel#format inside this script.
#
# @param word [String] raw text; the argument itself is not modified
# @return [String] a new String with the blanks normalized
def format(word)
  text = word.dup
  text.gsub!(/[ \u3000]{2}/, '') # doubled blanks are removed outright
  text.gsub!('/ ', '/')
  text.sub!(/\A[ \u3000]+/, '')  # leading blanks
  text.sub!(/[ \u3000]+\z/, '')  # trailing blanks
  text
end
# Builds the dictionary key (used as the entry id) for an English headword.
#
# Commas, spaces and slashes each become an underscore, then every run of
# underscores is collapsed to a single one. A run of three or more collapses
# to one as well, which a lazy `_{2,}?` pattern would not do.
#
# @param en_word [String] headword; the argument itself is not modified
# @return [String] a new String usable as a key
def en_word_to_key(en_word)
  en_word.tr(', /', '_').squeeze('_')
end
# ---- Part 3: headwords, usage notes and example sentences ------------------
# Reads part3.html as raw bytes, transcodes CP932 -> UTF-8 and walks every <p>
# element. The walk is a small state machine whose state lives in @event:
#   :definition  - the paragraph's child nodes give English, Japanese and the
#                  parenthesized part of speech, in that order
#   :description - the paragraph is a usage note for the current headword/sense
#   :example_en / :example_ja - paragraphs alternate between an English
#                  sentence and its Japanese translation
# A paragraph whose first <font> child starts with five "∽", starts with five
# "…", or contains 基本例文/基本文例 only switches the state.
#
# Punctuation patterns accept both the ASCII and the full-width form; the
# full-width characters are written as \u escapes so they cannot be lost if
# this file is re-encoded.
part3_path = File.join(JUDY_HOME, 'judy-paper/text/part3.html')
html = File.open(part3_path, 'r:ascii-8bit') { |io| io.read }
html.encode!(Encoding::UTF_8, Encoding::CP932,
             :undef => :replace,
             :replace => '_',
             :universal_newline => true)
html.gsub!(/\n/, ' ') # stray line breaks are turned into blanks for now
@event = {}
Nokogiri::HTML(html).xpath('//p').each do |para|
  marker = para.xpath('font[1]').text
  if marker =~ /\A∽{5}/
    @event[:type] = :definition
  elsif marker =~ /\A…{5}/
    @event[:type] = :description
  elsif marker =~ /基本例文|基本文例/
    # Heading looks like "<基本例文 ... : context>"; keep the part between the
    # colon and the closing bracket (U+FF1A / U+FF1E are the full-width forms).
    @event[:context] =
      marker.scan(/(?:基本例文|基本文例).*?[:\uFF1A](.+?)[>\uFF1E]/).flatten.last
    @event[:type] = :example_en
  elsif @event[:type] == :definition
    fields = para.children.map { |node| format(node.text) }.reject { |s| s.empty? }
    en, ja, pos_label = fields
    next if en.nil? # paragraph without any text: nothing to register
    # '(動詞)' -> '動詞' (parentheses may be ASCII or full-width U+FF08/U+FF09)
    pos = pos_label.to_s.scan(/[(\uFF08](.+?)[)\uFF09]/).flatten.first
    key = en_word_to_key(en)
    DICT[key] ||= {}
    DICT[key][:key] ||= key
    DICT[key][:title] ||= en
    DICT[key][:pos] ||= {}
    DICT[key][:pos][pos] ||= {}
    DICT[key][:pos][pos][:en] ||= en
    DICT[key][:pos][pos][:ja] ||= ja
    # Remember which headword/sense the following paragraphs belong to.
    @event[:key] = key
    @event[:pos] = pos
  elsif @event[:type] == :description
    description = format(para.text)
    description.sub!(/\A(☆|★)/, '')
    # NOTE(review): only a leading blank is removed here; the pattern this
    # replaces had an empty second alternative that may have been a full-width
    # character lost in transcoding - confirm against the CD-ROM data.
    description.sub!(/\A[ \u3000]/, '')
    # Blanks and ", " both become a bare comma.
    description.gsub!(/, |[ \u3000]/, ',')
    sense = DICT[@event[:key]][:pos][@event[:pos]]
    (sense[:description] ||= []) << description
  elsif @event[:type] == :example_en
    example = {}
    example_en = format(para.text)
    example[:en] = example_en unless example_en.empty?
    example[:context] = @event[:context] unless @event[:context].nil?
    # Park the half-built example until its Japanese paragraph arrives.
    @event[:example] = example
    @event[:type] = :example_ja
  elsif @event[:type] == :example_ja
    example = @event[:example]
    example_ja = format(para.text)
    example_ja.sub!(/\A[(\uFF08]/, '') # opening parenthesis around the translation
    example_ja.sub!(/[)\uFF09]\z/, '') # closing parenthesis
    example_ja = format(example_ja)
    example[:ja] = example_ja unless example_ja.empty?
    sense = DICT[@event[:key]][:pos][@event[:pos]]
    (sense[:example] ||= []) << example
    @event[:type] = :example_en
  end
end
#puts JSON.pretty_generate(DICT)
#exit
############################################################
# ---- Part 2: label -> context description -----------------------------------
# Reads part2.html (CP932 -> UTF-8) and fills CONTEXTS from the first <table>
# whose width attribute is '70%'. For each row after the first, the <td> texts
# are read as: [0] heading text, [1] label, [2] further text.
#   * a row with an empty label only updates the current section (@section)
#   * a label ending in a digit is treated as a subsection and is prefixed
#     with the current section
#   * any other label is a stand-alone section
# Keys of CONTEXTS are the lower-cased labels.
CONTEXTS = {}
part2_path = File.join(JUDY_HOME, 'judy-paper/text/part2.html')
html = File.open(part2_path, 'r:ascii-8bit') { |io| io.read }
html.encode!(Encoding::UTF_8, Encoding::CP932,
             :undef => :replace,
             :replace => '_',
             :universal_newline => true)
html.gsub!(/\n/, ' ') # stray line breaks are turned into blanks for now
label_table = Nokogiri::HTML(html).xpath('//table').find { |t| t[:width] == '70%' }
unless label_table.nil?
  label_table.xpath('tr').each_with_index do |tr, i|
    next if i.zero? # the first row is ignored
    entry = tr.xpath('td').map { |td| format(td.text) }
    next if entry.size < 2 # no label cell in this row
    label = entry[1].downcase
    if label.empty?
      @section = entry[0] unless entry[0].empty?
      next
    end
    parts = if label =~ /\d\z/
              [@section, entry[0], entry[2]] # subsection: prefix the section
            else
              [entry[0], entry[2]]           # stand-alone section
            end
    context = parts.compact.join('.')
    context.gsub!(/[ \u3000]/, '')   # remove blanks (ASCII and ideographic)
    context.gsub!('/FONT>', '定')    # repairs a broken tag in the source HTML
    # Append a full stop only when the text does not already end with one
    # (ASCII '.', full-width U+FF0E or ideographic U+3002).
    context << '.' unless context.end_with?('.', "\uFF0E", "\u3002")
    CONTEXTS[label] = context
  end
end
#puts JSON.pretty_generate(CONTEXTS)
#exit
############################################################
# ---- Example index: abc-all.html --------------------------------------------
# Reads abc-all.html (CP932 -> UTF-8) and merges every <tr> into DICT. The
# texts of a row's td/font nodes are taken positionally as: English headword,
# Japanese, part of speech, label, example sentence. Values already present in
# DICT are kept; the example is always appended to the sense's example list.
abc_all_path = File.join(JUDY_HOME, 'judy-paper/text/ex/abc-all.html')
html = open(abc_all_path, 'r:ascii-8bit', &:read)
html.encode!(Encoding::UTF_8, Encoding::CP932,
             :undef => :replace,
             :replace => '_',
             :universal_newline => true)
html.tr!("\n", ' ') # stray line breaks are turned into blanks for now
Nokogiri::HTML(html).xpath('//tr').each do |row|
  cells = row.xpath('td/font').map { |font| font.text }
  en, ja, pos, label, example = (0..4).map { |i| format(cells[i]) }
  key = en_word_to_key(en)

  record = (DICT[key] ||= {})
  record[:key] ||= key
  record[:title] ||= en
  senses = (record[:pos] ||= {})
  sense = (senses[pos] ||= {})
  sense[:en] ||= en
  sense[:ja] ||= ja
  sense[:description] ||= []
  sense[:example] ||= []

  usage = { :label => label }
  usage[:context] = CONTEXTS[label] if CONTEXTS.key?(label)
  usage[:en] = example
  sense[:example] << usage
end
#puts JSON.pretty_generate(DICT)
#exit
############################################################
# ---- Emit the Dictionary.app XML on standard output -------------------------
xml = REXML::Document.new
xml.context[:attribute_quote] = :quote
xml << REXML::XMLDecl.new('1.0', 'UTF-8')
dic = xml.add_element('d:dictionary', {
  'xmlns' => 'http://www.w3.org/1999/xhtml',
  'xmlns:d' => 'http://www.apple.com/DTDs/DictionaryService-1.0.rng'
})

##### Context labels: one entry per label, with the upper-cased label as id
CONTEXTS.each do |label, description|
  id = label.upcase
  entry = dic.add_element('d:entry', {
    'id' => id,
    'd:title' => description
  })
  entry.add_element('d:index', {
    'd:value' => id,
    'd:title' => id
  })
  entry.add_element('h1').add_text(id)
  entry.add_element('p').add_text(description)
end

##### Dictionary headwords
# Shape of each DICT value as read below:
#   value[:key], value[:title]
#   value[:pos]  => { pos => { :en, :ja,
#                              :description => [String, ...]  (may be absent),
#                              :example     => [{ :label, :context, :en, :ja }, ...]
#                                              (may be absent) } }
DICT.each do |_key, value|
  title = value[:title]
  entry = dic.add_element('d:entry', {
    'id' => value[:key],
    'd:title' => title
  })
  # Index the full title ...
  entry.add_element('d:index', {
    'd:value' => title,
    'd:title' => title
  })
  # ... and each word of it. Separators: blank , . / ; : and hyphen. The hyphen
  # sits last in the class so it is a literal rather than a range operator.
  title.split(/[ ,.\/;:-]/).reject { |w| w.empty? }.map { |w| w.downcase }.each do |word|
    entry.add_element('d:index', {
      'd:value' => word,
      'd:title' => title
    })
  end
  # Title
  entry.add_element('h1').add_text(title)
  # One section per part of speech
  value[:pos].each do |pos, pos_value|
    # Part-of-speech name
    entry.add_element('span', { 'class' => 'encl' }).add_text(pos)
    ul = entry.add_element('ul')
    # Meaning, followed by the usage notes when there are any
    descriptions = pos_value[:description]
    if descriptions && !descriptions.empty?
      ul.add_element('li').add_text([pos_value[:ja], '▼', descriptions].flatten.join(''))
    else
      ul.add_element('li').add_text(pos_value[:ja])
    end
    # Examples; a sense that never received one has no :example key at all
    (pos_value[:example] || []).each do |pos_example|
      li = ul.add_element('li')
      li.add_element('span').add_text(pos_example[:en])
      if pos_example.include?(:ja)
        li.add_element('br')
        li.add_element('span').add_text(pos_example[:ja])
      end
      # A label takes precedence over a free-text context.
      if pos_example.include?(:label)
        li.add_element('br')
        li.add_element('span', { 'class' => 'context' }).add_text(pos_example[:label].upcase)
      elsif pos_example.include?(:context)
        li.add_element('br')
        li.add_element('span', { 'class' => 'context' }).add_text(pos_example[:context])
      end
    end
  end
end
xml.write($stdout, 4)
puts
exit
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment