Created
April 1, 2015 11:07
-
-
Save shun91/d4601920027ecbe6694f to your computer and use it in GitHub Desktop.
wikipediaとhatenaのキーワードファイルからMecabの辞書フォーマット(csv)に変換するスクリプト.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Adapted, with minor modifications, from the code published at:
# http://sugamasao.hatenablog.com/entry/2014/11/17/000355
#
# Reads keyword/title list files (one entry per line) and writes a CSV file
# whose rows follow the 14-column layout built in `dictionary_row`.
require 'csv'

# Input files, keyed by the source label written to the last column of every
# generated row. Files that do not exist are silently skipped.
original_data = {
  'wikipedia' => 'jawiki-latest-all-titles-in-ns0',
  'hatena' => 'keywordlist_furigana.csv'
}

# A title matching any of these patterns is left out of the dictionary.
EXCLUDED_TITLE_PATTERNS = [
  # Starts with one of these symbol characters. The hyphen is placed first so
  # it is a literal rather than an accidental range.
  %r{\A[-+.$()?*/&%!"'_,]},
  # Consists solely of digits, dots and hyphens.
  /\A[-.0-9]+\z/,
  # Contains "曖昧さ回避" ("disambiguation").
  /曖昧さ回避/,
  # Contains an underscore followed by "(" (presumably a parenthesised
  # qualifier appended to a title -- TODO confirm against the input data).
  /_\(/,
  # Starts with the "PJ:" prefix.
  /\APJ:/,
  # Contains "の登場人物" ("characters of ...").
  /の登場人物/,
  # Contains "一覧" ("list").
  /一覧/
].freeze

# Tells whether a title must be excluded from the dictionary.
#
# @param title [String] a stripped, single-line title
# @return [Boolean] true when the title matches an exclusion pattern
def skip_title?(title)
  EXCLUDED_TITLE_PATTERNS.any? { |pattern| pattern =~ title }
end

# Computes the value written to the fourth column for a title of the given
# length: -400 * length^1.5, floored at -36000 and truncated to an Integer.
#
# @param length [Integer] number of characters in the title
# @return [Integer]
def title_score(length)
  [-36000.0, -400 * (length**1.5)].max.to_i
end

# Builds one CSV row for a title.
#
# @param title [String] the entry, used for both the first and eleventh column
# @param type [String] source label stored in the last column
# @return [Array] the 14 fields of the row
def dictionary_row(title, type)
  [title, nil, nil, title_score(title.length), '名詞', '一般', '*', '*', '*', '*', title, '*', '*', type]
end

# Converts every existing source file into rows of a single CSV file.
#
# Each line of a source file is stripped and treated as one title. Titles that
# match an exclusion pattern, or that are three characters or shorter, are
# dropped.
#
# @param sources [Hash{String => String}] source label => input file path
# @param output_path [String] CSV file to create (overwritten if present)
# @return [void]
def build_dictionary(sources, output_path)
  CSV.open(output_path, 'w') do |csv|
    sources.each do |type, filename|
      next unless File.file?(filename)

      # File.foreach closes the file once iteration finishes, unlike a bare
      # `open(filename).each`, which leaves the handle open.
      File.foreach(filename) do |line|
        title = line.strip
        next if skip_title?(title)
        next unless title.length > 3

        csv << dictionary_row(title, type)
      end
    end
  end
end

build_dictionary(original_data, 'onomasticon.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment