Created
May 10, 2013 09:08
-
-
Save masao/5553353 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Column layout noted by the original author (translated from Japanese;
# presumably the header row of the tab-separated input -- confirm against a
# real export). Zero-based index in brackets:
#   [0] management number, [1] user ID, [2] family name, [3] given name,
#   [4] deletion flag, [5] type, [6] (deleted column), [7] (deleted column),
#   [8] contribution category, [9] title (Japanese), [10] title (English),
#   [11] status, [12] application number, [13] application number (English),
#   [14] application year, [15] application month, [16] application day,
#   [17] publication number, [18] publication number (English),
#   [19] publication year, [20] publication month, [21] publication day,
#   [22] registration number, [23] registration number (English),
#   [24] registration year, [25] registration month, [26] registration day,
#   [27] fiscal year of record, [28] applicant list, [29] inventor list,
#   [30] summary, [31] domestic/foreign, [32] filing country name,
#   [33] (deleted column), [34] commercialization: company name,
#   [35] commercialization: details, [36] disclosure target,
#   [37] selected for ReaD, [38] modifying user ID, [39] administrator update date,
#   [40] owner update date, [41] update date, [42] data registration date,
#   [43] (deleted column), [44] (deleted column), [45] country code

# Title prefix / suffix => list of record IDs seen with that key; the first
# ID in each list is the record every later match is reported against.
prefix_done = {}
suffix_done = {}

# Raw input lines classified as first occurrences / as duplicates.
new = []
duplicates = []

# Duplicate record ID => ID of the first record sharing its prefix or suffix.
dup_ids = {}

# Default number of leading / trailing title characters compared.
prefix_size = 10
suffix_size = 10
# Returns the first +size+ characters of +str+.
#
# str  - String to take the prefix from.
# size - Number of leading characters to keep (default 10).
#
# A string shorter than +size+ is returned whole (as a new String).
# A negative +size+ yields nil, following String#[] semantics.
def prefix(str, size = 10)
  str[0, size]
end
# Returns the last +size+ characters of +str+.
#
# str  - String to take the suffix from.
# size - Number of trailing characters to keep (default 10).
#
# Mirrors #prefix: a string shorter than +size+ is returned whole (as a new
# String) rather than nil, and a +size+ of 0 yields "" rather than the whole
# string. A negative +size+ yields nil, following String#[] semantics.
def suffix(str, size = 10)
  # Clamp the start index at 0 so short strings do not fall off the front.
  start = [str.length - size, 0].max
  str[start, size]
end
# Consumes leading "-option" arguments from +argv+ (mutating it) and returns
# the resulting [prefix_size, suffix_size] pair.
#
# argv        - Argument array; recognised options and their values are
#               shifted off the front, the remaining entries are left in place.
# prefix_size - Value returned when "-prefix N" is not given.
# suffix_size - Value returned when "-suffix N" is not given.
#
# Values are converted with to_i, so a missing or non-numeric value becomes 0.
# Unrecognised options are dropped, with a warning on STDERR.
def parse_size_options(argv, prefix_size, suffix_size)
  while argv.first && argv.first =~ /\A-/
    opt = argv.shift
    case opt
    when "-prefix"
      prefix_size = argv.shift.to_i
    when "-suffix"
      suffix_size = argv.shift.to_i
    else
      STDERR.puts "Ignoring unknown option: #{opt}"
    end
  end
  [prefix_size, suffix_size]
end

# Options must precede the input file names; what is left in ARGV is read
# through ARGF later on.
prefix_size, suffix_size = parse_size_options(ARGV, prefix_size, suffix_size)
STDERR.puts "Prefix size: #{ prefix_size }"
STDERR.puts "Suffix size: #{ suffix_size }"
# Number of duplicates detected, broken down by which key matched.
counts = { :prefix => 0, :suffix => 0 }

# Input is read as Shift_JIS where the running Ruby supports encodings.
ARGF.set_encoding("Shift_JIS") if ARGF.respond_to?(:set_encoding)

# Discard the first line of input (presumably the header row).
ARGF.gets

# Classify every remaining row as new or duplicate by comparing the leading
# and trailing characters of its title (column index 9) with earlier rows.
ARGF.each do |line|
  data = line.chomp.split(/\t/)
  data_id = data.first
  raw_title = data[9]

  # Rows with fewer than 10 columns (e.g. a blank trailing line) have no
  # title to compare; skip them instead of failing on nil.
  if raw_title.nil?
    STDERR.puts "Skipping line #{ ARGF.lineno }: no title column"
    next
  end

  # Strip one surrounding pair of double quotes and outer whitespace.
  title = raw_title.sub(/\A"/, "").sub(/"\z/, "").strip
  head = prefix(title, prefix_size)
  tail = suffix(title, suffix_size)

  if head && prefix_done[head]
    # Same leading characters as an earlier row.
    duplicates << line
    prefix_done[head] << data_id
    dup_ids[data_id] = prefix_done[head].first
    counts[:prefix] += 1
  elsif tail && suffix_done[tail]
    # Same trailing characters as an earlier row.
    duplicates << line
    suffix_done[tail] << data_id
    dup_ids[data_id] = suffix_done[tail].first
    counts[:suffix] += 1
  else
    # First occurrence: register both keys so later rows can match it.
    new << line
    (prefix_done[head] ||= []) << data_id
    (suffix_done[tail] ||= []) << data_id
  end
end
# Summary statistics, written to standard output via Kernel#p (so the strings
# appear in their quoted, inspected form).
p "New: #{ new.size }"
p "Duplicates: #{ duplicates.size }"
p counts

# Dump of every registered suffix key, one per line.
p "suffix"
puts suffix_done.keys
# Emit each duplicate row prefixed by the ID of the record it duplicates,
# separated by a tab. Rows still carry their original line terminator, so
# puts does not append another newline.
duplicates.each do |row|
  # Only the first field (the record ID) is needed for the lookup.
  data_id = row.split(/\t/, 2).first
  puts "#{ dup_ids[ data_id ] }\t#{ row }"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment