Created
July 25, 2015 05:27
-
-
Save pnlybubbles/41adcbfa0cde3729bf43 to your computer and use it in GitHub Desktop.
形態素解析と特徴分析を適当にやってみた。学習データはtwitterから。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'natto'
require 'twitter'
require 'yaml'
# FIX: JSON.parse / JSON.generate are used below, but requiring 'yaml' does
# not guarantee the JSON library is loaded — require it explicitly.
require 'json'

# Twitter API credentials are read from key_token.yml next to this script.
key_data = YAML.load_file(File.expand_path('./key_token.yml'))
$api = Twitter::REST::Client.new do |config|
  config.consumer_key = key_data['consumer_key']
  config.consumer_secret = key_data['consumer_secret']
  config.access_token = key_data['access_token']
  config.access_token_secret = key_data['access_token_secret']
end

# MeCab morphological analyzer (default dictionary).
$natto = Natto::MeCab.new

# Learned co-occurrence statistics, persisted across runs in data.json.
$data1 = {}
puts "loading"
$data1 = JSON.parse(File.read('data.json')) if File.exist?('data.json')
# Tokenize +text+ with MeCab and keep only tokens that contain Japanese
# characters (hiragana, katakana, or kanji).
# Returns an array of token arrays: [surface, pos, pos_sub1, ...].
def parse_text(text)
  tokens = []
  $natto.parse(text) do |node|
    # puts "#{node.surface}\t#{node.feature}"
    next if node.surface.empty?
    tokens.push([node.surface, *node.feature.split(',')])
  end
  tokens.select { |token| token[0] =~ /[ぁ-んァ-ヶー一-龠]+/ }
end
# Fold one text's tokens into the global statistics hash $data1.
# For every token surface it accumulates three histograms:
#   'backward' — surface of the token immediately before it,
#   'forward'  — surface of the token immediately after it,
#   'relative' — every noun co-occurring anywhere in the same text.
# At the text boundaries the neighbour is the empty string, and ''[0]
# evaluates to nil, so boundary occurrences are counted under the nil key.
def text_to_vec(text)
  arr = parse_text(text)
  arr.each_with_index { |v, i|
    # Neighbouring tokens ('' sentinel at either end of the token list).
    backward = i == 0 ? '' : arr[i - 1]
    forward = i == arr.length - 1 ? '' : arr[i + 1]
    # Lazily create the per-token record.
    $data1[v[0]] ||= {
      'backward' => {},
      # 'backward_part' => {},
      'forward' => {},
      # 'forward_part' => {},
      'relative' => {}
    }
    # backward/forward are token arrays, so [0] is the surface string
    # (or nil when the '' sentinel is in use).
    $data1[v[0]]['backward'][backward[0]] ||= 0
    $data1[v[0]]['backward'][backward[0]] += 1
    # $data1[v[0]]['backward_part'][backward[1]] ||= 0
    # $data1[v[0]]['backward_part'][backward[1]] += 1
    $data1[v[0]]['forward'][forward[0]] ||= 0
    $data1[v[0]]['forward'][forward[0]] += 1
    # $data1[v[0]]['forward_part'][forward[1]] ||= 0
    # $data1[v[0]]['forward_part'][forward[1]] += 1
    # Count every other noun (名詞) in the text as a co-occurrence.
    arr.each_with_index { |v_, i_|
      next if i == i_
      next if v_[1] != '名詞'
      $data1[v[0]]['relative'][v_[0]] ||= 0
      $data1[v[0]]['relative'][v_[0]] += 1
    }
  }
end
# Trim the long low-score tail of a [key, score] vector: keep only the
# highest-scoring entries whose cumulative score stays within half of
# the total score (integer division, matching the original).
def denoise(arr)
  total = arr.map { |entry| entry[1] }.reduce(:+)
  running = 0
  arr.sort_by { |entry| -entry[1] }.take_while do |entry|
    running += entry[1]
    running <= total / 2
  end
end
# Inner (dot) product of two sparse vectors given as [key, count] pairs.
# Keys missing from either vector contribute 0.
# PERF: the original scanned each vector with +find+ for every key of the
# union (O(n*m)); indexing both vectors by key makes this O(n + m).
# Duplicate keys keep their first occurrence, matching +find+ semantics.
def inner_prd(vec1, vec2)
  h1 = {}
  vec1.each { |key, count| h1[key] ||= count }
  h2 = {}
  vec2.each { |key, count| h2[key] ||= count }
  # Keys present only in vec2 would multiply by 0, so summing over h1 suffices.
  h1.sum { |key, count| (count || 0) * (h2[key] || 0) }
end
# Scale every element into [0, 1] relative to the largest value.
# NOTE(review): a max of 0 yields NaN/Infinity here; the caller relies on
# the .nan? check downstream — confirm that is intentional.
def normalize(arr)
  peak = arr.max
  arr.map { |value| value.fdiv(peak) }
end
# Element-wise product across all given arrays (length taken from arr1).
def arr_prd(arr1, *arr2)
  arr1.each_index.map do |i|
    arr2.reduce(arr1[i]) { |product, other| product * other[i] }
  end
end
puts "started\n\"quit\" or \"exit\" to terminate"
process = nil
rel_depth = 1 # how many of the top related nouns to use when scoring candidates
loop {
  input = gets.strip
  break if input == 'quit' || input == 'exit'
  # Each input line is processed on its own thread so the prompt stays responsive.
  # NOTE(review): threads are never joined, so a thread may still be running (and
  # its learned data lost) when this loop exits and the save below runs — confirm.
  process = Thread.new {
    begin
      arr = parse_text(input)
      # Keep content words (MeCab POS tags: 名詞=noun, 形容詞=adjective,
      # 動詞=verb, 接頭詞=prefix).
      noun = arr.find_all { |v| v[1] == '名詞' || v[1] == '形容詞' || v[1] == '動詞' || v[1] == '接頭詞' }.map { |v|
        if v[1] == '形容詞' || v[1] == '動詞'
          # For inflected words use feature field 7 (presumably the
          # dictionary/base form — TODO confirm against the MeCab dictionary).
          v[7]
        else
          v[0]
        end
      }
      p noun
      if noun.length == 0
        puts 'no noun found'
        # SystemExit is swallowed by the rescue below (e.to_s == 'exit'),
        # so this only terminates the thread, not the process.
        exit 1
      end
      # Learn co-occurrence statistics from tweets matching the extracted words.
      $api.search(noun.join(' ')).to_h[:statuses].each { |v|
        text_to_vec(v[:text])
      }
      # Collect nouns seen together with the input words, most frequent first.
      rel_vec = []
      noun.each { |v|
        next unless $data1[v]
        rel_vec += $data1[v]['relative'].to_a
      }
      rel_vec = rel_vec.sort { |a, b|
        b[1] <=> a[1]
      }
      rel = rel_vec.map { |v| v[0] }
      ret = []
      high_scores = []
      # Walk a chain of up to 10 words, starting from the strongest related noun.
      w = rel[0]
      10.times { |i|
        ret << w
        p w
        # Fetch tweets for any top related word we have no data for yet.
        rel[0, rel_depth].each { |r|
          unless $data1[r]
            $api.search("#{r} #{w}").to_h[:statuses].each { |v|
              text_to_vec(v[:text])
            }
          end
        }
        data = $data1[w]
        break unless data
        w = nil
        # Candidate next words: token surfaces observed immediately after w.
        f_vec = data['forward'].to_a
        # Noise-trimmed 'relative' vectors of the current top related words.
        target_r_vecs = []
        rel[0, rel_depth].each { |r|
          next unless $data1[r]
          target_r_vecs << denoise($data1[r]['relative'].to_a)
        }
        f_comp = []
        prds = []
        # Score each known candidate's topical similarity (inner product of
        # 'relative' vectors) against every target vector.
        f_vec.each { |v|
          next unless $data1[v[0]]
          current_r_vec = denoise($data1[v[0]]['relative'].to_a)
          c_prds = []
          target_r_vecs.each { |vec|
            c_prds << inner_prd(current_r_vec, vec)
          }
          # p [prd, v[1], prd * v[1], v[0]]
          f_comp << v[1]
          prds << c_prds
        }
        # Final score = normalized follow-frequency * normalized similarity.
        prds_tr_nrm = prds.transpose.map { |v| normalize(v) }
        scores = arr_prd(normalize(f_comp), *prds_tr_nrm)
        high_score = 0
        # Pick the best-scoring candidate not already in the output chain.
        loop {
          high_score = scores.max
          # NOTE(review): if scores is empty, max is nil and .nan? raises
          # NoMethodError, which is absorbed by the rescue below — confirm.
          break if high_score.nan? # all-zero similarities normalize to NaN
          index = scores.index(high_score)
          # NOTE(review): scores was built only from candidates present in
          # $data1, but f_vec here is the unfiltered list — the indices can
          # disagree whenever any candidate was skipped above; verify.
          break unless f_vec[index]
          w = f_vec[index][0]
          if ret.index(w)
            scores.delete_at(index)
            f_vec.delete_at(index)
          else
            break
          end
        }
        high_scores << high_score
        puts "(#{high_score})"
        break unless w
      }
      print "\n"
      puts ret.join(' ')
      puts "score: #{high_scores.inject(:+) / high_scores.length}"
      p rel[0, 10]
      puts 'end'
    rescue Exception => e
      # NOTE(review): rescuing Exception also swallows SystemExit and
      # SignalException; the string check exists to ignore the `exit 1` above.
      unless e.to_s == 'exit'
        puts e
        puts e.backtrace
      end
    end
  }
}
# Persist the learned statistics for the next run.
puts 'saving...'
File.write('data.json', JSON.generate($data1))
puts 'terminate'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment