Skip to content

Instantly share code, notes, and snippets.

@pnlybubbles
Created July 25, 2015 05:27
Show Gist options
  • Save pnlybubbles/41adcbfa0cde3729bf43 to your computer and use it in GitHub Desktop.
Save pnlybubbles/41adcbfa0cde3729bf43 to your computer and use it in GitHub Desktop.
形態素解析と特徴分析を適当にやってみた。学習データはTwitterから。
require 'json'
require 'yaml'

require 'natto'
require 'twitter'
# Twitter credentials are read from key_token.yml in the current directory.
credentials = YAML.load_file(File.expand_path('./key_token.yml'))

# Shared Twitter REST client, used for every tweet search below.
$api = Twitter::REST::Client.new do |client|
  client.consumer_key = credentials['consumer_key']
  client.consumer_secret = credentials['consumer_secret']
  client.access_token = credentials['access_token']
  client.access_token_secret = credentials['access_token_secret']
end

# Shared MeCab tagger used by parse_text.
$natto = Natto::MeCab.new

# Co-occurrence table keyed by token surface. It starts empty unless a
# previous session left a data.json dump behind.
puts "loading"
$data1 = File.exist?('data.json') ? JSON.parse(File.read('data.json')) : {}
# Runs +text+ through the shared MeCab tagger ($natto) and returns its tokens.
#
# Each token is an Array whose first element is the surface string, followed
# by the comma-separated fields of the node's feature string. Nodes with an
# empty surface are dropped, and only tokens whose surface contains at least
# one hiragana, katakana or kanji character are returned.
def parse_text(text)
  tokens = []
  $natto.parse(text) do |node|
    next if node.surface.empty?

    tokens.push([node.surface, *node.feature.split(',')])
  end
  tokens.select { |token| token[0] =~ /[ぁ-んァ-ヶー一-龠]+/ }
end
# Records co-occurrence statistics for every token of +text+ in $data1.
#
# For each token surface, $data1 holds three counters keyed by surface:
#   'backward' - how often each surface appeared directly before it
#   'forward'  - how often each surface appeared directly after it
#   'relative' - how often each other token of the same text whose part of
#                speech (token[1]) is 名詞 appeared together with it
# A missing neighbour (start or end of the text) is counted under the
# empty-string key ''.
#
# Returns the token array produced by parse_text.
def text_to_vec(text)
  tokens = parse_text(text)
  tokens.each_with_index do |token, i|
    surface = token[0]
    # The boundary marker must be the String '' rather than nil: JSON stores
    # hash keys as strings, so a nil key would come back as '' after a
    # save/load cycle and the boundary count would be split over two keys.
    backward = i.zero? ? '' : tokens[i - 1][0]
    forward = i == tokens.length - 1 ? '' : tokens[i + 1][0]

    entry = ($data1[surface] ||= {
      'backward' => {},
      'forward' => {},
      'relative' => {}
    })
    entry['backward'][backward] = (entry['backward'][backward] || 0) + 1
    entry['forward'][forward] = (entry['forward'][forward] || 0) + 1

    # Every other noun in the same text counts as related, regardless of
    # how far away it is. The token itself is excluded by position, so a
    # repeated noun still counts its other occurrences.
    tokens.each_with_index do |other, j|
      next if i == j
      next if other[1] != '名詞'

      entry['relative'][other[0]] = (entry['relative'][other[0]] || 0) + 1
    end
  end
end
# Keeps only the strongest entries of a list of [key, score] pairs.
#
# The pairs are ordered by descending score and accepted one by one for as
# long as the running total of accepted scores stays at or below half of the
# sum of all scores. The first pair that pushes the running total past that
# limit, and everything after it, is discarded.
#
# Returns the accepted pairs in descending score order (possibly empty).
def denoise(arr)
  total = arr.map { |entry| entry[1] }.reduce(:+)
  ranked = arr.sort { |a, b| b[1] <=> a[1] }

  kept = []
  running = 0
  ranked.each do |entry|
    running += entry[1]
    break unless running <= total / 2

    kept << entry
  end
  kept
end
# Computes the inner product of two sparse vectors.
#
# vec1, vec2 - Arrays of [component, weight] pairs (e.g. Hash#to_a output).
#              If a component occurs more than once in one vector, only its
#              first pair is used. A nil weight counts as 0.
#
# Returns the sum of weight1 * weight2 over all components; components
# present in only one vector contribute 0.
def inner_prd(vec1, vec2)
  # Index each vector once so every component lookup is a hash access
  # instead of a linear scan of both vectors per component.
  index = lambda do |vec|
    vec.each_with_object({}) do |pair, table|
      table[pair[0]] = pair[1] unless table.key?(pair[0])
    end
  end
  weights1 = index.call(vec1)
  weights2 = index.call(vec2)

  (weights1.keys | weights2.keys).inject(0) do |prd, comp|
    prd + (weights1[comp] || 0) * (weights2[comp] || 0)
  end
end
# Scales every element of +arr+ by the largest element, as Floats.
#
# Returns a new Array in which the maximum maps to 1.0. An empty input gives
# an empty Array; an input whose maximum is 0 gives NaN entries (0.0 / 0).
def normalize(arr)
  peak = arr.max
  arr.each_with_object([]) { |value, scaled| scaled << value.to_f / peak }
end
# Multiplies several arrays element by element.
#
# arr1 - the first Array; its length determines the length of the result.
# arr2 - any number of further Arrays to multiply in, position by position.
#
# Returns an Array whose i-th element is the product of the i-th elements of
# all given arrays. With no extra arrays the elements of arr1 are returned
# unchanged.
def arr_prd(arr1, *arr2)
  rows = arr1.zip(*arr2)
  rows.map { |row| row.reduce { |product, factor| product * factor } }
end
# Searches Twitter for +query+ and feeds the text of every returned status
# into the co-occurrence table via text_to_vec.
def learn_from_search(query)
  $api.search(query).to_h[:statuses].each { |status| text_to_vec(status[:text]) }
end

# Chooses the word that should follow the word described by +word_data+.
#
# word_data     - the $data1 entry of the current word.
# target_r_vecs - denoised 'relative' vectors of the topic words; a follower
#                 scores higher the more its own relative vector overlaps
#                 with them (inner product).
# used          - words already in the chain; they are never chosen again.
#
# Only followers that have their own $data1 entry can be scored. The score
# of a follower is its normalized forward count multiplied by its normalized
# inner product with each target vector.
#
# Returns [word, score] for the best-scoring unused follower, or nil when no
# follower can be scored (no candidates, all used, or all scores are NaN
# because no candidate overlaps with a target).
def best_follower(word_data, target_r_vecs, used)
  # candidates, f_comp and prds are built in lock-step so that position i in
  # the score list always refers to candidates[i].
  candidates = []
  f_comp = []
  prds = []
  # to_a takes a snapshot, so the table may grow while we iterate.
  word_data['forward'].to_a.each do |word, count|
    next unless $data1[word]

    current_r_vec = denoise($data1[word]['relative'].to_a)
    candidates << word
    f_comp << count
    prds << target_r_vecs.map { |vec| inner_prd(current_r_vec, vec) }
  end

  prds_tr_nrm = prds.transpose.map { |column| normalize(column) }
  scores = arr_prd(normalize(f_comp), *prds_tr_nrm)

  scored = candidates.zip(scores).reject do |word, score|
    score.nan? || used.include?(word)
  end
  scored.max_by { |_word, score| score }
end

# Handles one line of user input: learns from tweets matching the input's
# content words, then prints a chain of up to 10 words in which each word is
# the best-scoring follower of the previous one.
#
# input     - the raw line typed by the user.
# rel_depth - how many of the top related words are used as topic words.
def build_word_chain(input, rel_depth)
  # Content words of the input. For 形容詞 / 動詞 the 8th field is used
  # instead of the surface — presumably the base form in the MeCab
  # dictionary's feature layout; TODO confirm against the dictionary in use.
  noun = parse_text(input)
         .select { |v| %w[名詞 形容詞 動詞 接頭詞].include?(v[1]) }
         .map { |v| %w[形容詞 動詞].include?(v[1]) ? v[7] : v[0] }
  p noun
  if noun.empty?
    puts 'no noun found'
    return
  end

  learn_from_search(noun.join(' '))

  # Words related to the input, strongest first.
  rel = noun.select { |v| $data1[v] }
            .flat_map { |v| $data1[v]['relative'].to_a }
            .sort { |a, b| b[1] <=> a[1] }
            .map { |v| v[0] }

  ret = []
  high_scores = []
  w = rel[0]
  10.times do
    break unless w

    ret << w
    p w
    targets = rel[0, rel_depth]
    targets.each { |r| learn_from_search("#{r} #{w}") unless $data1[r] }
    data = $data1[w]
    break unless data

    target_r_vecs = targets.select { |r| $data1[r] }
                           .map { |r| denoise($data1[r]['relative'].to_a) }
    w, high_score = best_follower(data, target_r_vecs, ret)
    break unless w

    high_scores << high_score
    puts "(#{high_score})"
  end

  print "\n"
  puts ret.join(' ')
  average = high_scores.empty? ? 0.0 : high_scores.inject(:+) / high_scores.length
  puts "score: #{average}"
  p rel[0, 10]
  puts 'end'
end

puts "started\n\"quit\" or \"exit\" to terminate"
workers = []
rel_depth = 1
loop do
  line = gets
  break if line.nil? # end of input: stop reading but still save below

  input = line.strip
  break if input == 'quit' || input == 'exit'

  # Each query runs in its own thread so the prompt stays responsive while
  # the Twitter searches are in flight.
  worker = Thread.new(input) do |query|
    begin
      build_word_chain(query, rel_depth)
    rescue StandardError => e
      puts e
      puts e.backtrace
    end
  end
  workers << worker
end

# Let every query finish before dumping, so the table is not written while a
# worker is still adding to it.
workers.each(&:join)
puts 'saving...'
File.write('data.json', JSON.generate($data1))
puts 'terminate'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment