Skip to content

Instantly share code, notes, and snippets.

@sandmark
Created March 13, 2012 00:06
Show Gist options
  • Save sandmark/2025586 to your computer and use it in GitHub Desktop.
Save sandmark/2025586 to your computer and use it in GitHub Desktop.
Sixamo-Rails
class Dictionary < ActiveRecord::Base
after_initialize :dictionary_initialize
DICT_INIT = "line_num: 0\n"
attr_reader :text, :trie
LTL = 3
WindowSize = 500
# def Dictionary.load(dirname)
# dic = Dictionary.new(dirname)
# dic.load_text
# dic.load_dictionary
# dic
# end
def reset
self.textdata = ""
self.dict = DICT_INIT
end
def dictionary_initialize
@occur = Hash.new([])
@rel = {}
@trie = SixamoEngine::Trie.new
@text = []
@line_num = 0
self.dict = DICT_INIT if self.dict.blank?
load_text
load_dictionary
end
def load_text
# return unless File.readable?(@text_filename)
# File.open(@text_filename) do |fp|
self.textdata.each_line do |line|
line.chomp!
@text << line
end
end
def load_dictionary
# return unless File.readable?(@dic_filename)
# File.open(@dic_filename) do |fp|
# header
# fp.each do |line|
self.dict.each_line do |line|
line.chomp!
case line
when /^$/
break
when /line_num:\s*(.*)\s*$/i
@line_num = $1.to_i
else
logger.debug "[Warning] Unknown_header #{line}"
end
end
# body
# fp.each do |line|
self.dict.each_line do |line|
line.chomp!
word, num, sum, occur = line.split(/\t/)
if occur
@occur[word] = occur.split(/,/).collect { |l| l.to_i }
add_term(word)
@rel[word] = Hash.new(0)
@rel[word][:num] = num.to_i
@rel[word][:sum] = sum.to_i
end
end
end
def save_text
# tmp_filename = "#{@dirname}/sixamo.tmp.#{Process.pid}-#{rand(100)}"
# File.open(tmp_filename, 'w') do |fp|
# fp.puts @text
# end
# File.rename( tmp_filename, @text_filename )
logger.debug "WARNING: #{self.class}#save_text was called!"
self.textdata = @text.join("\n")
self.save!
end
def save_dictionary
# tmp_filename = "#{@dirname}/sixamo.tmp.#{Process.pid}-#{rand(100)}"
# File.open(tmp_filename, 'w') do |fp|
# fp.print self.to_s
# end
# File.rename( tmp_filename, @dic_filename )
logger.debug "WARNING: #{self.class}#save_dictionary was called!"
self.dict = self.to_s
self.save!
end
def learn_from_text(progress=nil)
modified = false
read_size = 0
buf_prev = []
end_flag = false
idx = @line_num
while true
buf = []
if progress
idx2 = read_size/WindowSize * WindowSize
if idx2 % 100_000 == 0
logger.debug "#{self.class}#learn_from_text: " +
sprintf("\n%5dk ", idx2/1000)
elsif idx2 % 20_000 == 0
logger.debug "#{self.class}#learn_from_text: *"
elsif idx2 % 2_000 == 0
logger.debug "#{self.class}#learn_from_text: ."
end
end
tmp = read_size
while tmp/WindowSize == read_size/WindowSize
if idx >= @text.size
end_flag = true
break
end
buf << @text[idx]
tmp += @text[idx].size
idx += 1
end
read_size = tmp
break if end_flag
if buf_prev.size > 0
learn(buf_prev+buf, @line_num)
modified = true
@line_num += buf_prev.size
end
buf_prev = buf
end
# STDERR.print "\n" if progress
modified
end
def store_text(lines)
ary = []
lines.each{ |line| ary << line.gsub(/\s+/, ' ').strip }
ary.each{ |line| @text << line }
logger.debug ary.inspect
self.textdata += "\n" if not self.textdata.blank?
self.textdata += ary.map(&:chomp).join("\n")
self.save!
end
def learn(lines,idx=nil)
new_terms = SixamoEngine::Freq.extract_terms(lines,30)
new_terms.each { |term| add_term(term) }
if idx
words_all = []
lines.each_with_index do |line,i|
num = idx + i
words = split_into_terms(line)
words_all.concat(words)
words.each do |term|
if @occur[term].empty? || num > @occur[term][-1]
@occur[term] << num
end
end
end
weight_update(words_all)
self.terms.each do |term|
occur = @occur[term]
size = occur.size
if size < 4 && size > 0 && occur[-1]+size*150 < idx
del_term(term)
end
end
end
end
def split_into_keywords(str)
result = Hash.new(0)
terms = split_into_terms(str)
terms.each do |w|
result[w] += self.weight(w)
end
result
end
def split_into_terms(str,num=nil)
@trie.split_into_terms(str,num)
end
def to_s
result = ""
# header
result << "line_num: #{@line_num}\n"
result << "\n"
@occur.delete_if { |k,v| v.size == 0 }
@occur.each { |k,v|@occur[k] = v[-100..-1] if v.size > 100 }
# body
tmp = @occur.keys.sort_by do |k|
[-@occur[k].size, @rel[k][:num], k.length, k]
end
tmp.each do |k|
result << format("%s\t\%s\t\%s\t%s\n",
k,
@rel[k][:num],
@rel[k][:sum],
@occur[k].join(','))
end
result
end
def weight_update(words)
width = 20
words.each do |term|
@rel[term] = Hash.new(0) unless @rel.key?(term)
end
size = words.size
(size-width).times do |idx1|
word1 = words[idx1]
(idx1+1).upto(idx1+width) do |idx2|
@rel[word1][:num] += 1 if word1 == words[idx2]
@rel[word1][:sum] += 1
end
end
(width+1).times do |idx1|
word1 = words[-idx1]
if word1
(idx1-1).downto(1) do |idx2|
@rel[word1][:num] += 1 if word1 == words[-idx2]
@rel[word1][:sum] += 1
end
end
end
end
def weight(word)
if !@rel.key?(word) || @rel[word][:sum] == 0
0
else
num = @rel[word][:num]
sum = @rel[word][:sum].to_f
num/(sum*(sum+100))
end
end
def lines(word)
@occur[word] || []
end
def terms
@occur.keys
end
def add_term(str)
@occur[str] = [] unless @occur.key?(str)
@trie.add(str)
@rel[str] = Hash.new(0) unless @rel.key?(str)
end
def del_term(str)
occur = @occur[str]
@occur.delete(str)
@trie.delete(str)
@rel.delete(str)
tmp = split_into_terms(str)
tmp.each { |w| @occur[w] = @occur[w].concat(occur).uniq.sort }
weight_update(tmp) if tmp.size > 0
end
end
# -*- coding: utf-8 -*-
# for ruby1.9
class String
alias :each :each_line
end
module SixamoEngine
def SixamoEngine.new(*args)
SixamoEngine::Core.new(*args)
end
def SixamoEngine.init_dictionary(dirname)
raise RuntimeError
dic = Dictionary.new(dirname)
dic.load_text
dic.learn_from_text(true)
dic
end
module Util
def Util.roulette_select(h)
return nil if h.empty?
sum = h.values.sum
return Util.random_select(h.keys) if sum == 0
r = rand*sum
h.each do |key,value|
r -= value
return key if r <= 0
end
Util.random_select(h.keys)
end
def Util.random_select(ary)
ary[rand(ary.size)]
end
def Util.message_normalize(str)
paren_h = {}
%w(「」 『』 () ()).each do |paren|
paren.scan(/./) do |ch|
paren_h[ch] = paren.scan(/./)
end
end
re = /[「」『』()()]/
ary = str.scan(re)
cnt = 0
paren = ''
str2 = str.gsub(re) do |ch|
res = if cnt == ary.size-1 && ary.size % 2 == 1
''
elsif cnt % 2 == 0
paren = paren_h[ch][1]
paren_h[ch][0]
else
paren
end
cnt += 1
res
end
str2.gsub!(/「」/,'')
str2.gsub!(/()/,'')
str2.gsub!(/『』/,'')
str2.gsub!(/\(\)/,'')
str2
end
def Util.markov(src,keywords,trie)
mar = markov_generate(src,trie)
result = markov_select(mar,keywords)
result
end
MarkovKeySize = 2
def markov_generate(src,trie)
return '' if src.size == 0
ary = trie.split_into_terms(src.join("\n")+"\n",true)
size = ary.size
ary.concat(ary[0,MarkovKeySize+1])
table = {}
size.times do |idx|
key = ary[idx,MarkovKeySize]
table[key] = [] unless table.key?(key)
table[key] << ary[idx+MarkovKeySize]
end
uniq = {}
backup = {}
table.each do |k,v|
if v.size == 1
uniq[k] = v[0]
else
backup[k] = table[k].dup
end
end
key = ary[0,MarkovKeySize]
result = key.join('')
10000.times do
if uniq.key?(key)
str = uniq[key]
else
table[key] = backup[key].dup if table[key].size == 0
idx = rand(table[key].size)
str = table[key][idx]
table[key][idx] = nil
table[key].compact!
end
result << str
key = (key.dup << str)[1,MarkovKeySize]
end
result
end
def markov_split(str)
result = []
while /\A(.{25,}?)([。、.,]+|[?!.,]+[\s ])[  ]*/.match(str)
match = Regexp.last_match
m = match[1]
m += match[2].gsub(/、/,'。').gsub(/,/,'.') if match[2]
result << m
str = match.post_match
end
result << str if str.size > 0
result
end
def markov_select(result, keywords)
tmp = result.split(/\n/) || ['']
result_ary = tmp.collect { |str| markov_split(str) }.flatten.uniq
result_ary.delete_if{|a| a.size == 0 || /\0/.match(a) }
result_hash = {}
trie = Trie.new(keywords.keys)
result_ary.each do |str|
terms = trie.split_into_terms(str).uniq
result_hash[str] = terms.collect{ |kw| keywords[kw] }.sum || 0
end
if $DEBUG
sum = result_hash.values.sum.to_f
tmp = result_hash.sort_by{ |k,v| [-v,k] }
puts "-(候補数: #{result_hash.size})----"
tmp[0,10].each do |k,v|
printf("%5.2f%%: %s\n", v/sum*100, k)
end
end
result = Util.roulette_select(result_hash)
result || ''
end
module_function :markov_select, :markov_generate, :markov_split
end
class Core
attr_accessor :dic
# def initialize(dirname)
# @dic = Dictionary.load(dirname)
# end
def initialize(dic)
@dic = dic
end
def talk(str=nil,weight={})
if str
keywords = @dic.split_into_keywords(str)
else
text = @dic.text
latest_text = if text.size < 10 then text else text[-10..-1] end
keywords = Hash.new(0)
latest_text.each do |str|
keywords.each { |k,v| keywords[k] *= 0.5 }
@dic.split_into_keywords(str).each { |k,v| keywords[k] += v }
end
end
weight.keys.each do |kw|
if keywords.key?(kw)
if weight[kw] == 0
keywords.delete(kw)
else
keywords[kw] *= weight[kw]
end
end
end
msg = message_markov(keywords)
if $DEBUG
sum = keywords.values.sum
tmp = keywords.sort_by{|k,v| [-v,k] }
puts "-(term)----"
tmp.each do |k,v|
printf " %s(%6.3f%%), ", k, v/sum*100
end
puts "\n----------"
end
msg
end
def memorize(lines)
@dic.store_text(lines)
if @dic.learn_from_text
@dic.save_dictionary
end
end
def message_markov(keywords)
lines = []
if keywords.size > 0
if keywords.size > 10
keywords.sort_by{|k,v| -v}[10..-1].each do |k,v|
keywords.delete(k)
end
end
sum = keywords.values.sum
if sum > 0
keywords.each { |k,v| keywords[k] = v/sum }
end
keywords.keys.collect do |kw|
ary = @dic.lines(kw).sort_by{ rand }
ary[0,10].each do |idx|
lines << idx
end
end.flatten
end
10.times { lines << rand(@dic.text.size) }
lines.uniq!
source = lines.collect do |k,v|
@dic.text[k,5]
end.sort_by{ rand }.flatten.compact.uniq
msg = Util.markov(source, keywords, @dic.trie)
msg = Util.message_normalize(msg)
msg
end
end
class Freq
def Freq.extract_terms(buf,limit)
Freq.new(buf).extract_terms(limit)
end
def initialize(buf)
buf = buf.join("\0") if buf.kind_of?(Array)
@buf = buf
end
def extract_terms(limit)
terms = extract_terms_sub(limit)
terms = terms.collect {|t,n| [t.reverse.strip,n] }.sort
terms2 = []
(terms.size-1).times do |idx|
if terms[idx][0].size >= terms[idx+1][0].size ||
terms[idx][0] != terms[idx+1][0][0,terms[idx][0].size]
terms2 << terms[idx]
elsif terms[idx][1] >= terms[idx+1][1] + 2
terms2 << terms[idx]
end
end
terms2 << terms[-1] if terms.size > 0
terms2.collect {|t,n| t.reverse }
end
def extract_terms_sub(limit,str='',num=1,width=false)
h = freq(str)
flag = (h.size <= 4)
result = []
if limit > 0
h.delete(str) if h.key?(str)
h.to_a.delete_if { |k,v| v < 2 }.sort.each do |k,v|
result.concat( extract_terms_sub(limit-1, k, v, flag) )
end
end
if result.size == 0 && width
return [[str.downcase,num]]
end
result
end
def freq(str)
freq = Hash.new(0)
if str.size == 0
regexp = /([!-~])[!-~]*|([ァ-ヴ])[ァ-ヴー]*|([^ー\0])/i
@buf.scan(regexp) { |ary| freq[ary[0] || ary[1] || ary[2]] += 1 }
else
regexp = /#{Regexp.quote(str)}[^\0]?/i
@buf.scan(regexp) { |str| freq[str] += 1 }
end
freq
end
end
class Trie
def initialize(ary=nil)
@root = {}
if ary
ary.each { |elm| self.add(elm) }
end
end
def add(str)
node = @root
str.each_byte do |b|
node[b] = {} unless node.key?(b)
node = node[b]
end
node[:terminate] = true
end
def member?(str)
node = @root
str.each_byte do |b|
return false unless node.key?(b)
node = node[b]
end
node.key?(:terminate)
end
def members
members_sub(@root)
end
def members_sub(node,str='')
node.collect do |k,v|
if k == :terminate
str
else
members_sub(v,str+k.chr)
end
end.flatten
end
private :members_sub
def split_into_terms(str,num=nil)
result = []
return result unless str
while str.size > 0 && ( !num.kind_of?(Numeric) || result.size < num )
prefix = longest_prefix_subword(str)
if prefix
result << prefix
str = str[prefix.size..-1]
else
chr = /./m.match(str)[0]
result << chr if num
str = Regexp.last_match.post_match
end
end
result
end
def longest_prefix_subword(str)
node = @root
result = nil
idx = 0
str.each_byte do |b|
result = str[0,idx] if node.key?(:terminate)
return result unless node.key?(b)
node = node[b]
idx += 1
end
result = str if node.key?(:terminate)
result
end
def delete(str)
node = @root
ary = []
str.each_byte do |b|
return false unless node.key?(b)
ary << [node,b]
node = node[b]
end
return false unless node.key?(:terminate)
ary << [node,:terminate]
ary.reverse.each do |node,b|
node.delete(b)
break unless node.empty?
end
true
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment