Skip to content

Instantly share code, notes, and snippets.

@shouya
Created June 28, 2015 21:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shouya/9ebfdcaf3a814a6debf3 to your computer and use it in GitHub Desktop.
Save shouya/9ebfdcaf3a814a6debf3 to your computer and use it in GitHub Desktop.
a stupid bigram-based segmenter
# Collapses every run of ASCII characters into a single space and keeps all
# non-ASCII characters untouched.
#
# A leading ASCII run also becomes a single space, so the result may start
# (and end) with ' '.
#
# @param txt [String] raw input text
# @return [String] text in which each maximal ASCII run is replaced by ' '
def clean(txt)
  # True while we are inside an ASCII run whose separator was already emitted.
  in_ascii_run = false
  kept = txt.each_char.each_with_object([]) do |ch, out|
    if ch.ascii_only?
      out << ' ' unless in_ascii_run
      in_ascii_run = true
    else
      out << ch
      in_ascii_run = false
    end
  end
  kept.join
end
# Lists every pair of adjacent characters inside each whitespace-separated
# word of +str+. Pairs never span a word boundary; one-character words
# contribute nothing.
#
# @param str [String] whitespace-separated text
# @return [Array<Array<String>>] two-element arrays [first_char, second_char]
def bigram(str)
  str.split.flat_map do |word|
    word.each_char.each_cons(2).to_a
  end
end
# Counts how often each (first, second) pair occurs.
#
# @param bigram [#to_a] collection of two-element pairs
# @return [Hash] nested table: first element => { second element => count };
#   the inner hashes default to 0 for unseen second elements
def stat(bigram)
  bigram.to_a.each_with_object({}) do |(head, tail), counts|
    row = (counts[head] ||= Hash.new(0))
    row[tail] += 1
  end
end
# Finds the largest count stored anywhere in a two-level table.
#
# @param tbl [Hash] nested table: key => { key => count }
# @return [Numeric] the largest inner value, or 0 when the table holds no
#   value greater than 0
def maxfreq(tbl)
  tbl.reduce(0) do |best, (_outer_key, row)|
    row.reduce(best) do |top, (_inner_key, count)|
      top < count ? count : top
    end
  end
end
# Rescales raw pair counts to log(count) / log(maxfreq), so the most frequent
# pair scores 1.0 and a pair seen exactly once scores 0.0.
#
# The previous implementation divided both logarithms by log(len) before
# taking their ratio. That factor cancels out, but it turned every score into
# NaN when +len+ was 0 or 1, so the ratio is now computed directly.
#
# @param len [Numeric] text length; kept for interface compatibility, it no
#   longer influences the result
# @param maxfreq [Numeric] largest count in +tbl+
# @param tbl [Hash] nested table: first char => { second char => count }
# @return [Hash] new nested table with the same keys and Float scores; inner
#   hashes default to 0 for unseen pairs, and outer keys whose row is empty
#   are omitted
def normalize(len, maxfreq, tbl)
  log_max = Math.log(maxfreq)
  new_tbl = {}
  tbl.each do |k1, row|
    row.each do |k2, count|
      new_tbl[k1] ||= Hash.new(0)
      # log(1) == 0 would make the ratio 0/0 (NaN); when the maximum count is
      # 1 every stored pair is as frequent as the maximum, so score it 1.0.
      new_tbl[k1][k2] = log_max.zero? ? 1.0 : Math.log(count.to_f) / log_max
    end
  end
  new_tbl
end
# Splits +txt+ into segments, cutting after a character whenever the score of
# the pair (character, next character) is below +prec+ or the character has no
# row in +tbl+. Spaces are never copied into a segment.
#
# Unlike the earlier version, the final character is included and the segment
# still being built at the end of the text is emitted instead of being
# silently dropped. A row that lacks an entry for the next character is
# treated as score 0 rather than raising on nil.
#
# @param tbl [Hash] nested score table: first char => { second char => score }
# @param txt [String] text to segment
# @param prec [Numeric] minimum score required to keep two characters joined
# @return [Array<String>] non-empty segments in text order
def segm(tbl, txt, prec)
  segs = []
  buf = []
  chars = txt.each_char.to_a
  chars.each_with_index do |c1, idx|
    buf << c1 unless c1 == ' '
    c2 = chars[idx + 1]
    row = tbl[c1]
    # c2 is nil only after the last character: always flush there.
    next unless c2.nil? || row.nil? || (row[c2] || 0) < prec

    segs << buf.join
    buf = []
  end
  segs.reject(&:empty?)
end
require_relative 'ngram'
require 'json'
# Driver: read tweets.txt, build normalized bigram scores from it, segment the
# same text with a 0.2 threshold, and write the segments longer than one
# character to ./out.json as a JSON array.
corpus = clean(File.read('tweets.txt'))
pair_counts = stat(bigram(corpus))
scores = normalize(corpus.length, maxfreq(pair_counts), pair_counts)
words = segm(scores, corpus, 0.2).select { |seg| seg.length > 1 }
File.write('./out.json', words.to_json)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment