Created
February 28, 2016 03:30
-
-
Save komasaru/41b0c93e264be75eabfa to your computer and use it in GitHub Desktop.
Ruby script to check a degree of string similarity by n-gram model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/ruby
# coding: utf-8
#********************************************************************
# Ruby script to check a degree of string similarity by n-gram model.
#********************************************************************
#
class String
  # Computes how similar this string is to +str+ using an n-gram model.
  #
  # Both strings are stripped of whitespace, split into overlapping
  # n-character substrings, and compared as sets: the result is the number
  # of distinct n-grams present in both strings divided by the number of
  # distinct n-grams present in either string.
  #
  # @param str [String] the string to compare against
  # @param n [Integer] the n-gram size (defaults to 3)
  # @return [Float] similarity ratio; 1.0 when both n-gram sets are equal,
  #   0.0 when they share no n-gram
  # @raise [RuntimeError] if either string, after whitespace removal,
  #   has fewer than +n+ characters
  def sim_ngram(str, n = 3)
    # Remove whitespace (spaces, newlines, carriage returns, form feeds,
    # horizontal tabs) before comparing.
    texts = [self, str].map { |s| s.gsub(/\s+/, "") }

    # Refuse strings too short to yield even a single n-gram; this also
    # guarantees the union below is never empty (no division by zero).
    raise "Length of a self string is shorter than N(=#{n})" if texts[0].size < n
    raise "Length of a target string is shorter than N(=#{n})" if texts[1].size < n

    # Slide a window of n characters over each string.
    own, other = texts.map { |s| s.each_char.each_cons(n).map(&:join) }

    # Array#& and Array#| both drop duplicates, so these are set sizes.
    shared = (own & other).size
    total = (own | other).size

    shared / total.to_f
  end
end
# Demo: print the similarity of two sample sentences for 1- to 4-grams.
str_1 = "良識はこの世のものでもっとも公平に配分されている"
str_2 = "良識はこの世でもっとも公平に分け与えられているものである"
puts "文1:#{str_1}"
puts "文2:#{str_2}"
# Passing 3 explicitly is the same as relying on sim_ngram's default n.
(1..4).each do |n|
  puts "類似度(#{n}-gram): #{str_1.sim_ngram(str_2, n)}"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment