Skip to content

Instantly share code, notes, and snippets.

@shanna
Last active February 8, 2017 14:33
Show Gist options
  • Save shanna/dee6abbbed476f27b391a6945e382cb1 to your computer and use it in GitHub Desktop.
Save shanna/dee6abbbed476f27b391a6945e382cb1 to your computer and use it in GitHub Desktop.
Jaccard similarity coefficient and distance computed over character shingles (n-grams).
require 'shingles'
# Demo: print the Jaccard distance for a handful of word pairs.
compare = [
  %w[foo foo],
  %w[foo bar],
  %w[baz boz],
  %w[foofoo foobar]
]

compare.each do |left, right|
  distance = left.jaccard_distance(right)
  puts format('jaccard_distance: %s ~ %s == %f', left, right, distance)
end
require 'set'
#==== See
# * http://github.com/matpalm/resemblance
class String
  # Character n-gram ("shingle") helpers and Jaccard set measures for strings.
  #
  # Mixed into String below, so every string responds to #shingles,
  # #jaccard_similarity_coeff, #jaccard_distance and #invalidate_cache.
  module Shingles
    # Number of characters in each shingle.
    N_GRAM_LENGTH = 3

    # Returns the set of distinct, lower-cased substrings of N_GRAM_LENGTH
    # consecutive characters. Strings shorter than N_GRAM_LENGTH yield an
    # empty set.
    #
    # The result is memoized in @shingles on the receiver. It is NOT refreshed
    # when the string is mutated afterwards; call #invalidate_cache in that
    # case. Frozen strings cannot hold instance variables, so for them the set
    # is rebuilt on every call instead of raising FrozenError.
    #
    # @return [Set<String>]
    def shingles
      return build_shingles if frozen?

      @shingles ||= build_shingles
    end

    # Jaccard similarity coefficient: |A & B| / |A | B| over the two shingle
    # sets.
    #
    # @param b [#shingles] the string to compare against
    # @return [Float] value in 0.0..1.0; 1.0 when both shingle sets are empty
    #   (two empty sets are treated as identical rather than producing NaN)
    def jaccard_similarity_coeff(b)
      sa = shingles
      sb = b.shingles
      union_size = (sa | sb).size
      return 1.0 if union_size.zero?

      (sa & sb).size.to_f / union_size
    end

    # Jaccard distance: the share of shingles found in exactly one of the two
    # strings, i.e. (|A | B| - |A & B|) / |A | B|.
    #
    # @param b [#shingles] the string to compare against
    # @return [Float] value in 0.0..1.0; 0.0 when both shingle sets are empty
    #   (two empty sets are treated as identical rather than producing NaN)
    def jaccard_distance(b)
      sa = shingles
      sb = b.shingles
      union_size = (sa | sb).size
      return 0.0 if union_size.zero?

      (union_size - (sa & sb).size).to_f / union_size
    end

    # Drops the memoized shingle set so the next #shingles call recomputes it.
    # A frozen string never caches anything, so this is a no-op for it.
    #
    # @return [nil]
    def invalidate_cache
      @shingles = nil unless frozen?
    end

    private

    # Builds the shingle set from the lower-cased string. Windows are taken
    # over the lower-cased characters themselves, so the window count always
    # matches the text actually being sliced.
    def build_shingles
      downcase.each_char.each_cons(N_GRAM_LENGTH).map(&:join).to_set
    end
  end

  include Shingles
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment