Created
November 26, 2014 03:41
-
-
Save vijedi/200b741630b14aa09fa7 to your computer and use it in GitHub Desktop.
Algorithm for comparing sentences
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module ViJedi::SentenceUtil | |
SIMILARITY_PERCENTAGE = 0.4 | |
# boolean check to see if the source and prime sentences are the same | |
def self.similar(source, prime) | |
return compute_values(source, prime)[:mismatch_percentage] < SIMILARITY_PERCENTAGE | |
end | |
# returns the word difference between the source and prime | |
def self.word_distance(source, prime) | |
compute_values(source, prime)[:distance] | |
end | |
private | |
# returns an array representing the stemmed words from the string | |
def self.tokenize(sentence) | |
tokens = sentence.scan(/[\w']+/) | |
tokens.collect do |token| | |
# Stem each of the tokens using the Text gem | |
Text::PorterStemming.stem(token) | |
end | |
end | |
# returns a unicode string of the bytes representing the tokens | |
def self.encode(tokens, token_map) | |
bytes = tokens.collect do |token| | |
token_map[token] | |
end | |
bytes.pack('U*') | |
end | |
# returns a hash containing the distance and the percent of mismatch | |
def self.compute_values(source, prime) | |
source_tokens = (source.present?) ? tokenize(source.downcase) : [] | |
prime_tokens = (prime.present?) ? tokenize(prime.downcase) : [] | |
# assign a numerical value to each unique word | |
token_map = {} | |
(source_tokens + prime_tokens).each do |token| | |
token_map[token] ||= token_map.length | |
end | |
# just give up if the number of words is too large | |
if token_map.length > 255 | |
return { | |
distance: -1, | |
mismatch_percentage: 1 | |
} | |
end | |
encoded_source = encode(source_tokens, token_map) | |
encoded_prime = encode(prime_tokens, token_map) | |
distance = Text::Levenshtein.distance(encoded_source, encoded_prime) | |
# since the maximum Levenshtein distance is the length of the longest word, | |
# use the longest word are the base to compute the distance. | |
mismatch_percentage = distance.to_f / ([encoded_source.length, encoded_prime.length].max) | |
return { | |
distance: distance, | |
mismatch_percentage: mismatch_percentage | |
} | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment