vijedi/sentence_util.rb

## sentence_util.rb
module ViJedi::SentenceUtil
    SIMILARITY_PERCENTAGE = 0.4

    # boolean check to see if the source and prime sentences are the same
    def self.similar(source, prime)
        return compute_values(source, prime)[:mismatch_percentage] < SIMILARITY_PERCENTAGE
    end

    # returns the word difference between the source and prime
    def self.word_distance(source, prime)
        compute_values(source, prime)[:distance]
    end

    private

    # returns an array representing the stemmed words from the string
    def self.tokenize(sentence)
        tokens = sentence.scan(/[\w']+/)
        tokens.collect do |token|
            # Stem each of the tokens using the Text gem
            Text::PorterStemming.stem(token)
        end
    end

    # returns a unicode string of the bytes representing the tokens
    def self.encode(tokens, token_map)
        bytes = tokens.collect do |token|
            token_map[token]
        end

        bytes.pack('U*')
    end

    # returns a hash containing the distance and the percent of mismatch
    def self.compute_values(source, prime)
        source_tokens = (source.present?) ? tokenize(source.downcase) : []
        prime_tokens = (prime.present?) ? tokenize(prime.downcase) : []

        # assign a numerical value to each unique word
        token_map = {}
        (source_tokens + prime_tokens).each do |token|
            token_map[token] ||= token_map.length
        end

        # just give up if the number of words is too large
        if token_map.length > 255
            return {
                distance: -1,
                mismatch_percentage: 1
            }
        end

        encoded_source = encode(source_tokens, token_map)
        encoded_prime = encode(prime_tokens, token_map)

        distance = Text::Levenshtein.distance(encoded_source, encoded_prime)

        # since the maximum Levenshtein distance is the length of the longest word,
        # use the longest word are the base to compute the distance.
        mismatch_percentage = distance.to_f / ([encoded_source.length, encoded_prime.length].max)

        return {
            distance: distance,
            mismatch_percentage: mismatch_percentage
        }
    end
end
	module ViJedi::SentenceUtil
	SIMILARITY_PERCENTAGE = 0.4

	# boolean check to see if the source and prime sentences are the same
	def self.similar(source, prime)
	return compute_values(source, prime)[:mismatch_percentage] < SIMILARITY_PERCENTAGE
	end

	# returns the word difference between the source and prime
	def self.word_distance(source, prime)
	compute_values(source, prime)[:distance]
	end

	private

	# returns an array representing the stemmed words from the string
	def self.tokenize(sentence)
	tokens = sentence.scan(/[\w']+/)
	tokens.collect do \|token\|
	# Stem each of the tokens using the Text gem
	Text::PorterStemming.stem(token)
	end
	end

	# returns a unicode string of the bytes representing the tokens
	def self.encode(tokens, token_map)
	bytes = tokens.collect do \|token\|
	token_map[token]
	end

	bytes.pack('U*')
	end

	# returns a hash containing the distance and the percent of mismatch
	def self.compute_values(source, prime)
	source_tokens = (source.present?) ? tokenize(source.downcase) : []
	prime_tokens = (prime.present?) ? tokenize(prime.downcase) : []

	# assign a numerical value to each unique word
	token_map = {}
	(source_tokens + prime_tokens).each do \|token\|
	token_map[token] \|\|= token_map.length
	end

	# just give up if the number of words is too large
	if token_map.length > 255
	return {
	distance: -1,
	mismatch_percentage: 1
	}
	end

	encoded_source = encode(source_tokens, token_map)
	encoded_prime = encode(prime_tokens, token_map)

	distance = Text::Levenshtein.distance(encoded_source, encoded_prime)

	# since the maximum Levenshtein distance is the length of the longest word,
	# use the longest word are the base to compute the distance.
	mismatch_percentage = distance.to_f / ([encoded_source.length, encoded_prime.length].max)

	return {
	distance: distance,
	mismatch_percentage: mismatch_percentage
	}
	end
	end