Skip to content

Instantly share code, notes, and snippets.

@brianmcgue
Last active May 14, 2020 01:13
Show Gist options
  • Save brianmcgue/3b46fac075dc14adc44184fdff133eac to your computer and use it in GitHub Desktop.
Save brianmcgue/3b46fac075dc14adc44184fdff133eac to your computer and use it in GitHub Desktop.
An attempt to determine how correlated the results are for the 5.x and 7.x schemas
require 'json'
require 'elastic-site-search' # gem install elastic-site-search
require 'pry'
# Compares how two search engines rank the results of the same query, using
# rank-correlation statistics (Kendall and Spearman).
#
# A result is identified across engines by the hash of its 'title' field, so two
# records that share a title are treated as the same result.
class RankingComparer
  API_ENDPOINT = 'http://localhost:3002/api/v1/'
  # NOTE(review): the credential is committed in source; presumably a throwaway key
  # for the local endpoint above -- confirm before pointing this at a shared server.
  API_KEY = 'uQTxqyzMRxYMXgnnxVU1'
  # Number of results requested per page; also used to turn a position on a page
  # into an overall rank.
  PAGE_SIZE = 40

  attr_reader :base_engine_name, :new_engine_name

  # @param base_engine_name [String] slug of the engine used as the reference ranking
  # @param new_engine_name [String] slug of the engine whose ranking is being evaluated
  # @raise [RuntimeError] if either slug is not returned by the client's engine list
  def initialize(base_engine_name, new_engine_name)
    @base_engine_name = base_engine_name
    @new_engine_name = new_engine_name
    validate_engines!
  end

  # Memoized API client. On first use this sets the module-level api_key and
  # endpoint on Elastic::SiteSearch before building the client.
  def client
    return @client if defined?(@client)

    Elastic::SiteSearch.api_key = API_KEY
    Elastic::SiteSearch.endpoint = API_ENDPOINT
    @client = Elastic::SiteSearch::Client.new
  end

  # Raises unless both configured engine slugs appear in the client's engine list.
  # The base engine is checked first.
  def validate_engines!
    available_engines = client.engines.map { |engine| engine['slug'] }
    [base_engine_name, new_engine_name].each do |engine_name|
      raise "#{engine_name} is not an available engine" unless available_engines.include?(engine_name)
    end
  end

  # Runs one page of a search and maps each result to its overall 1-based rank.
  #
  # @param engine_name [String] engine slug to query
  # @param query [String] search terms
  # @param page [Integer] 1-based page number
  # @return [Hash{Integer => Integer}] title hash => rank, where rank is the position
  #   on the page plus PAGE_SIZE * (page - 1). If two records on the same page share
  #   a title, the later one's rank is the one kept.
  def massaged_results(engine_name, query, page: 1)
    results = client.search(engine_name, query, page: page, per_page: PAGE_SIZE)
    page_offset = PAGE_SIZE * (page - 1)
    # NOTE(review): 'page' is presumably the document type name -- confirm.
    results.records['page'].each_with_index.each_with_object({}) do |(result, idx), memo|
      memo[result['title'].hash] = idx + 1 + page_offset
    end
  end

  # Orders two ranks for the Kendall comparison.
  #
  # A nil rank means the result was not found in the pages that were fetched, so
  # its true rank is worse (larger) than any rank we did see. It therefore
  # compares as greater than every present rank.
  #
  # @param result1 [Integer, nil] rank of the first result
  # @param result2 [Integer, nil] rank of the second result
  # @return [Integer] 1 if result1 ranks after result2, -1 otherwise, and 0 when
  #   both are nil (their relative order is unknown)
  def compare_kendall_order(result1, result2)
    return 0 if result1.nil? && result2.nil?
    return 1 if result1.nil?
    return -1 if result2.nil?

    result1 > result2 ? 1 : -1
  end

  # based on https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient
  #
  # Compares the first page of the base engine against the first two pages of the
  # new engine.
  #
  # @param query [String] search terms
  # @return [Float, String] (concordant - discordant) / number of pairs, or a
  #   message when there are too few results to compute it
  def kendall_rank_correlation(query)
    base_results = massaged_results(base_engine_name, query)
    new_results = massaged_results(new_engine_name, query)
    problem = insufficient_results_message(base_results, new_results, query)
    return problem if problem

    new_results = new_engine_ranks(new_results, query, last_page: 2)
    pairs = base_results.keys.combination(2)
    concordant_pairs = pairs.count do |title_hash1, title_hash2|
      base_order = compare_kendall_order(base_results[title_hash1], base_results[title_hash2])
      new_order = compare_kendall_order(new_results[title_hash1], new_results[title_hash2])
      # A zero order means both results of the pair are missing from the first two
      # pages of the new engine. Technically they could still be concordant if
      # they appear later in the same order, but since neither made the first two
      # pages we "dock" points and err on the side of worse correlation by
      # counting the pair as discordant. The other option is to not count it at all.
      !new_order.zero? && base_order == new_order
    end
    discordant_pairs = pairs.size - concordant_pairs
    (concordant_pairs - discordant_pairs) / pairs.size.to_f
  end

  # based on https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient
  #
  # Compares the first page of the base engine against the first three pages of
  # the new engine.
  #
  # @param query [String] search terms
  # @return [Float, String] 1 - 6 * sum(d^2) / (n * (n^2 - 1)), or a message when
  #   there are too few results or a base result is missing from the new engine
  def spearman_rank_correlation(query)
    base_results = massaged_results(base_engine_name, query)
    new_results = massaged_results(new_engine_name, query)
    problem = insufficient_results_message(base_results, new_results, query)
    return problem if problem

    new_results = new_engine_ranks(new_results, query, last_page: 3)
    _missing_hash, missing_rank = base_results.find { |title_hash, _rank| !new_results.key?(title_hash) }
    if missing_rank
      return "Result ##{missing_rank} in base engine is not ranked " \
             "in the top #{new_results.size} for new engine"
    end

    diff_square_sum = base_results.sum do |title_hash, base_rank|
      (base_rank - new_results[title_hash])**2
    end
    # previously, I was using the "worst_new_rank" to determine the denominator
    # because if a result from the first set isn't in the first page of the
    # second set, then just using the first-page result count wouldn't accurately
    # reflect the number of items that we were looking at. Technically, however,
    # neither does "worst_new_rank"... so n is the number of base results (at
    # most PAGE_SIZE): it makes the denominator smaller and thus the fraction
    # larger, so we'll err on the side of caution.
    result_count = base_results.size
    denominator = result_count * ((result_count**2) - 1)
    1 - ((6 * diff_square_sum) / denominator.to_f)
  end

  private

  # Returns a message when a correlation cannot be computed from the first pages,
  # or nil when it can. A single base result yields no pairs (Kendall) and a zero
  # denominator (Spearman), which would otherwise produce NaN.
  def insufficient_results_message(base_results, new_results, query)
    return "No results in the base engine for #{query.inspect}" if base_results.empty?
    return "No results in the new engine for #{query.inspect}" if new_results.empty?
    return nil if base_results.size > 1

    "Only one result in the base engine for #{query.inspect}; rank correlation is undefined"
  end

  # Extends the new engine's first-page ranks with pages 2..last_page. When a
  # title shows up on more than one page, the rank from the earliest page wins.
  def new_engine_ranks(first_page_ranks, query, last_page:)
    (2..last_page).reduce(first_page_ranks) do |ranks, page|
      later_ranks = massaged_results(new_engine_name, query, page: page)
      ranks.merge(later_ranks) { |_title_hash, earlier_rank, _later_rank| earlier_rank }
    end
  end
end
# ranking_comparer = RankingComparer.new('satisfy090', 'satisfy-running')
ranking_comparer = RankingComparer.new('satisfy-running', 'satisfy7')

# Queries to evaluate, grouped under the label printed above each group.
query_groups = {
  'single word:' => %w[shorts singlet shirt cotton jacket hat technology],
  'more descriptive:' => ['long trail shorts', 'moth eaten shirt', 'short distance 8']
}

# Section heading => comparer method that produces the score for one query.
metrics = {
  'Kendall:' => :kendall_rank_correlation,
  'Spearman:' => :spearman_rank_correlation
}

metrics.each_with_index do |(heading, metric), position|
  # blank line between sections, but not before the first one
  puts unless position.zero?
  puts heading
  query_groups.each do |label, queries|
    puts label
    queries.each do |query|
      puts "#{ranking_comparer.public_send(metric, query)} '#{query}'"
    end
  end
end
### OUTPUT 5.x => 7.x
# Kendall:
# single word:
# 0.839572192513369 'shorts'
# 0.9563025210084034 'singlet'
# 0.7411095305832148 'shirt'
# 0.4789915966386555 'cotton'
# 0.3 'jacket'
# 0.26666666666666666 'hat'
# 0.6096096096096096 'technology'
# more descriptive:
# 0.5334281650071123 'long trail shorts'
# 0.5585585585585585 'moth eaten shirt'
# 0.7927927927927928 'short distance 8'
#
# Spearman:
# single word:
# 0.9413292589763178 'shorts'
# 0.9911764705882353 'singlet'
# 0.7794069373016741 'shirt'
# 0.3680672268907563 'cotton'
# 0.37570356472795496 'jacket'
# 0.6183864915572233 'hat'
# 0.7051920341394026 'technology'
# more descriptive:
# 0.6456942772732246 'long trail shorts'
# 0.7178757705073495 'moth eaten shirt'
# 0.8307254623044097 'short distance 8'
### OUTPUT 0.90 => 5.x
# Kendall:
# single word:
# 0.36541889483065954 'shorts'
# 0.8050420168067227 'singlet'
# 0.8122332859174964 'shirt'
# 0.6893939393939394 'cotton'
# 0.5294117647058824 'jacket'
# 0.9738562091503268 'hat'
# 0.7207207207207207 'technology'
# more descriptive:
# 0.2857142857142857 'long trail shorts'
# -0.2132132132132132 'moth eaten shirt'
# 0.18618618618618618 'short distance 8'
#
# Spearman:
# single word:
# 0.2551566080977846 'shorts'
# 0.8061624649859944 'singlet'
# 0.8582995951417004 'shirt'
# 0.6649398395721925 'cotton'
# 0.826625386996904 'jacket'
# 0.9958720330237358 'hat'
# 0.8518255097202465 'technology'
# more descriptive:
# Result #23 in base engine is not ranked in the top 113 for new engine 'long trail shorts'
# Result #37 in base engine is not ranked in the top 57 for new engine 'moth eaten shirt'
# 0.02987197724039825 'short distance 8'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment