# Can we infer relevance grades from just a difference in the mean NDCG of two samples?
import pandas as pd
import numpy as np
import random
import itertools
from statistics import NormalDist
# Submissions from the Kaggle VMware competition
# (leaderboard NDCG@5 for each submission file)
ndcgs = {
    'use_feedback_rrf_turnbull_submission_1653226391886872.csv': 0.16806,
    'pull_out_firstline_turnbull_1653253060074837.csv': 0.29668,
    'turnbull_submission_1652544680901428.csv': 0.20911,
    'with_best_compounds_at_50_plus_10_times_use_turnbull_165445182567455.csv': 0.32681,
    'with_best_compounds_at_5_only_phrase_search_turnbull_1654439995765457.csv': 0.31643,
    'rerank_slop_search_remaining_lines_max_snippet_at_5_turnbull_1654439885030507.csv': 0.31574,
    'simulated_labels_turnbull.csv': 0.26169,
    'simulated_labels_turnbull_2.csv': 0.28722,
    # Random noise we assume has NDCG=0
    'noise.csv': 0.0
}
# Ideal DCG@5 with weight 1 / log2(n + 1), where n is the 1-based position
ideal_dcg_at_5 = 2.948459119
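# A quick sanity check of the constant above (a sketch, assuming binary grades,
# so the ideal ranking puts a grade-1 doc at every one of the 5 positions):
def ideal_dcg_at(k=5):
    """Ideal DCG@k with position weights 1 / log2(n + 1); ~2.9485 for k=5."""
    return float(sum(1.0 / np.log2(n + 1) for n in range(1, k + 1)))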
# one unit of NDCG == ideal_dcg units of DCG
#
# total DCG change =
# NDCG * ideal_dcg * num_queries
#
# (Think of NDCG like sales scaled 0-1, where 1 is the ideal number of sales.)
#
# What if this were an A/B test instead of NDCG?
# - assume the ranking position is correlated with the A/B test outcome by some set of weights
# - we could somehow know those weights (i.e. the best item nets 5 sales in position 1, but only 3 in position 2)
# - then, given only the difference in total sales between two rankings, we could
#   reason backwards about which individual items drove it, just as we do here with DCG
#
# (note this -- imperfectly -- forces the NDCG from the source system into our DCG scaling)
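# A sketch of that conversion (it mirrors the computation in main() below): the mean
# NDCG@5 difference, scaled by the ideal DCG@5 and the number of queries, gives the
# total DCG@5 change we then try to reproduce in simulation.
def ndcg_diff_to_dcg_delta(mean_ndcg_diff, num_queries):
    """Total DCG@5 change implied by a mean NDCG@5 difference over num_queries queries."""
    return mean_ndcg_diff * ideal_dcg_at_5 * num_queries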
#
# Every moved search result has a DCG position weight before (w_before) and after (w_after) the change.
#
# For every moved search result with a relevance grade g, we can see the change in DCG as follows
#
# delta_dcg = (w_after - w_before) * g + ... (summed over every changed query/doc pair)
#
# Where we assume grade g is either 0 (irrelevant) or 1 (relevant)
#
# Perfect case: the only grade assignment that achieves the maximum is grade 1 for every
# doc that moved up (and grade 0 for every doc that moved down)
#
# max_delta_dcg = sum(w_after - w_before) for all w_after > w_before
#
# When we observe an actual delta_dcg, we can randomly assign grades 0 and 1 to each doc to see
# which assignments best approximate that delta_dcg
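# A tiny worked example of the delta_dcg bookkeeping above (hypothetical weights):
# a relevant doc (g=1) moving from position 3 (weight 0.5) to position 1 (weight 1.0)
# contributes (1.0 - 0.5) * 1 = 0.5, while an irrelevant doc (g=0) contributes nothing
# no matter how far it moves.
def example_delta_dcg():
    moves = [(1.0, 0.5, 1),   # (weight_after, weight_before, grade): relevant doc moved up
             (0.5, 1.0, 0)]   # irrelevant doc moved down
    return sum((w_after - w_before) * g for w_after, w_before, g in moves)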
def sign(x):
    return -1.0 if x < 0.0 else 1.0
def biased_random_sample(sample_size, prob_of_relevance=0.9):
    """Draw sample_size binary grades, each 1 with probability prob_of_relevance."""
    sample = np.random.random_sample(sample_size)
    biased_sample = sample.copy()
    biased_sample[sample < prob_of_relevance] = 1
    biased_sample[sample >= prob_of_relevance] = 0
    return biased_sample.astype(int)
def add_weights(results):
    """Add a 1-based position per query and the corresponding DCG weight."""
    results['position'] = results.groupby('QueryId').cumcount() + 1
    results['weight'] = 1 / np.log2(results['position'] + 1)
    return results
def create_results_diff(results_before, results_after):
    """Compute the DCG weight deltas between the before and after rankings,
    so we can compare them to the observed mean delta DCG."""
    results_before = add_weights(results_before)
    results_after = add_weights(results_after)
    results = results_before.merge(results_after,
                                   on=['QueryId', 'DocumentId'],
                                   how='outer')
    results = results.rename(
        columns={'position_x': 'position_before',
                 'position_y': 'position_after',
                 'weight_x': 'weight_before',
                 'weight_y': 'weight_after'}
    )
    # For each document, its DCG weight before and after
    results['weight_after'] = results['weight_after'].replace(np.nan, 0)
    results['weight_before'] = results['weight_before'].replace(np.nan, 0)
    results['weight_delta'] = results['weight_after'] - results['weight_before']
    results['position_delta'] = results['position_after'] - results['position_before']
    results['weight_delta_abs'] = np.abs(results['weight_delta'])
    return results
def simulate_at(results, prob_positive, actual_dcg_delta, rounds=10,
                normalize=True, verbose=False):
    """Simulate by giving each doc that moved up a grade of 1 with the provided probability."""
    best_universe_prob = 0.0
    best_prob_positive = prob_positive
    for i in range(0, rounds):
        # Assign the items with a positive weight delta (moved UP) a relevance of 1
        # with probability `prob_positive` (and conversely for negatives)
        rand_grades_positive = biased_random_sample(len(results[results['weight_delta'] > 0]),
                                                    prob_of_relevance=prob_positive)
        rand_grades_negative = biased_random_sample(len(results[results['weight_delta'] < 0]),
                                                    prob_of_relevance=1.0 - prob_positive)
        results['grade'] = 0
        results.loc[results['weight_delta'] > 0, 'grade'] = rand_grades_positive
        results.loc[results['weight_delta'] < 0, 'grade'] = rand_grades_negative
        # DCG delta of this simulated universe - how close is it to the observed DCG delta?
        simulated_dcg_delta = sum(results['grade'] * results['weight_delta'])
        # Measure how much the actual / simulated distributions overlap
        # to figure out how 'real' the simulated one is
        actual_universe_distribution = NormalDist(mu=actual_dcg_delta, sigma=2)
        simulated_universe_distribution = NormalDist(mu=simulated_dcg_delta, sigma=2)
        universe_prob = actual_universe_distribution.overlap(simulated_universe_distribution)
        # Increment alpha and beta in proportion to the probability of the universe being real
        results.loc[(results['grade'] == 1) & (results['weight_delta'] != 0), 'alpha'] += universe_prob
        results.loc[(results['grade'] == 0) & (results['weight_delta'] != 0), 'beta'] += universe_prob
        # Move the probability of drawing a positive (the prob that weights > 0 have relevance=1) in
        # a direction that makes the simulated relevance universe more likely
        delta = actual_dcg_delta - simulated_dcg_delta
        update = 0.01 * (1 - (universe_prob**0.0000001)) * sign(delta)
        prob_positive += update
        if universe_prob > best_universe_prob:
            best_universe_prob = universe_prob
            best_prob_positive = prob_positive
        if verbose:
            msg = f"Sim: {simulated_dcg_delta:.2f}, Act: {actual_dcg_delta:.2f}, Prob: {universe_prob:.3f} "
            msg += f"| Upd {update}, Draw {prob_positive:.3f} | Best {best_prob_positive:.3f} {best_universe_prob:.3f}"
            print(msg)
    if normalize:
        # Scale alpha and beta so they aren't overconfident simply from running more rounds;
        # scale to 1/10th of the number of rounds. 1/10th is an arbitrary number :)
        # Alternatively, we could check whether we're encountering highly similar universes and discount
        # by that similarity, but that would be complex and expensive
        results['alpha'] /= (rounds / 10)
        results['beta'] /= (rounds / 10)
    return results
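# A minimal end-to-end sketch on synthetic data (hypothetical QueryId/DocumentId
# values and an assumed observed DCG delta; the real pipeline in main() uses the
# Kaggle submission files listed in `ndcgs` above):
def demo_simulation():
    before = pd.DataFrame({'QueryId': [1, 1, 1], 'DocumentId': ['a', 'b', 'c']})
    after = pd.DataFrame({'QueryId': [1, 1, 1], 'DocumentId': ['b', 'a', 'c']})  # 'b' swapped up
    diff = create_results_diff(before, after)
    diff['alpha'] = 0.01   # same weak prior used in main()
    diff['beta'] = 0.02
    # Pretend we observed a small positive DCG delta from the swap
    return simulate_at(diff, prob_positive=0.5, actual_dcg_delta=0.2, rounds=100)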
def main():
    judgments = pd.DataFrame(columns=['QueryId', 'DocumentId', 'alpha', 'beta',
                                      'weight_delta', 'position_delta'])
    runs = 0
    num_simulations = 1000
    for results_before, results_after in itertools.combinations(ndcgs.keys(), 2):
        if results_before == results_after:
            continue
        print(results_before, results_after, num_simulations)
        mean_ndcg_diff = ndcgs[results_after] - ndcgs[results_before]
        results_before = pd.read_csv(results_before)
        results_after = pd.read_csv(results_after)
        results_diff = create_results_diff(results_before, results_after)
        # Translate our NDCG@5 to a DCG@5 to simplify the simulation
        actual_dcg_delta = len(results_diff['QueryId'].unique()) * mean_ndcg_diff * ideal_dcg_at_5
        # Very weak prior, mean of ~0.33
        if 'alpha' not in results_diff.columns:
            results_diff.loc[:, 'alpha'] = 0.01
            results_diff.loc[:, 'beta'] = 0.02
        prob_positive = random.uniform(0.24, 0.80)
        results = simulate_at(results_diff, prob_positive,
                              rounds=num_simulations,
                              actual_dcg_delta=actual_dcg_delta,
                              verbose=False)
        # Accumulate judgments from this pair into the evaluation
        results = results.groupby(['QueryId', 'DocumentId'])[['alpha', 'beta', 'weight_delta', 'position_delta']].sum()
        judgments = pd.concat([judgments, results])
        judgments = \
            judgments.groupby(['QueryId', 'DocumentId'])[['alpha', 'beta', 'weight_delta', 'position_delta']].sum()
        print(len(results), '->', len(judgments))
        runs += 1
    # Runs likely repeat information between them. How do we
    # account for their independence? (they are not entirely independent)
    # judgments['alpha'] /= math.log(runs)
    # judgments['beta'] /= math.log(runs)
    # Compute a grade from alpha and beta (the mean of a Beta(alpha, beta) distribution)
    judgments['grade'] = judgments['alpha'] / (judgments['alpha'] + judgments['beta'])
    # Join with corpus for debugging
    corpus = pd.read_csv('vmware_ir_content.csv.zip', compression='zip')
    queries = pd.read_csv('test.csv')
    judgments = judgments.reset_index()
    results = judgments.merge(corpus, right_on='f_name', left_on='DocumentId', how='left')
    results = results.merge(queries, on='QueryId', how='left')
    results.to_csv('simulated_results.csv', index=False)
if __name__ == "__main__":
    main()