Skip to content

Instantly share code, notes, and snippets.

@mooreniemi
Created October 6, 2019 22:36
Show Gist options
  • Save mooreniemi/01158d16d0a6bd64cdc94404e933395a to your computer and use it in GitHub Desktop.
Save mooreniemi/01158d16d0a6bd64cdc94404e933395a to your computer and use it in GitHub Desktop.
from heapq import nsmallest, nlargest, heappush, heapreplace
from random import randrange, seed
from copy import deepcopy
# Taken directly from https://trevorcohn.github.io/comp90042/slides/WSTA_L3_IR.pdf 26+
p_lists = { 'the': [2,3,7,8,9,10,11,12,13,17,18,19],
'quick': [5,6,11,14,18],
'brown': [2,4,5,15,42,84,96],
'fox': [5,7,8,13] }
all_doc_ids = set()
for _, doc_ids in p_lists.items():
[all_doc_ids.add(d) for d in doc_ids]
# setting k to this effectively turns wand off
total_doc_count = len(all_doc_ids)
# really this should be generated from the same mechanism as the scores
# if it was dependent properly, we could switch weights to 1 and see full or
# http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.365.2939&rep=rep1&type=pdf
max_c = {'the': 0.9, 'quick': 1.9, 'brown': 2.3, 'fox': 7.1}
seed(42) # this takes the place of a real similarity function
def rand_score(t, d):
score = round(randrange(1, max_c[t]*10)*0.1)
print('"%s" score in doc_id %i: %i' % (t, d, score))
return score
pre_scored = { 'the': {}, 'quick': {}, 'brown': {}, 'fox': {} }
for t, ds in p_lists.items():
for d in ds:
pre_scored[t][d] = rand_score(t, d)
# for double checking
# scored_docs = {'the': {2: 0, 3: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 1, 17: 0, 18: 0, 19: 0}, 'quick': {5: 1, 6: 1, 11: 2, 14: 0, 18: 2}, 'brown': {2: 1, 4: 2, 5: 2, 15: 1, 42: 1, 84: 2, 96: 2}, 'fox': {5: 4, 7: 0, 8: 2, 13: 6}}
# assert(pre_scored == scored_docs, 'consistent scores')
# using scores from a pre_scored structure makes inspection easier
# so this is the scoring function used below
def fixed_score(t, doc_id):
score = pre_scored[t][doc_id]
print('fixed_score for "%s" and doc_id %i is %f' % (t, doc_id, score))
return score
def wand(q, k):
terms = q.split(' ')
enabled = k < total_doc_count # makes log messages easier etc
times_scored, times_skipped = 0, 0 # for comparison
scores = [] # this will be turned into a heap below
to_be_scored = deepcopy(p_lists) # I find it easier to read mutated structures
while True:
assert len(scores) <= k, 'we never store more scores than k'
# if we've run out of postings for a t, remove it from to_be_scored
to_be_scored = {k:v for k, v in to_be_scored.items() if v}
if not to_be_scored:
print('ran out of postings')
break
print('still scoring: %r' % to_be_scored)
# we sort by the postings list doc_id, if it doesn't exist we set it to max
to_be_scored = dict(sorted(to_be_scored.items(),
key=lambda kv: kv[1][0] if kv[1] else float('inf')))
current_doc_id = to_be_scored[list(to_be_scored.keys())[0]][0]
docs = dict(filter(lambda kv: current_doc_id in kv[1], to_be_scored.items()))
active_terms = [t for t, ds in docs.items()] # for inspection
max_score = sum([max_c[t] for t, ps in docs.items()])
print('max_score of %f for %r' % (max_score, active_terms))
try:
# this will never change if k is not set, limiting the heap
current_lowest = nsmallest(1, scores, key=lambda x: x[0])[0][0]
except IndexError as e:
# I'm being a bit lazy here with the heap initialization...
print('should only see this once... (%r)' % e)
current_lowest = 0
should_score = max_score > current_lowest if enabled else True
if should_score:
if enabled:
message = 'scoring because max score: %f > current_lowest: %f'
else:
message = 'scoring because max score: %f ignored, current_lowest: %f'
print(message % (max_score, current_lowest))
times_scored += 1
score = 0
for t, ps in docs.items():
score += fixed_score(t, current_doc_id)
to_be_scored[t].remove(current_doc_id) # once scored, don't process again
print('summed score was %f, lowest score was %f' % (score, current_lowest))
if score > current_lowest:
print('found a new score: %f' % score)
if len(scores) < k:
heappush(scores, [score, current_doc_id])
else:
heapreplace(scores, [score, current_doc_id])
else:
print('WAND skipped because max_score: %f < current_lowest: %f' % (max_score, current_lowest))
times_skipped += 1
for t, ps in docs.items():
to_be_scored[t].remove(current_doc_id)
print('WAND enabled? %r' % enabled)
print('times scored: %i, times skipped: %i' % (times_scored, times_skipped))
print('final scores: %r' % scores)
# below will return highest score and highest document id, which is fine for this example
top_score = nlargest(1, scores)
return top_score
q = 'the quick brown fox'
no_wand = wand(q, k=total_doc_count)
with_wand = wand(q, k=2)
print('\n' + ('#'*32) + ' RESULTS')
print('with_wand %r == no_wand %r' % (with_wand, no_wand))
# if you want to verify maximum iterations we should've seen
# print('total_doc_count: %i (compare to times_scored)' % total_doc_count)
# if you want to verify scores
# print('pre_scored docs: %r' % pre_scored)
@mooreniemi
Copy link
Author

Here's the output:

$ python3 wand_example.py
"the" score in doc_id 2: 0
"the" score in doc_id 3: 0
"the" score in doc_id 7: 0
"the" score in doc_id 8: 0
"the" score in doc_id 9: 0
"the" score in doc_id 10: 0
"the" score in doc_id 11: 0
"the" score in doc_id 12: 0
"the" score in doc_id 13: 1
"the" score in doc_id 17: 0
"the" score in doc_id 18: 0
"the" score in doc_id 19: 0
"quick" score in doc_id 5: 1
"quick" score in doc_id 6: 1
"quick" score in doc_id 11: 2
"quick" score in doc_id 14: 0
"quick" score in doc_id 18: 2
"brown" score in doc_id 2: 1
"brown" score in doc_id 4: 2
"brown" score in doc_id 5: 2
"brown" score in doc_id 15: 1
"brown" score in doc_id 42: 1
"brown" score in doc_id 84: 2
"brown" score in doc_id 96: 2
"fox" score in doc_id 5: 4
"fox" score in doc_id 7: 0
"fox" score in doc_id 8: 2
"fox" score in doc_id 13: 6
still scoring: {'the': [2, 3, 7, 8, 9, 10, 11, 12, 13, 17, 18, 19], 'quick': [5, 6, 11, 14, 18], 'brown': [2, 4, 5, 15, 42, 84, 96], 'fox': [5, 7, 8, 13]}
max_score of 3.200000 for ['the', 'brown']
should only see this once... (IndexError('list index out of range'))
scoring because max score: 3.200000 ignored, current_lowest: 0.000000
fixed_score for "the" and doc_id 2 is 0.000000
fixed_score for "brown" and doc_id 2 is 1.000000
summed score was 1.000000, lowest score was 0.000000
found a new score: 1.000000
still scoring: {'the': [3, 7, 8, 9, 10, 11, 12, 13, 17, 18, 19], 'brown': [4, 5, 15, 42, 84, 96], 'quick': [5, 6, 11, 14, 18], 'fox': [5, 7, 8, 13]}
max_score of 0.900000 for ['the']
scoring because max score: 0.900000 ignored, current_lowest: 1.000000
fixed_score for "the" and doc_id 3 is 0.000000
summed score was 0.000000, lowest score was 1.000000
still scoring: {'the': [7, 8, 9, 10, 11, 12, 13, 17, 18, 19], 'brown': [4, 5, 15, 42, 84, 96], 'quick': [5, 6, 11, 14, 18], 'fox': [5, 7, 8, 13]}
max_score of 2.300000 for ['brown']
scoring because max score: 2.300000 ignored, current_lowest: 1.000000
fixed_score for "brown" and doc_id 4 is 2.000000
summed score was 2.000000, lowest score was 1.000000
found a new score: 2.000000
still scoring: {'brown': [5, 15, 42, 84, 96], 'quick': [5, 6, 11, 14, 18], 'fox': [5, 7, 8, 13], 'the': [7, 8, 9, 10, 11, 12, 13, 17, 18, 19]}
max_score of 11.300000 for ['brown', 'quick', 'fox']
scoring because max score: 11.300000 ignored, current_lowest: 1.000000
fixed_score for "brown" and doc_id 5 is 2.000000
fixed_score for "quick" and doc_id 5 is 1.000000
fixed_score for "fox" and doc_id 5 is 4.000000
summed score was 7.000000, lowest score was 1.000000
found a new score: 7.000000
still scoring: {'brown': [15, 42, 84, 96], 'quick': [6, 11, 14, 18], 'fox': [7, 8, 13], 'the': [7, 8, 9, 10, 11, 12, 13, 17, 18, 19]}
max_score of 1.900000 for ['quick']
scoring because max score: 1.900000 ignored, current_lowest: 1.000000
fixed_score for "quick" and doc_id 6 is 1.000000
summed score was 1.000000, lowest score was 1.000000
still scoring: {'quick': [11, 14, 18], 'fox': [7, 8, 13], 'the': [7, 8, 9, 10, 11, 12, 13, 17, 18, 19], 'brown': [15, 42, 84, 96]}
max_score of 8.000000 for ['fox', 'the']
scoring because max score: 8.000000 ignored, current_lowest: 1.000000
fixed_score for "fox" and doc_id 7 is 0.000000
fixed_score for "the" and doc_id 7 is 0.000000
summed score was 0.000000, lowest score was 1.000000
still scoring: {'fox': [8, 13], 'the': [8, 9, 10, 11, 12, 13, 17, 18, 19], 'quick': [11, 14, 18], 'brown': [15, 42, 84, 96]}
max_score of 8.000000 for ['fox', 'the']
scoring because max score: 8.000000 ignored, current_lowest: 1.000000
fixed_score for "fox" and doc_id 8 is 2.000000
fixed_score for "the" and doc_id 8 is 0.000000
summed score was 2.000000, lowest score was 1.000000
found a new score: 2.000000
still scoring: {'fox': [13], 'the': [9, 10, 11, 12, 13, 17, 18, 19], 'quick': [11, 14, 18], 'brown': [15, 42, 84, 96]}
max_score of 0.900000 for ['the']
scoring because max score: 0.900000 ignored, current_lowest: 1.000000
fixed_score for "the" and doc_id 9 is 0.000000
summed score was 0.000000, lowest score was 1.000000
still scoring: {'the': [10, 11, 12, 13, 17, 18, 19], 'quick': [11, 14, 18], 'fox': [13], 'brown': [15, 42, 84, 96]}
max_score of 0.900000 for ['the']
scoring because max score: 0.900000 ignored, current_lowest: 1.000000
fixed_score for "the" and doc_id 10 is 0.000000
summed score was 0.000000, lowest score was 1.000000
still scoring: {'the': [11, 12, 13, 17, 18, 19], 'quick': [11, 14, 18], 'fox': [13], 'brown': [15, 42, 84, 96]}
max_score of 2.800000 for ['the', 'quick']
scoring because max score: 2.800000 ignored, current_lowest: 1.000000
fixed_score for "the" and doc_id 11 is 0.000000
fixed_score for "quick" and doc_id 11 is 2.000000
summed score was 2.000000, lowest score was 1.000000
found a new score: 2.000000
still scoring: {'the': [12, 13, 17, 18, 19], 'quick': [14, 18], 'fox': [13], 'brown': [15, 42, 84, 96]}
max_score of 0.900000 for ['the']
scoring because max score: 0.900000 ignored, current_lowest: 1.000000
fixed_score for "the" and doc_id 12 is 0.000000
summed score was 0.000000, lowest score was 1.000000
still scoring: {'the': [13, 17, 18, 19], 'fox': [13], 'quick': [14, 18], 'brown': [15, 42, 84, 96]}
max_score of 8.000000 for ['the', 'fox']
scoring because max score: 8.000000 ignored, current_lowest: 1.000000
fixed_score for "the" and doc_id 13 is 1.000000
fixed_score for "fox" and doc_id 13 is 6.000000
summed score was 7.000000, lowest score was 1.000000
found a new score: 7.000000
still scoring: {'the': [17, 18, 19], 'quick': [14, 18], 'brown': [15, 42, 84, 96]}
max_score of 1.900000 for ['quick']
scoring because max score: 1.900000 ignored, current_lowest: 1.000000
fixed_score for "quick" and doc_id 14 is 0.000000
summed score was 0.000000, lowest score was 1.000000
still scoring: {'quick': [18], 'brown': [15, 42, 84, 96], 'the': [17, 18, 19]}
max_score of 2.300000 for ['brown']
scoring because max score: 2.300000 ignored, current_lowest: 1.000000
fixed_score for "brown" and doc_id 15 is 1.000000
summed score was 1.000000, lowest score was 1.000000
still scoring: {'brown': [42, 84, 96], 'the': [17, 18, 19], 'quick': [18]}
max_score of 0.900000 for ['the']
scoring because max score: 0.900000 ignored, current_lowest: 1.000000
fixed_score for "the" and doc_id 17 is 0.000000
summed score was 0.000000, lowest score was 1.000000
still scoring: {'the': [18, 19], 'quick': [18], 'brown': [42, 84, 96]}
max_score of 2.800000 for ['the', 'quick']
scoring because max score: 2.800000 ignored, current_lowest: 1.000000
fixed_score for "the" and doc_id 18 is 0.000000
fixed_score for "quick" and doc_id 18 is 2.000000
summed score was 2.000000, lowest score was 1.000000
found a new score: 2.000000
still scoring: {'the': [19], 'brown': [42, 84, 96]}
max_score of 0.900000 for ['the']
scoring because max score: 0.900000 ignored, current_lowest: 1.000000
fixed_score for "the" and doc_id 19 is 0.000000
summed score was 0.000000, lowest score was 1.000000
still scoring: {'brown': [42, 84, 96]}
max_score of 2.300000 for ['brown']
scoring because max score: 2.300000 ignored, current_lowest: 1.000000
fixed_score for "brown" and doc_id 42 is 1.000000
summed score was 1.000000, lowest score was 1.000000
still scoring: {'brown': [84, 96]}
max_score of 2.300000 for ['brown']
scoring because max score: 2.300000 ignored, current_lowest: 1.000000
fixed_score for "brown" and doc_id 84 is 2.000000
summed score was 2.000000, lowest score was 1.000000
found a new score: 2.000000
still scoring: {'brown': [96]}
max_score of 2.300000 for ['brown']
scoring because max score: 2.300000 ignored, current_lowest: 1.000000
fixed_score for "brown" and doc_id 96 is 2.000000
summed score was 2.000000, lowest score was 1.000000
found a new score: 2.000000
ran out of postings
WAND enabled? False
times scored: 20, times skipped: 0
final scores: [[1, 2], [2, 4], [2, 18], [2, 8], [2, 11], [7, 13], [7, 5], [2, 84], [2, 96]]
still scoring: {'the': [2, 3, 7, 8, 9, 10, 11, 12, 13, 17, 18, 19], 'quick': [5, 6, 11, 14, 18], 'brown': [2, 4, 5, 15, 42, 84, 96], 'fox': [5, 7, 8, 13]}
max_score of 3.200000 for ['the', 'brown']
should only see this once... (IndexError('list index out of range'))
scoring because max score: 3.200000 > current_lowest: 0.000000
fixed_score for "the" and doc_id 2 is 0.000000
fixed_score for "brown" and doc_id 2 is 1.000000
summed score was 1.000000, lowest score was 0.000000
found a new score: 1.000000
still scoring: {'the': [3, 7, 8, 9, 10, 11, 12, 13, 17, 18, 19], 'brown': [4, 5, 15, 42, 84, 96], 'quick': [5, 6, 11, 14, 18], 'fox': [5, 7, 8, 13]}
max_score of 0.900000 for ['the']
WAND skipped because max_score: 0.900000 < current_lowest: 1.000000
still scoring: {'the': [7, 8, 9, 10, 11, 12, 13, 17, 18, 19], 'brown': [4, 5, 15, 42, 84, 96], 'quick': [5, 6, 11, 14, 18], 'fox': [5, 7, 8, 13]}
max_score of 2.300000 for ['brown']
scoring because max score: 2.300000 > current_lowest: 1.000000
fixed_score for "brown" and doc_id 4 is 2.000000
summed score was 2.000000, lowest score was 1.000000
found a new score: 2.000000
still scoring: {'brown': [5, 15, 42, 84, 96], 'quick': [5, 6, 11, 14, 18], 'fox': [5, 7, 8, 13], 'the': [7, 8, 9, 10, 11, 12, 13, 17, 18, 19]}
max_score of 11.300000 for ['brown', 'quick', 'fox']
scoring because max score: 11.300000 > current_lowest: 1.000000
fixed_score for "brown" and doc_id 5 is 2.000000
fixed_score for "quick" and doc_id 5 is 1.000000
fixed_score for "fox" and doc_id 5 is 4.000000
summed score was 7.000000, lowest score was 1.000000
found a new score: 7.000000
still scoring: {'brown': [15, 42, 84, 96], 'quick': [6, 11, 14, 18], 'fox': [7, 8, 13], 'the': [7, 8, 9, 10, 11, 12, 13, 17, 18, 19]}
max_score of 1.900000 for ['quick']
WAND skipped because max_score: 1.900000 < current_lowest: 2.000000
still scoring: {'quick': [11, 14, 18], 'fox': [7, 8, 13], 'the': [7, 8, 9, 10, 11, 12, 13, 17, 18, 19], 'brown': [15, 42, 84, 96]}
max_score of 8.000000 for ['fox', 'the']
scoring because max score: 8.000000 > current_lowest: 2.000000
fixed_score for "fox" and doc_id 7 is 0.000000
fixed_score for "the" and doc_id 7 is 0.000000
summed score was 0.000000, lowest score was 2.000000
still scoring: {'fox': [8, 13], 'the': [8, 9, 10, 11, 12, 13, 17, 18, 19], 'quick': [11, 14, 18], 'brown': [15, 42, 84, 96]}
max_score of 8.000000 for ['fox', 'the']
scoring because max score: 8.000000 > current_lowest: 2.000000
fixed_score for "fox" and doc_id 8 is 2.000000
fixed_score for "the" and doc_id 8 is 0.000000
summed score was 2.000000, lowest score was 2.000000
still scoring: {'fox': [13], 'the': [9, 10, 11, 12, 13, 17, 18, 19], 'quick': [11, 14, 18], 'brown': [15, 42, 84, 96]}
max_score of 0.900000 for ['the']
WAND skipped because max_score: 0.900000 < current_lowest: 2.000000
still scoring: {'the': [10, 11, 12, 13, 17, 18, 19], 'quick': [11, 14, 18], 'fox': [13], 'brown': [15, 42, 84, 96]}
max_score of 0.900000 for ['the']
WAND skipped because max_score: 0.900000 < current_lowest: 2.000000
still scoring: {'the': [11, 12, 13, 17, 18, 19], 'quick': [11, 14, 18], 'fox': [13], 'brown': [15, 42, 84, 96]}
max_score of 2.800000 for ['the', 'quick']
scoring because max score: 2.800000 > current_lowest: 2.000000
fixed_score for "the" and doc_id 11 is 0.000000
fixed_score for "quick" and doc_id 11 is 2.000000
summed score was 2.000000, lowest score was 2.000000
still scoring: {'the': [12, 13, 17, 18, 19], 'quick': [14, 18], 'fox': [13], 'brown': [15, 42, 84, 96]}
max_score of 0.900000 for ['the']
WAND skipped because max_score: 0.900000 < current_lowest: 2.000000
still scoring: {'the': [13, 17, 18, 19], 'fox': [13], 'quick': [14, 18], 'brown': [15, 42, 84, 96]}
max_score of 8.000000 for ['the', 'fox']
scoring because max score: 8.000000 > current_lowest: 2.000000
fixed_score for "the" and doc_id 13 is 1.000000
fixed_score for "fox" and doc_id 13 is 6.000000
summed score was 7.000000, lowest score was 2.000000
found a new score: 7.000000
still scoring: {'the': [17, 18, 19], 'quick': [14, 18], 'brown': [15, 42, 84, 96]}
max_score of 1.900000 for ['quick']
WAND skipped because max_score: 1.900000 < current_lowest: 7.000000
still scoring: {'quick': [18], 'brown': [15, 42, 84, 96], 'the': [17, 18, 19]}
max_score of 2.300000 for ['brown']
WAND skipped because max_score: 2.300000 < current_lowest: 7.000000
still scoring: {'brown': [42, 84, 96], 'the': [17, 18, 19], 'quick': [18]}
max_score of 0.900000 for ['the']
WAND skipped because max_score: 0.900000 < current_lowest: 7.000000
still scoring: {'the': [18, 19], 'quick': [18], 'brown': [42, 84, 96]}
max_score of 2.800000 for ['the', 'quick']
WAND skipped because max_score: 2.800000 < current_lowest: 7.000000
still scoring: {'the': [19], 'brown': [42, 84, 96]}
max_score of 0.900000 for ['the']
WAND skipped because max_score: 0.900000 < current_lowest: 7.000000
still scoring: {'brown': [42, 84, 96]}
max_score of 2.300000 for ['brown']
WAND skipped because max_score: 2.300000 < current_lowest: 7.000000
still scoring: {'brown': [84, 96]}
max_score of 2.300000 for ['brown']
WAND skipped because max_score: 2.300000 < current_lowest: 7.000000
still scoring: {'brown': [96]}
max_score of 2.300000 for ['brown']
WAND skipped because max_score: 2.300000 < current_lowest: 7.000000
ran out of postings
WAND enabled? True
times scored: 7, times skipped: 13
final scores: [[7, 5], [7, 13]]

################################ RESULTS
with_wand [[7, 13]] == no_wand [[7, 13]]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment