Skip to content

Instantly share code, notes, and snippets.

@seanmacavaney
Last active July 17, 2021 00:04
Show Gist options
  • Save seanmacavaney/7f22c5faac8acf25df4cf408135f31d4 to your computer and use it in GitHub Desktop.
"""Find groups of exact-duplicate passages in msmarco-passage-v2.

Hashes every passage's text with a truncated MD5, groups doc_ids by hash,
writes each group of 2+ ids to a TSV (one tab-separated group per line),
and finally verifies that no group is a false positive (hash collision).
"""
import hashlib

import ir_datasets

logger = ir_datasets.log.easy()

# Load the dataset handle once; it is reused below for the docs_store.
dataset = ir_datasets.load('msmarco-passage-v2')

# Map: 8-byte (64-bit) text hash -> list of doc_ids whose text hashed to it.
dochash2ids = {}
for doc in logger.pbar(dataset.docs):
    # Truncated MD5: pretty low chance (~0.05%) of any collision for a
    # 64-bit hash over 138M passages — and collisions are checked exactly
    # at the end, so a false positive cannot slip through silently.
    h = hashlib.md5(doc.text.encode()).digest()[:8]
    dochash2ids.setdefault(h, []).append(doc.doc_id)

# Emit every candidate duplicate group (2+ ids sharing a hash) to the TSV,
# keeping them in memory as well for the verification pass.
duplicate_sets = []
with open('msmarco_passage_v2_duplicates.tsv', 'wt') as f:
    for ids in dochash2ids.values():
        if len(ids) > 1:
            duplicate_sets.append(ids)
            f.write('\t'.join(ids) + '\n')

# Verify that, in fact, there were no hash collisions: every doc in a
# candidate group must have the exact same text.
docstore = dataset.docs_store()
for ids in duplicate_sets:
    assert len(set(doc.text for doc in docstore.get_many_iter(ids))) == 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment