Created
May 12, 2021 23:10
-
-
Save jmazanec15/7ca3ff79101db0a283964f7424a83f87 to your computer and use it in GitHub Desktop.
Reproduction steps for https://discuss.opendistrocommunity.dev/t/reindexing-produces-different-result-on-the-same-query-vector/5564/8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Repro attempt for https://discuss.opendistrocommunity.dev/t/reindexing-produces-different-result-on-the-same-query-vector/5564/8 | |
""" | |
import math
import random

from elasticsearch import Elasticsearch, RequestsHttpConnection, helpers
def get_es(host="localhost", port=9200):
    """Build the Elasticsearch client used by this script.

    Args:
        host: Hostname of the node to connect to. Defaults to "localhost".
        port: HTTP port of the node. Defaults to 9200.

    Returns:
        An ``Elasticsearch`` client configured with SSL and certificate
        verification turned off and ``RequestsHttpConnection`` as the
        connection class.
    """
    return Elasticsearch(
        hosts=[{'host': host, 'port': port}],
        use_ssl=False,
        verify_certs=False,
        connection_class=RequestsHttpConnection,
    )
def create_index(es, index_name, index_mapping, index_settings):
    """Drop ``index_name`` if present, then create it from scratch.

    Args:
        es: Client exposing ``indices.delete`` and ``indices.create``.
        index_name: Name of the index to (re)create.
        index_mapping: Value sent under the ``mappings`` key of the request.
        index_settings: Value sent under the ``settings`` key of the request.
    """
    print("Creating index {} and first deleting it if it exists...".format(index_name))
    # 400/404 are passed as ignorable statuses so that a missing index does
    # not abort the run before the create call.
    es.indices.delete(index=index_name, ignore=[400, 404])
    request_body = {
        'settings': index_settings,
        'mappings': index_mapping,
    }
    es.indices.create(index=index_name, body=request_body)
def index_docs(es, index_name, field_name, vectors, bulk_size=200):
    """Bulk-index ``vectors`` into ``index_name`` and refresh the index.

    Document ``i`` gets ``_id`` ``i`` and a single field ``field_name``
    holding ``vectors[i]``. Requests are flushed every ``bulk_size``
    documents, with one final request for any remainder.

    Args:
        es: Client exposing ``bulk`` and ``indices.refresh``.
        index_name: Target index.
        field_name: Name of the field that stores each vector.
        vectors: Sized sequence of vectors to index.
        bulk_size: Number of documents per bulk request. Defaults to 200.

    Raises:
        ValueError: If ``bulk_size`` is smaller than 1.
        RuntimeError: If a bulk response reports item-level errors.
    """
    if bulk_size < 1:
        raise ValueError("bulk_size must be at least 1, got {}".format(bulk_size))

    print("Indexing {} documents for index: {}...".format(len(vectors), index_name))
    bulk_data = []
    for i, doc in enumerate(vectors):
        # The bulk body alternates an action line with the document source.
        bulk_data.append({"index": {"_index": index_name, "_id": i}})
        bulk_data.append({field_name: doc})
        # Two entries per document, hence the factor of 2.
        if len(bulk_data) >= 2 * bulk_size:
            print("Doc count: {}".format(i + 1))
            _send_bulk(es, index_name, bulk_data)
            bulk_data = []
    if bulk_data:
        _send_bulk(es, index_name, bulk_data)
    es.indices.refresh(index=index_name)


def _send_bulk(es, index_name, bulk_data):
    """Send one bulk request and raise if the response reports failures."""
    response = es.bulk(index=index_name, body=bulk_data, request_timeout=30)
    # A bulk request can succeed at the HTTP level while individual items
    # fail; that is only visible through the "errors" flag in the response.
    if response and response.get("errors"):
        failures = [
            item["index"]
            for item in response.get("items", [])
            if "error" in item.get("index", {})
        ]
        raise RuntimeError(
            "Bulk indexing into {} failed for {} document(s); first failure: {}".format(
                index_name, len(failures), failures[0] if failures else "unknown"
            )
        )
def get_vectors(es, index_name, field_name, vector_count):
    """Fetch the stored vectors for document ids ``0 .. vector_count - 1``.

    Args:
        es: Client exposing ``get``.
        index_name: Index to read from.
        field_name: Field of ``_source`` that holds the vector.
        vector_count: Number of documents to fetch, starting at id 0.

    Returns:
        A list where position ``i`` is ``_source[field_name]`` of the
        document with id ``i``.
    """
    print("Getting vectors for index {}...".format(index_name))
    # One GET per id keeps the result ordered by id, which is what the
    # position-wise comparison later relies on.
    return [
        es.get(index=index_name, id=doc_id)['_source'][field_name]
        for doc_id in range(vector_count)
    ]
def reindex(es, index_name_1, index_name_2):
    """Copy every document from ``index_name_1`` into ``index_name_2``.

    Args:
        es: Client passed to ``helpers.reindex`` and used for the refresh.
        index_name_1: Source index.
        index_name_2: Target index.
    """
    print("Reindexing...")
    helpers.reindex(client=es, source_index=index_name_1, target_index=index_name_2)
    # Refresh the target after writing, matching what index_docs does for
    # the source index, so both indices are in the same state afterwards.
    es.indices.refresh(index=index_name_2)
def compare_vectors(vectors_1, vectors_2):
    """Print every component where the two vector lists disagree.

    Components are compared with ``math.isclose`` using its default
    tolerances. A differing number of vectors, or a differing length of
    two vectors at the same position, is reported instead of raising;
    only the overlapping part is compared component-wise.

    Args:
        vectors_1: First list of vectors.
        vectors_2: Second list of vectors.

    Returns:
        The number of discrepancies reported: one per differing component
        plus one per length mismatch.
    """
    print("Comparing vectors...")
    discrepancies = 0

    if len(vectors_1) != len(vectors_2):
        print("Vector count differs: {} vs {}\n".format(len(vectors_1), len(vectors_2)))
        discrepancies += 1

    for i, (vec_1, vec_2) in enumerate(zip(vectors_1, vectors_2)):
        if len(vec_1) != len(vec_2):
            print("Dimension of vector {} differs: {} vs {}\n".format(i, len(vec_1), len(vec_2)))
            discrepancies += 1
        for j, (val_1, val_2) in enumerate(zip(vec_1, vec_2)):
            if not math.isclose(val_1, val_2):
                print("vector_1[{}][{}] = {}\nvector_2[{}][{}] = {}\n".format(i, j, val_1, i, j,
                                                                                 val_2))
                discrepancies += 1
    return discrepancies
def main():
    """Run the repro: index random vectors, reindex them, and diff the results.

    Steps:
        1. Generate ``doc_count`` random vectors of dimension ``dim``.
        2. Create ``index_1`` with the k-NN settings and index the vectors.
        3. Read the vectors back from ``index_1``.
        4. Create ``index_2`` with the same mapping and settings and
           reindex ``index_1`` into it.
        5. Read the vectors back from ``index_2`` and print any component
           that differs from what ``index_1`` returned.
    """
    # Generate random vectors.
    # Uncomment the next line to get the same vectors on every run.
    # random.seed(0)
    doc_count = 1000
    dim = 512
    index_vectors = [[random.random() for _ in range(dim)] for _ in range(doc_count)]

    # Create first index
    es = get_es()
    index_name_1 = "index_1"
    field_name = "test_field"
    settings = {
        "index": {
            "knn": True,
            "knn.algo_param.m": 32,
            "knn.algo_param.ef_construction": 1024,
            "knn.algo_param.ef_search": 1024,
            "max_inner_result_window": 6,
        }
    }
    mapping = {
        "properties": {
            field_name: {
                "type": "knn_vector",
                "dimension": dim,
            }
        }
    }
    create_index(es, index_name_1, mapping, settings)

    # Index vectors
    index_docs(es, index_name_1, field_name, index_vectors)

    # Get all vectors
    vectors_1 = get_vectors(es, index_name_1, field_name, len(index_vectors))

    # Create second index with the identical mapping and settings
    index_name_2 = "index_2"
    create_index(es, index_name_2, mapping, settings)

    # Reindex
    reindex(es, index_name_1, index_name_2)

    # Get all vectors
    vectors_2 = get_vectors(es, index_name_2, field_name, len(index_vectors))

    # Compare vectors in 1 to vectors in 2
    compare_vectors(vectors_1, vectors_2)
# Run the repro only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment