Skip to content

Instantly share code, notes, and snippets.

@jmazanec15
Created May 12, 2021 23:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jmazanec15/7ca3ff79101db0a283964f7424a83f87 to your computer and use it in GitHub Desktop.
Save jmazanec15/7ca3ff79101db0a283964f7424a83f87 to your computer and use it in GitHub Desktop.
"""
Repro attempt for https://discuss.opendistrocommunity.dev/t/reindexing-produces-different-result-on-the-same-query-vector/5564/8
"""
from elasticsearch import Elasticsearch, RequestsHttpConnection, helpers
import math
import random
def get_es():
    """Build an Elasticsearch client for a node at localhost:9200.

    SSL and certificate verification are both switched off, and requests go
    through ``RequestsHttpConnection``.

    Returns:
        The configured ``Elasticsearch`` client instance.
    """
    local_node = {'host': "localhost", 'port': 9200}
    client = Elasticsearch(
        hosts=[local_node],
        connection_class=RequestsHttpConnection,
        use_ssl=False,
        verify_certs=False,
    )
    return client
def create_index(es, index_name, index_mapping, index_settings):
    """Delete ``index_name`` (tolerating its absence) and create it afresh.

    Args:
        es: Elasticsearch client used for both calls.
        index_name: Name of the index to (re)create.
        index_mapping: Value sent under the ``mappings`` key of the request.
        index_settings: Value sent under the ``settings`` key of the request.
    """
    print("Creating index {} and first deleting it if it exists...".format(index_name))

    # Statuses 400 and 404 are passed as ``ignore`` so that deleting an index
    # that is not there yet does not abort the script.
    es.indices.delete(index=index_name, ignore=[400, 404])

    es.indices.create(
        index=index_name,
        body={
            'settings': index_settings,
            'mappings': index_mapping,
        },
    )
def _send_bulk(es, index_name, bulk_data):
    """Send one bulk request and raise if any action in it was rejected."""
    response = es.bulk(index=index_name, body=bulk_data, request_timeout=30)
    # The bulk API reports per-action failures inside the response body
    # ("errors": true) rather than failing the whole request, so the flag has
    # to be inspected explicitly or dropped documents go unnoticed.
    if response and response.get("errors"):
        failed = [
            action
            for item in response.get("items", [])
            for action in item.values()
            if "error" in action
        ]
        raise RuntimeError(
            "Bulk request to index {} reported {} failed action(s); first: {}".format(
                index_name, len(failed), failed[0] if failed else None
            )
        )


def index_docs(es, index_name, field_name, vectors, bulk_size=200):
    """Bulk-index ``vectors`` into ``index_name`` and refresh the index.

    Each vector is stored as a document ``{field_name: vector}`` whose ``_id``
    is the vector's position in ``vectors``.

    Args:
        es: Elasticsearch client.
        index_name: Target index.
        field_name: Document field that receives the vector.
        vectors: Sized iterable of vectors (``len()`` is used for the log line).
        bulk_size: Maximum number of documents per bulk request.

    Raises:
        RuntimeError: If a bulk response reports failed actions.
    """
    print("Indexing {} documents for index: {}...".format(len(vectors), index_name))
    bulk_data = []
    for i, doc in enumerate(vectors):
        # A bulk body is a flat sequence of (action, source) pairs, hence the
        # factor of two in the flush threshold below.
        bulk_data.append({"index": {"_index": index_name, "_id": i}})
        bulk_data.append({field_name: doc})
        if len(bulk_data) >= 2 * bulk_size:
            print("Doc count: {}".format(i + 1))
            _send_bulk(es, index_name, bulk_data)
            bulk_data = []
    # Flush whatever did not fill a complete batch.
    if bulk_data:
        _send_bulk(es, index_name, bulk_data)
    es.indices.refresh(index=index_name)
def get_vectors(es, index_name, field_name, vector_count):
    """Read ``field_name`` back from the documents with ids ``0 .. vector_count - 1``.

    Args:
        es: Elasticsearch client.
        index_name: Index to read from.
        field_name: Key looked up in each document's ``_source``.
        vector_count: Number of consecutive ids, starting at 0, to fetch.

    Returns:
        A list with one entry per id, in id order.
    """
    print("Getting vectors for index {}...".format(index_name))
    # One GET per document id; the stored value is taken from ``_source``.
    return [
        es.get(index=index_name, id=doc_id)['_source'][field_name]
        for doc_id in range(vector_count)
    ]
def reindex(es, index_name_1, index_name_2):
    """Reindex ``index_name_1`` into ``index_name_2`` through ``helpers.reindex``.

    Args:
        es: Elasticsearch client handed to the helper.
        index_name_1: Source index.
        index_name_2: Target index.
    """
    print("Reindexing...")
    copy_args = {
        "client": es,
        "source_index": index_name_1,
        "target_index": index_name_2,
    }
    helpers.reindex(**copy_args)
def compare_vectors(vectors_1, vectors_2):
    """Print every difference between two lists of vectors and count them.

    Components are compared with ``math.isclose`` using its default
    tolerances. A difference in the number of vectors, or in the length of a
    vector pair, is reported as one mismatch instead of raising ``IndexError``
    or being silently ignored; only the overlapping part is compared
    element-wise.

    Args:
        vectors_1: Reference vectors (sequence of sequences of numbers).
        vectors_2: Vectors to check against the reference.

    Returns:
        The number of mismatches reported (0 when the inputs agree).
    """
    print("Comparing vectors...")
    mismatches = 0

    if len(vectors_1) != len(vectors_2):
        print("Vector count differs: {} vs {}".format(len(vectors_1), len(vectors_2)))
        mismatches += 1

    for i, (vec_1, vec_2) in enumerate(zip(vectors_1, vectors_2)):
        if len(vec_1) != len(vec_2):
            print("Dimension of vector {} differs: {} vs {}".format(i, len(vec_1), len(vec_2)))
            mismatches += 1
        for j, (val_1, val_2) in enumerate(zip(vec_1, vec_2)):
            if not math.isclose(val_1, val_2):
                print("vector_1[{}][{}] = {}\nvector_2[{}][{}] = {}\n".format(i, j, val_1, i, j,
                                                                              val_2))
                mismatches += 1

    return mismatches
def main():
    """Index random vectors, reindex them into a second index, and diff the results.

    Steps: generate random vectors, write them to ``index_1``, read them
    back, reindex ``index_1`` into a freshly created ``index_2``, read those
    back too, and print any component that differs between the two reads.
    """
    vector_dim = 512
    num_docs = 1000

    # The generator is left unseeded, so every run uses a different set of
    # vectors; seed ``random`` before this point for a repeatable run.
    source_vectors = []
    for _ in range(num_docs):
        source_vectors.append([random.random() for _ in range(vector_dim)])

    client = get_es()
    source_index = "index_1"
    target_index = "index_2"
    vector_field = "test_field"

    # Both indices are created with the same k-NN settings and mapping.
    knn_settings = {
        "index": {
            "knn": True,
            "knn.algo_param.m": 32,
            "knn.algo_param.ef_construction": 1024,
            "knn.algo_param.ef_search": 1024,
            "max_inner_result_window": 6,
        }
    }
    knn_mapping = {
        "properties": {
            vector_field: {
                "type": "knn_vector",
                "dimension": vector_dim,
            }
        }
    }

    # First index: create, fill, and read everything back.
    create_index(client, source_index, knn_mapping, knn_settings)
    index_docs(client, source_index, vector_field, source_vectors)
    stored_before = get_vectors(client, source_index, vector_field, len(source_vectors))

    # Second index: create, populate via reindex, and read everything back.
    create_index(client, target_index, knn_mapping, knn_settings)
    reindex(client, source_index, target_index)
    stored_after = get_vectors(client, target_index, vector_field, len(source_vectors))

    # Report any component that changed across the reindex.
    compare_vectors(stored_before, stored_after)
# Run the repro only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment