[Elasticsearch] Function score acts as post-processing and is not applied at query time, leading to confusing results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pprint
import time

from elasticsearch import Elasticsearch
# Demo script: shows that a `function_score` query only re-scores the
# documents matched by its inner query. The vector similarity is therefore
# never able to bring back a document that the text query did not match.

ELASTICSEARCH_HOST = "localhost"
ELASTICSEARCH_PORT = "9200"
INDEX_NAME = "vector_index"
VECTOR_DIM = 3

# Painless script shared by every query below: 50 * cosine similarity + 1.0.
COSINE_SCRIPT = (
    "50 * cosineSimilarity(params.query_vector, doc['my_vector']) + 1.0"
)

# Index settings and mapping: one dense vector field and one keyword field.
INDEX_BODY = {
    "settings": {
        "index": {
            "number_of_shards": "1",
            "number_of_replicas": "1",
        },
    },
    "mappings": {
        "properties": {
            "my_vector": {
                "type": "dense_vector",
                "dims": VECTOR_DIM,
            },
            "my_text": {
                "type": "keyword",
            },
        },
    },
}

# Three documents with distinct texts and mutually orthogonal vectors.
DOCUMENTS = [
    {"my_text": "abc", "my_vector": [1, 0, 0]},
    {"my_text": "foo", "my_vector": [0, 1, 0]},
    {"my_text": "bar", "my_vector": [0, 0, 1]},
]


def build_function_score_query(inner_query, query_vector, size, explain=False):
    """Build a search body that wraps *inner_query* in a `function_score`.

    Args:
        inner_query: The query clause selecting the documents to score
            (e.g. ``{"match_all": {}}``).
        query_vector: Vector passed to the script as ``params.query_vector``.
        size: Maximum number of hits requested.
        explain: When true, add ``"explain": True`` to the body.

    Returns:
        A dict usable as the ``body`` of a search request.
    """
    body = {
        "size": size,
        "query": {
            "function_score": {
                "query": inner_query,
                "functions": [
                    {
                        "script_score": {
                            "script": {
                                "source": COSINE_SCRIPT,
                                "params": {"query_vector": query_vector},
                            },
                        },
                    },
                ],
            },
        },
    }
    if explain:
        body["explain"] = True
    return body


def format_hits(results):
    """Return the report lines describing the hits of a search response.

    Args:
        results: A search response containing ``results["hits"]["hits"]``,
            where each hit has ``_id``, ``_source`` and ``_score``.

    Returns:
        A list of strings: the hit count, then five lines per hit.
    """
    hits = results["hits"]["hits"]
    lines = [f"Number of results: {len(hits)}"]
    for rank, hit in enumerate(hits):
        lines.extend([
            # Two spaces on purpose: matches the original print("Rank ", i).
            f"Rank  {rank}",
            f"Doc ID {hit['_id']}",
            f"Doc: {hit['_source']}",
            f"Doc score: {hit['_score']}",
            "-------",
        ])
    return lines


def main():
    """Recreate the index, insert the documents and run the three queries."""
    ####################################
    #### Connection to elasticsearch ####
    ####################################
    es_client = Elasticsearch(
        host=ELASTICSEARCH_HOST,
        port=ELASTICSEARCH_PORT,
        request_timeout=60)

    ######################
    #### Create index ####
    ######################
    # Start from a clean index so that reruns give the same results.
    if es_client.indices.exists(index=INDEX_NAME):
        es_client.indices.delete(index=INDEX_NAME)
    es_client.indices.create(index=INDEX_NAME, body=INDEX_BODY)

    ################################
    #### Insert documents in ES ####
    ################################
    for doc_id, doc in enumerate(DOCUMENTS):
        es_client.index(index=INDEX_NAME, body=doc, id=doc_id)
    # Make the documents searchable right away instead of sleeping and
    # hoping that the periodic refresh has already happened.
    es_client.indices.refresh(index=INDEX_NAME)

    #######################################
    #### Query vector with a match all ####
    #######################################
    print("#######################################")
    print("Query vector with a match all")
    query_body = build_function_score_query(
        {"match_all": {}}, [1, 0, 0], size=3)
    results = es_client.search(index=INDEX_NAME, body=query_body)
    print("\n".join(format_hits(results)))
    # Number of results: 3
    # Rank 0
    # Doc ID 0
    # Doc: {'my_text': 'abc', 'my_vector': [1, 0, 0]}
    # Doc score: 51.0
    # -------
    # Rank 1
    # Doc ID 1
    # Doc: {'my_text': 'foo', 'my_vector': [0, 1, 0]}
    # Doc score: 1.0
    # -------
    # Rank 2
    # Doc ID 2
    # Doc: {'my_text': 'bar', 'my_vector': [0, 0, 1]}
    # Doc score: 1.0
    # -------

    ###############################
    #### Query vector AND text ####
    ###############################
    print("#######################################")
    print("Query vector AND text")
    query_body = build_function_score_query(
        {"match": {"my_text": {"query": "abc"}}},
        [0, 0, 1],
        size=3,
        explain=True)
    results = es_client.search(index=INDEX_NAME, body=query_body)
    print("\n".join(format_hits(results)))
    # Only the document whose text matches exactly is returned.
    # The document with the closest vector ("bar") cannot appear, because
    # the script only re-scores the hits of the inner text query.
    # The query vector is orthogonal to doc 0, so the script yields 1.0 and
    # the final score ends up equal to the text score (presumably through
    # the default "multiply" boost_mode -- see the function_score docs).
    # Number of results: 1
    # Rank 0
    # Doc ID 0
    # Doc: {'my_text': 'abc', 'my_vector': [1, 0, 0]}
    # Doc score: 0.9808292

    ###########################################################
    ##### Query vector AND text that doesnt exist in data #####
    ###########################################################
    print("#######################################")
    print("Query vector AND text that doesnt exist in data")
    query_body = build_function_score_query(
        {"match": {"my_text": {"query": "xxx"}}},
        [0, 0, 1],
        size=2,
        explain=True)
    results = es_client.search(index=INDEX_NAME, body=query_body)
    # No results, even though the vector exists in the data!
    pprint.pprint(results)
    # {'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
    # 'hits': {'hits': [],
    # 'max_score': None,
    # 'total': {'relation': 'eq', 'value': 0}},
    # 'timed_out': False,
    # 'took': 0}


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment