Created
January 29, 2020 13:04
-
-
Save thomas-chauvet/0a8317296800ecd3a92cb926e260fe7d to your computer and use it in GitHub Desktop.
[Elasticsearch] function_score is applied as post-processing on the documents matched by the inner query, not at query (matching) time, which leads to confusing results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pprint
import time

from elasticsearch import Elasticsearch
# Demonstration script: shows that a ``function_score`` query only re-scores
# the documents selected by its inner query.  The vector script never adds
# documents to the result set, which can be surprising when text and vector
# criteria are combined.

ELASTICSEARCH_HOST = "localhost"
ELASTICSEARCH_PORT = "9200"
INDEX_NAME = "vector_index"
VECTOR_DIM = 3

# Painless script shared by every query below: cosine similarity between the
# query vector and the stored vector, scaled by 50 and shifted by 1.0.
VECTOR_SCORE_SCRIPT = (
    "50 * cosineSimilarity(params.query_vector, doc['my_vector']) + 1.0"
)

# Index definition: one dense_vector field and one keyword (exact-match) field.
INDEX_BODY = {
    "settings": {
        "index": {
            "number_of_shards": "1",
            "number_of_replicas": "1",
        },
    },
    "mappings": {
        "properties": {
            "my_vector": {
                "type": "dense_vector",
                "dims": VECTOR_DIM,
            },
            "my_text": {
                "type": "keyword",
            },
        },
    },
}

# Three documents with mutually orthogonal vectors; the list position is used
# as the document id.
DOCUMENTS = [
    {"my_text": "abc", "my_vector": [1, 0, 0]},
    {"my_text": "foo", "my_vector": [0, 1, 0]},
    {"my_text": "bar", "my_vector": [0, 0, 1]},
]


def build_function_score_query(inner_query, query_vector, size, explain=False):
    """Build a search body that re-scores ``inner_query`` hits by vector similarity.

    Args:
        inner_query: Elasticsearch query clause that selects the candidate
            documents (for example ``{"match_all": {}}``).
        query_vector: Vector passed to the script as ``params.query_vector``.
        size: Maximum number of hits to return.
        explain: When true, ``"explain": True`` is added to the body.

    Returns:
        A dict usable as the ``body`` of a search request.
    """
    body = {
        "size": size,
        "query": {
            "function_score": {
                "query": inner_query,
                "functions": [{
                    "script_score": {
                        "script": {
                            "source": VECTOR_SCORE_SCRIPT,
                            "params": {"query_vector": query_vector},
                        },
                    },
                }],
            },
        },
    }
    if explain:
        body["explain"] = True
    return body


def print_hits(results):
    """Print the number of hits, then rank, id, source and score of each hit.

    Args:
        results: Search response; only ``results["hits"]["hits"]`` is read.
    """
    hits = results["hits"]["hits"]
    print("Number of results:", len(hits))
    for rank, hit in enumerate(hits):
        print("Rank ", rank)
        print("Doc ID", hit["_id"])
        print("Doc:", hit["_source"])
        print("Doc score:", hit["_score"])
        print("-------")


def recreate_index(client):
    """Drop ``INDEX_NAME`` if it already exists, then create it from ``INDEX_BODY``."""
    # Keyword argument: newer client versions no longer accept it positionally.
    if client.indices.exists(index=INDEX_NAME):
        client.indices.delete(index=INDEX_NAME)
    client.indices.create(index=INDEX_NAME, body=INDEX_BODY)


def index_documents(client):
    """Index ``DOCUMENTS`` and make them visible to search."""
    for doc_id, doc in enumerate(DOCUMENTS):
        client.index(index=INDEX_NAME, body=doc, id=doc_id)
    # An explicit refresh makes the documents searchable right away, instead
    # of sleeping and hoping the periodic refresh has already happened.
    client.indices.refresh(index=INDEX_NAME)


def main():
    """Run the three example queries against a local Elasticsearch node."""
    ####################################
    #### Connection to elasticsearch ####
    ####################################
    client = Elasticsearch(
        host=ELASTICSEARCH_HOST,
        port=ELASTICSEARCH_PORT,
        request_timeout=60)

    ######################
    #### Create index ####
    ######################
    recreate_index(client)

    ################################
    #### Insert documents in ES ####
    ################################
    index_documents(client)

    #######################################
    #### Query vector with a match all ####
    #######################################
    print("#######################################")
    print("Query vector with a match all")
    query_body = build_function_score_query(
        {"match_all": {}}, [1, 0, 0], size=3)
    results = client.search(index=INDEX_NAME, body=query_body)
    print_hits(results)
    # Every document matches, so every document is scored by the script.
    # Number of results: 3
    # Rank 0
    # Doc ID 0
    # Doc: {'my_text': 'abc', 'my_vector': [1, 0, 0]}
    # Doc score: 51.0
    # -------
    # Rank 1
    # Doc ID 1
    # Doc: {'my_text': 'foo', 'my_vector': [0, 1, 0]}
    # Doc score: 1.0
    # -------
    # Rank 2
    # Doc ID 2
    # Doc: {'my_text': 'bar', 'my_vector': [0, 0, 1]}
    # Doc score: 1.0
    # -------

    ###############################
    #### Query vector AND text ####
    ###############################
    print("#######################################")
    print("Query vector AND text")
    query_body = build_function_score_query(
        {"match": {"my_text": {"query": "abc"}}},
        [0, 0, 1],
        size=3,
        explain=True)
    results = client.search(index=INDEX_NAME, body=query_body)
    print_hits(results)
    # Only the document whose text matches exactly is returned; the document
    # holding the closest vector ("bar") never becomes a candidate.
    # The returned score is the text score combined with the script score
    # (multiplied by default, per the Elasticsearch function_score docs).
    # Here the script yields 50 * 0 + 1.0 = 1.0 because the vectors are
    # orthogonal, so the final score equals the text score.
    # Number of results: 1
    # Rank 0
    # Doc ID 0
    # Doc: {'my_text': 'abc', 'my_vector': [1, 0, 0]}
    # Doc score: 0.9808292

    ###########################################################
    ##### Query vector AND text that doesnt exist in data #####
    ###########################################################
    print("#######################################")
    print("Query vector AND text that doesnt exist in data")
    query_body = build_function_score_query(
        {"match": {"my_text": {"query": "xxx"}}},
        [0, 0, 1],
        size=2,
        explain=True)
    results = client.search(index=INDEX_NAME, body=query_body)
    # No results, even though the query vector exists in the data: the inner
    # text query matched nothing, so there was nothing left to score.
    pprint.pprint(results)
    # {'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
    #  'hits': {'hits': [],
    #           'max_score': None,
    #           'total': {'relation': 'eq', 'value': 0}},
    #  'timed_out': False,
    #  'took': 0}


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment