Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thomas-chauvet/0a8317296800ecd3a92cb926e260fe7d to your computer and use it in GitHub Desktop.
Save thomas-chauvet/0a8317296800ecd3a92cb926e260fe7d to your computer and use it in GitHub Desktop.
[Elasticsearch] Function score is applied as post-processing, not at query time, which leads to confusing results
from elasticsearch import Elasticsearch
import pprint
import time
def _function_score_body(inner_query, query_vector, size, explain=False):
    """Build a search body scoring ``inner_query`` hits by cosine similarity.

    Args:
        inner_query: Query clause placed inside ``function_score``; it alone
            decides which documents are candidates for scoring.
        query_vector: Vector compared against each document's ``my_vector``.
        size: Maximum number of hits to request.
        explain: When true, add ``"explain": True`` to the request body.

    Returns:
        A dict usable as the ``body`` of ``Elasticsearch.search``.
    """
    body = {
        "size": size,
        "query": {
            "function_score": {
                "query": inner_query,
                "functions": [{
                    "script_score": {
                        "script": {
                            "source": "50 * cosineSimilarity(params.query_vector, doc['my_vector']) + 1.0",
                            "params": {"query_vector": query_vector},
                        },
                    },
                }],
            },
        },
    }
    if explain:
        body["explain"] = True
    return body


def _print_hits(results):
    """Print the hit count, then rank, id, source and score of every hit."""
    hits = results["hits"]["hits"]
    print("Number of results:", len(hits))
    for rank, hit in enumerate(hits):
        print("Rank ", rank)
        print("Doc ID", hit["_id"])
        print("Doc:", hit["_source"])
        print("Doc score:", hit["_score"])
        print("-------")


def main():
    """Show that ``function_score`` only re-scores hits of its inner query.

    Recreates a small index holding three documents (a keyword plus a
    3-dimensional dense vector each), then runs three searches and prints
    their results. Requires an Elasticsearch node on localhost:9200.
    """
    ELASTICSEARCH_HOST = "localhost"
    ELASTICSEARCH_PORT = "9200"
    index_name = "vector_index"

    #####################################
    #### Connection to Elasticsearch ####
    #####################################
    es = Elasticsearch(
        host=ELASTICSEARCH_HOST,
        port=ELASTICSEARCH_PORT,
        request_timeout=60)

    vector_dim = 3
    # Mapping
    index_body = {
        "settings": {
            "index": {
                "number_of_shards": "1",
                "number_of_replicas": "1"
            }
        },
        "mappings": {
            "properties": {
                "my_vector": {
                    "type": "dense_vector",
                    "dims": vector_dim
                },
                "my_text": {
                    "type": "keyword"
                }
            }
        }
    }

    ######################
    #### Create index ####
    ######################
    # Start from a clean index so results are reproducible between runs.
    if es.indices.exists(index=index_name):
        es.indices.delete(index=index_name)
    es.indices.create(index=index_name, body=index_body)

    ################################
    #### Insert documents in ES ####
    ################################
    documents = [
        {
            "my_text": "abc",
            "my_vector": [1, 0, 0]
        },
        {
            "my_text": "foo",
            "my_vector": [0, 1, 0]
        },
        {
            "my_text": "bar",
            "my_vector": [0, 0, 1]
        }
    ]
    for doc_id, doc in enumerate(documents):
        es.index(index=index_name, body=doc, id=doc_id)
    # Make the documents searchable right away; an explicit refresh is
    # deterministic, unlike sleeping and hoping a periodic refresh happened.
    es.indices.refresh(index=index_name)

    #######################################
    #### Query vector with a match all ####
    #######################################
    print("#######################################")
    print("Query vector with a match all")
    query_body = _function_score_body(
        {"match_all": {}}, query_vector=[1, 0, 0], size=3)
    results = es.search(index=index_name, body=query_body)
    _print_hits(results)
    # Number of results: 3
    # Rank 0
    # Doc ID 0
    # Doc: {'my_text': 'abc', 'my_vector': [1, 0, 0]}
    # Doc score: 51.0
    # -------
    # Rank 1
    # Doc ID 1
    # Doc: {'my_text': 'foo', 'my_vector': [0, 1, 0]}
    # Doc score: 1.0
    # -------
    # Rank 2
    # Doc ID 2
    # Doc: {'my_text': 'bar', 'my_vector': [0, 0, 1]}
    # Doc score: 1.0
    # -------

    ###############################
    #### Query vector AND text ####
    ###############################
    print("#######################################")
    print("Query vector AND text")
    query_body = _function_score_body(
        {"match": {"my_text": {"query": "abc"}}},
        query_vector=[0, 0, 1], size=3, explain=True)
    results = es.search(index=index_name, body=query_body)
    _print_hits(results)
    # Only the document whose text matches exactly is returned. The document
    # whose vector equals the query vector ("bar") cannot be returned at all,
    # because function_score only re-scores the hits of its inner query.
    # The score is based on the text only: the cosine similarity between
    # [0, 0, 1] and [1, 0, 0] is 0, so the script evaluates to 1.0
    # (presumably multiplied with the text score, the default boost mode).
    # Number of results: 1
    # Rank 0
    # Doc ID 0
    # Doc: {'my_text': 'abc', 'my_vector': [1, 0, 0]}
    # Doc score: 0.9808292

    ############################################################
    ##### Query vector AND text that doesn't exist in data #####
    ############################################################
    print("#######################################")
    print("Query vector AND text that doesnt exist in data")
    query_body = _function_score_body(
        {"match": {"my_text": {"query": "xxx"}}},
        query_vector=[0, 0, 1], size=2, explain=True)
    results = es.search(index=index_name, body=query_body)
    # No results, even though the query vector exists in the data!
    pprint.pprint(results)
    # {'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
    #  'hits': {'hits': [],
    #           'max_score': None,
    #           'total': {'relation': 'eq', 'value': 0}},
    #  'timed_out': False,
    #  'took': 0}


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment