Skip to content

Instantly share code, notes, and snippets.

@xeraa
Last active May 12, 2024 21:56
Show Gist options
  • Save xeraa/d114c0bee0335fdcfe120f25870e2199 to your computer and use it in GitHub Desktop.
Save xeraa/d114c0bee0335fdcfe120f25870e2199 to your computer and use it in GitHub Desktop.
Elasticsearch example: storing chunked documents as multiple vectors per document, with two different retrieval strategies
// Reset: remove any earlier copy of the index, then create it with the mapping below.
DELETE my-long-text-index
PUT my-long-text-index
{
  "mappings": {
    "properties": {
      "my_long_text_field": {
        "type": "nested", // nested so one document can hold several chunk vectors
        "properties": {
          "vector": {
            "type": "dense_vector" // embedding of the chunk; ranking is based on it
          },
          "text_chunk": {
            "type": "text" // source text that the vector was derived from
          }
        }
      }
    }
  }
}
// The searches in this example use the query vector [5,5].
// Doc 1: holds the single chunk closest to [5,5], namely [5,4].
PUT my-long-text-index/_doc/1
{
  "my_long_text_field": [
    { "vector": [5, 4], "text_chunk": "doc 1 chunk 1" },
    { "vector": [5, 1], "text_chunk": "doc 1 chunk 2" },
    { "vector": [5, 0], "text_chunk": "doc 1 chunk 3" }
  ]
}
// Doc 2: holds the second ([5,3]) and third ([5,2]) closest chunks to the query vector [5,5].
PUT my-long-text-index/_doc/2
{
  "my_long_text_field": [
    { "vector": [5, 3], "text_chunk": "doc 2 chunk 1" },
    { "vector": [5, 2], "text_chunk": "doc 2 chunk 2" },
    { "vector": [5, 0], "text_chunk": "doc 2 chunk 3" }
  ]
}
// Doc 3: no single chunk is the best match for [5,5], but taken together
// (aggregated) its chunks are the closest of the three documents.
PUT my-long-text-index/_doc/3
{
  "my_long_text_field": [
    { "vector": [5, 1.9], "text_chunk": "doc 3 chunk 1" },
    { "vector": [5, 1.8], "text_chunk": "doc 3 chunk 2" },
    { "vector": [5, 1.7], "text_chunk": "doc 3 chunk 3" }
  ]
}
// Strategy 1: fetch the 2 closest documents and show only the best chunk of each.
GET my-long-text-index/_search
{
  "_source": false,
  "size": 2, // return the 2 closest documents
  "knn": {
    "field": "my_long_text_field.vector",
    "query_vector": [5, 5],
    "inner_hits": {
      "_source": false,
      "size": 1, // only the best chunk per document
      "fields": ["my_long_text_field.text_chunk"]
    }
  }
}
// Strategy 2: fetch only the best document and show its 2 best chunks.
GET my-long-text-index/_search
{
  "_source": false,
  "size": 1, // return only the best document
  "knn": {
    "field": "my_long_text_field.vector",
    "query_vector": [5, 5],
    "inner_hits": {
      "_source": false,
      "size": 2, // the 2 best chunks of that document
      "fields": ["my_long_text_field.text_chunk"]
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment