Skip to content

Instantly share code, notes, and snippets.

@hweller1
Last active March 1, 2024 00:58
Show Gist options
  • Save hweller1/a2500d97ab8d4c964364e11cf54fb965 to your computer and use it in GitHub Desktop.
Save hweller1/a2500d97ab8d4c964364e11cf54fb965 to your computer and use it in GitHub Desktop.
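Basic example showing how to join the results of a vector search and a full-text search on indexes built from the same collection, using the same query. The fusion is in the spirit of reciprocal rank fusion, but note that the code combines the two result sets with a weighted sum of their search scores rather than with reciprocal ranks.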
import os
import pymongo
os.environ["OPENAI_API_KEY"] = '<openai API key>'
import openai
### SETUP
# Fill in the three placeholders below before running.
connection_str = "<mongodb cluster conection str>"
client = pymongo.MongoClient(connection_str)
# Resolve the database, then the collection that both search indexes are built on.
db = client.get_database('<name of db>')
coll = db.get_collection('<name of collection>')
### CONFIGURATION PARAMETERS
# Weight multiplied into each vector-search score (vs_score) in the pipeline.
vector_scalar = 0.9
# Weight multiplied into each full-text-search score (fts_score);
# defined as the complement of vector_scalar, so the two weights sum to 1.
fts_scalar = 1 - vector_scalar
# Number of vector-search hits kept (via $limit) before the full-text join.
k = 10
# The kNN search stage requests k * overrequest_factor candidates.
overrequest_factor = 10
### QUERY
query = "How does self-attention work?"
# Embed the query text and keep only the embedding vector of the first
# (and only) input; this vector is what the kNN search stage consumes.
# NOTE: openai.Embedding is the interface of the pre-1.0 openai SDK.
embeddings = openai.Embedding.create(
    model="text-embedding-ada-002",
    input=query,
).data[0].embedding
# Hybrid-search aggregation pipeline.
#
# 1. Run a kNN vector search and weight its score by vector_scalar.
# 2. Keep the top k hits and, for each, join the matching document from a
#    full-text search over the same collection (weighted by fts_scalar).
# 3. Sum the two weighted scores and sort by the combined score.
vector_agg_with_lookup = [
    # kNN search over the "embedding" field. More candidates than needed are
    # requested (k * overrequest_factor), restricted by the filter to
    # documents whose doc_level matches "sentence".
    {
        "$search": {
            "index": "<name of vector search index>",
            "knnBeta": {
                "path": "embedding",
                "vector": embeddings,
                "k": k * overrequest_factor,
                "filter": {"text": {"path": "doc_level", "query": "sentence"}},
            },
        }
    },
    # Expose the search score as a regular field so it can be used in expressions.
    {"$addFields": {"vs_score": {"$meta": "searchScore"}}},
    # Apply the vector weight and drop every field except _id and text.
    {
        "$project": {
            "vs_score": {"$multiply": ["$vs_score", vector_scalar]},
            "_id": 1,
            "text": 1,
        }
    },
    # Only the top-k vector hits take part in the join below, so the final
    # result set contains at most k documents.
    {"$limit": k},
    # Join each vector hit (by _id) with its own full-text search result.
    {
        "$lookup": {
            # Use the collection handle's name instead of repeating the
            # placeholder, so the join can never point at a different
            # collection than the one being aggregated.
            "from": coll.name,
            "localField": "_id",
            "foreignField": "_id",
            "as": "joined_results",
            "pipeline": [
                {
                    "$search": {
                        "index": "<name of full text search index built on same collection>",
                        "text": {
                            "query": query,
                            "path": "text",
                        },
                    }
                },
                {"$addFields": {"fts_score": {"$meta": "searchScore"}}},
                # Apply the full-text weight.
                {
                    "$project": {
                        "fts_score": {"$multiply": ["$fts_score", fts_scalar]},
                        "_id": 1,
                        "text": 1,
                    }
                },
            ],
        }
    },
    # NOTE: $unwind without preserveNullAndEmptyArrays drops vector hits that
    # have no full-text match, i.e. this behaves like an inner join.
    {"$unwind": "$joined_results"},
    # De-duplicate on the text itself. "text" is carried through explicitly
    # because after $group only _id and the accumulator fields exist, so the
    # "text": 1 in the following $project would otherwise select nothing.
    {
        "$group": {
            "_id": "$text",
            "text": {"$first": "$text"},
            "vs_score": {"$first": "$vs_score"},
            "fts_score": {"$first": "$joined_results.fts_score"},
        }
    },
    # Combined score = weighted full-text score + weighted vector score.
    {
        "$project": {
            "text": 1,
            "score": {"$add": ["$fts_score", "$vs_score"]},
            "_id": 1,
            "vs_score": 1,
            "fts_score": 1,
        }
    },
    # Best combined score first.
    {"$sort": {"score": -1}},
]
# Execute the pipeline; x is a cursor over the fused, score-sorted results.
x = coll.aggregate(vector_agg_with_lookup)
# Drop into the debugger so the results can be inspected interactively:
# call x.next() repeatedly to step through the (up to k = 10) results.
import pdb
pdb.set_trace()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment