Last active
March 1, 2024 00:58
-
-
Save hweller1/a2500d97ab8d4c964364e11cf54fb965 to your computer and use it in GitHub Desktop.
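Basic example showing how to fuse the results of a vector search and a full-text search, run with the same query against indexes built on the same collection. Note: the code below combines weighted search scores rather than ranks, so it is a weighted-score variant of reciprocal rank fusion rather than strict reciprocal rank fusion.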
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import pymongo

# NOTE(review): the key is exported before `import openai`, presumably so the
# library can pick it up from the environment on import -- keep this order.
os.environ["OPENAI_API_KEY"] = '<openai API key>'
import openai

### SETUP
# Fill in the placeholders below; `coll` is the collection that both search
# indexes (vector and full text) are built on.
connection_str = "<mongodb cluster conection str>"
client = pymongo.MongoClient(connection_str)
db = client['<name of db>']
coll = db['<name of collection>']
### CONFIGURATION PARAMETERS
# Weight applied to the vector-search score when the two scores are summed.
vector_scalar = 0.9
# Weight applied to the full-text-search score; the two weights sum to 1.
fts_scalar = 1 - vector_scalar
# Number of vector hits kept (via $limit) before joining with full-text results.
k = 10
# The kNN search asks for k * overrequest_factor candidates before the $limit.
overrequest_factor = 10
### QUERY
# The same query text drives both searches: it is embedded here for the vector
# search and passed verbatim to the full-text search.
query = "How does self-attention work?"
response = openai.Embedding.create(
    input=query,
    model="text-embedding-ada-002",
)
# Keep only the embedding of the first (and only) input.
embeddings = response.data[0].embedding
# Hybrid-search aggregation pipeline: run a vector (kNN) search, join each hit
# by _id with a full-text search over the same collection, then rank by the
# weighted sum of the two search scores.
vector_agg_with_lookup = [
    # 1. Vector search, over-requesting k * overrequest_factor candidates and
    #    restricting them to documents whose `doc_level` matches "sentence".
    {
        "$search": {
            "index": "<name of vector search index>",
            "knnBeta": {
                "path": "embedding",
                "vector": embeddings,
                "k": k * overrequest_factor,
                "filter": {"text": {"path": "doc_level", "query": "sentence"}},
            },
        }
    },
    # 2. Capture the vector-search score, then weight it.
    {"$addFields": {"vs_score": {"$meta": "searchScore"}}},
    {
        "$project": {
            "vs_score": {"$multiply": ["$vs_score", vector_scalar]},
            "_id": 1,
            "text": 1,
        }
    },
    # 3. Keep only the first k vector hits for the join.
    {"$limit": k},
    # 4. For each remaining hit, run the full-text search and keep the result
    #    with the same _id (if any), carrying its weighted score.
    {
        "$lookup": {
            # Join against the collection being aggregated; taking the name
            # from `coll` avoids a second placeholder that could drift.
            "from": coll.name,
            "localField": "_id",
            "foreignField": "_id",
            "as": "joined_results",
            "pipeline": [
                {
                    "$search": {
                        "index": "<name of full text search index built on same collection>",
                        "text": {
                            "query": query,
                            "path": "text",
                        },
                    }
                },
                {"$addFields": {"fts_score": {"$meta": "searchScore"}}},
                {
                    "$project": {
                        "fts_score": {"$multiply": ["$fts_score", fts_scalar]},
                        "_id": 1,
                        "text": 1,
                    }
                },
            ],
        }
    },
    # 5. Flatten the join. $unwind drops documents whose `joined_results` is
    #    empty, so vector hits with no full-text match are discarded here.
    {"$unwind": "$joined_results"},
    # 6. Collapse documents that share the same text, keeping the first
    #    score of each kind; the text itself becomes the group's _id.
    {
        "$group": {
            "_id": "$text",
            "vs_score": {"$first": "$vs_score"},
            "fts_score": {"$first": "$joined_results.fts_score"},
        }
    },
    # 7. Final score = weighted full-text score + weighted vector score.
    #    NOTE: after the $group there is no `text` field left to project;
    #    the text is available as _id.
    {
        "$project": {
            "text": 1,
            "score": {"$add": ["$fts_score", "$vs_score"]},
            "_id": 1,
            "vs_score": 1,
            "fts_score": 1,
        }
    },
    # 8. Best combined score first.
    {"$sort": {"score": -1}},
]
# Run the hybrid-search pipeline; `x` is the cursor over the fused results.
x = coll.aggregate(vector_agg_with_lookup)

# Drop into the debugger so the cursor can be explored interactively.
import pdb
pdb.set_trace()
# Call x.next() repeatedly to pull results one at a time (at most k = 10).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment