# Source: GitHub gist bclavie/f7b041328615d52cf5c0a9caaf03fd5e
# Author: @bclavie -- created June 11, 2024 03:11
# Fetch some text content in two different categories
from wikipediaapi import Wikipedia

wiki = Wikipedia("RAGBot/0.0", "en")

# One record per blank-line-separated chunk of each page's text, tagged with
# the category of the page it came from. Pages are visited in the order
# listed, so all "person" records precede the "film" records.
docs = [
    {"text": paragraph, "category": category}
    for title, category in (
        ("Hayao_Miyazaki", "person"),
        ("Spirited_Away", "film"),
    )
    for paragraph in wiki.page(title).text.split("\n\n")
]
# Enter LanceDB
import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
# Initialise the embedding model: look up the sentence-transformers provider
# in LanceDB's embedding registry, then instantiate the BGE-small checkpoint.
model_registry = get_registry().get("sentence-transformers")
model = model_registry.create(name="BAAI/bge-small-en-v1.5")


# Create a Model to store attributes for filtering
class Document(LanceModel):
    """Schema for one stored text chunk.

    ``text`` is declared as the embedding source and ``vector`` as the
    embedding target of ``model``, so the table can compute vectors from the
    text on ingestion. ``category`` is a plain string column used for
    filtering at query time.
    """

    # Raw chunk text; the field the embedding model reads from.
    text: str = model.SourceField()
    # Embedding of ``text``. The width is taken from the model itself rather
    # than hard-coded (384 for BAAI/bge-small-en-v1.5), so swapping the
    # checkpoint above cannot leave the schema with a mismatched dimension.
    vector: Vector(model.ndims()) = model.VectorField()
    # Free-form label (e.g. "person" / "film") for WHERE-clause filtering.
    category: str
# Open (or create) the on-disk database directory.
db = lancedb.connect(".my_db")
# mode="overwrite" keeps the script re-runnable: without it a second run
# fails because "my_table" already exists, and since the documents are
# re-ingested below, rebuilding the table from scratch is the intended state.
tbl = db.create_table("my_table", schema=Document, mode="overwrite")
# Embed the documents and store them in the database
tbl.add(docs)
# Generate the full-text (tf-idf) search index; replace=True rebuilds any
# index left over from a previous run instead of raising.
tbl.create_fts_index("text", replace=True)
# Initialise a reranker -- here, Cohere's API one
from lancedb.rerankers import CohereReranker

reranker = CohereReranker()

query = "What is Chihiro's new name given to her by the witch?"

# Build the query step by step; each call returns the builder to continue from.
# Hybrid means text + vector
results = tbl.search(query, query_type="hybrid")
# Restrict to only docs in the 'film' category (filter applied before search)
results = results.where("category = 'film'", prefilter=True)
# Get 10 results from first-pass retrieval
results = results.limit(10)
# For the reranker to compute the final ranking
results = results.rerank(reranker=reranker)

results.to_pandas()
# (GitHub page footer: sign up or sign in on GitHub to comment on this gist.)