Simple RAG for easily embedding documents and querying embeddings
This file provides a class, Collection, that makes it easy to add retrieval
augmented generation (RAG) to an application.
There are so many overly complex RAG tools out there, such as LlamaIndex and
LangChain. Even Chroma can be overly complex for some use cases, and I've
run into issues (in Streamlit Share) where Chroma's dependency on SQLite
caused a conflict that I couldn't resolve. Argh!
So I created the below to serve as a very simple, lightweight, low-dependency
solution to add RAG to an application. I also coded it to be a fairly easy
drop-in replacement for Chroma.
Its one major limitation is that it doesn't come with a database. This means
everything is stored in memory. This is fine for small collections, but if
you have a large collection, or don't want to constantly embed the same
documents, you'll probably want to use a solution with a database like Chroma.
The code below uses OpenAI embeddings. To use these, you'll need to get an
OpenAI API key and set an environment variable for OPENAI_API_KEY. If you don't
want to use OpenAI embeddings, you can swap those out for any other embeddings.
Just make sure you use the same embedding function for both adding documents
and querying them.
pip install numpy scikit-learn openai
from simple_rag import Collection
collection = Collection()
collection.add(
    ids=["1", "2"],
    documents=["This is a document.", "This is another document."],
    metadatas=[{"url": ""}, {"url": ""}],
)
results = collection.query(query_texts=["Find a document"])
>>> {'documents': [['This is a document.', 'This is another document.']],
>>> 'distances': [[0.1539412288910481, 0.17489997771983146]], 'metadatas':
>>> [[{'url': ''}, {'url': ''}]]}
import os
import numpy as np
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity
class Collection:
"""A collection of documents with associated metadata and embeddings."""
def __init__(self):
self.documents = []
self.ids = []
self.metadatas = []
self.embeddings = []
def add(self, ids: list[str], documents: list[str], metadatas: list[dict]):
"""Adds documents to the collection."""
embeddings = self._embed_documents(documents)
if embeddings is not None:
def query(self, query_texts: list[str], min_distance: float = 0.3) -> dict:
"""Queries the collection for documents similar to the query texts."""
query_embeddings = self._embed_documents(query_texts)
results = {"documents": [], "distances": [], "metadatas": []}
if query_embeddings is not None:
for query_embedding in query_embeddings:
distances = (
- cosine_similarity([query_embedding], self.embeddings)[0]
relevant_indices = np.where(distances <= min_distance)[0]
[self.documents[i] for i in relevant_indices]
[distances[i] for i in relevant_indices]
[self.metadatas[i] for i in relevant_indices]
return results
def _embed_documents(self, documents: list[str]) -> np.ndarray | None:
"""Embeds documents. If you want to use something other than OpenAI
embeddings, you can change up this function."""
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
model = "text-embedding-ada-002"
max_batch_size = 2048 # Max batch size for OpenAI embeddings
embeddings = []
for i in range(0, len(documents), max_batch_size):
batch = documents[i : i + max_batch_size] # noqa
response = client.embeddings.create(model=model, input=batch)
batch_embeddings = [data.embedding for data in]
except Exception as e:
print(f"Error embedding documents: {e}")
embeddings.extend([None] * len(batch))
if embeddings:
return np.array(embeddings)
return None
