@davidmezzetti
Last active July 5, 2024 16:26
##################################
# Data functions
##################################
import re

from datasets import load_dataset

def clean(text):
    """Normalizes whitespace: removes newlines and collapses repeated spaces."""
    text = text.replace("\n", " ").strip()
    return re.sub(r"\s{2,}", " ", text)

def stream():
    """Streams title + abstract text for each arxiv dataset record."""
    arxiv = load_dataset("arxiv_dataset", split="train")
    for result in arxiv:
        yield f"{clean(result['title'])}\n{clean(result['abstract'])}"

def batch(size):
    """Groups streamed documents into lists of up to size elements."""
    data = []
    for result in stream():
        data.append(result)
        if len(data) == size:
            yield data
            data = []

    if data:
        yield data
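
# Illustrative sanity check of the helpers above (a minimal sketch, not part of the
# original benchmark): pull a single small batch and inspect the first document.
sample = next(batch(4))
print(len(sample), sample[0][:100])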
##################################
# ChromaDB
##################################
import time
import uuid

from chromadb import PersistentClient
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

start = time.time()

# Create vector store - OOM issues (w/ 32 GB RAM) using in-memory client
client = PersistentClient(path="chromadb")
collection = client.create_collection(
    "default",
    embedding_function=SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2", device="cuda")
)

# Index data in batches of 1024 documents
for rows in batch(1024):
    collection.add(ids=[str(uuid.uuid4()) for _ in rows], documents=rows)

print(f"ELAPSED = {time.time() - start:.2f}s")
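
# Example query against the collection built above (a minimal sketch; the query
# text and n_results value are illustrative, not part of the original benchmark)
results = collection.query(query_texts=["machine learning for medical imaging"], n_results=3)
print(results["documents"][0])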
##################################
# ChromaDB with LangChain
##################################
import time

from chromadb import PersistentClient
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

start = time.time()

# Create embeddings
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cuda"}
)

# Create vector store
# OOM issues (w/ 32 GB RAM) using Chroma in-memory client and FAISS
index = None
for rows in batch(1024):
    if not index:
        index = Chroma.from_texts(rows, embeddings, client=PersistentClient(path="chromadb"))
    else:
        index.add_texts(rows)

print(f"ELAPSED = {time.time() - start:.2f}s")
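
# Example search against the LangChain index built above (a minimal sketch; the
# query string and k value are illustrative, not part of the original benchmark)
for doc in index.similarity_search("machine learning for medical imaging", k=3):
    print(doc.page_content[:100])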
##################################
# txtai
##################################
import time

from txtai import Embeddings

start = time.time()

# Create vector store. Uses SQLite for content storage + Hnswlib for the vector index.
embeddings = Embeddings(
    path="sentence-transformers/all-MiniLM-L6-v2",
    backend="hnsw",
    content=True,
    maxlength=True
)
embeddings.index(stream())

print(f"ELAPSED = {time.time() - start:.2f}s")
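
# Example search against the txtai index built above (a minimal sketch; the query
# string and result limit are illustrative, not part of the original benchmark).
# With content=True, each result is a dict with id, text and score fields.
for result in embeddings.search("machine learning for medical imaging", 3):
    print(result["score"], result["text"][:100])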