Skip to content

Instantly share code, notes, and snippets.

@davidmezzetti
Last active July 6, 2024 09:23
Show Gist options
  • Save davidmezzetti/f4ee01fec59217ea3c32bc74dec0d792 to your computer and use it in GitHub Desktop.
Save davidmezzetti/f4ee01fec59217ea3c32bc74dec0d792 to your computer and use it in GitHub Desktop.
##################################
# Data functions
##################################
import re
from datasets import load_dataset
def clean(text):
    """
    Normalize whitespace in a text field.

    Newlines become spaces, leading/trailing whitespace is stripped and any run
    of two or more whitespace characters is collapsed to a single space. A lone
    non-newline whitespace character (e.g. a single tab) is left as-is.

    Args:
        text: input string

    Returns:
        cleaned string
    """

    # Flatten to a single line, then trim the ends
    flattened = text.replace("\n", " ")
    trimmed = flattened.strip()

    # Collapse runs of 2+ whitespace characters
    return re.sub(r"\s{2,}", " ", trimmed)
def stream():
    """
    Stream documents from the first 1,600,000 rows of the "arxiv_dataset"
    train split.

    Yields:
        one string per row: the cleaned title, a newline, then the cleaned abstract
    """

    dataset = load_dataset("arxiv_dataset", split="train[:1600000]")

    # Title and abstract are each passed through clean(), so the only newline
    # in a yielded string is the separator between them
    yield from (
        f"{clean(result['title'])}\n{clean(result['abstract'])}" for result in dataset
    )
def batch(size):
    """
    Group the output of stream() into lists of `size` elements.

    Args:
        size: number of elements per batch

    Yields:
        lists of streamed elements; the final list holds whatever is left over
        and may be shorter than `size`
    """

    pending = []
    for record in stream():
        pending.append(record)

        # Keep accumulating until the batch is full
        if len(pending) != size:
            continue

        yield pending
        pending = []

    # Flush the trailing partial batch, if any
    if len(pending) > 0:
        yield pending
def queries(size):
    """
    Generate short keyword queries from the first batch of streamed documents.

    Each query is the first five whitespace-separated terms of a document's
    title, where the title is the text before the first newline.

    Args:
        size: batch size passed to batch(); only the first batch is used

    Yields:
        one query string per document in the first batch; nothing when the
        stream is empty
    """

    # Default to an empty batch. A bare next() on an exhausted iterator raises
    # StopIteration, which inside a generator surfaces as RuntimeError (PEP 479).
    for text in next(batch(size), []):
        # Title is everything before the first newline
        title = text.partition("\n")[0]

        # Get first 5 terms in title as query
        yield " ".join(title.split()[:5])
##################################
# LangChain
##################################
import time
from langchain_community.retrievers import BM25Retriever
from psutil import Process
# Time index construction. The streamed corpus is materialized into a list
# inside the timed region, so INDEX TIME includes reading the dataset.
# perf_counter() is used for intervals since it is monotonic, unlike time().
start = time.perf_counter()
retriever = BM25Retriever.from_texts(list(stream()), k=3)
index = time.perf_counter() - start

# Build the queries before starting the clock: queries() goes back through
# stream(), which reloads the dataset, and that cost is not search time.
searches = list(queries(100))

# Time the searches only
start = time.perf_counter()
for query in searches:
    retriever.invoke(query)
search = time.perf_counter() - start

# Resident set size of the current process, in MB
memory = int(Process().memory_info().rss / (1024 * 1024))

print(f"INDEX TIME = {index:.2f}s")
print(f"SEARCH TIME = {search:.2f}s")
print(f"MEMORY = {memory} MB")
##################################
# txtai
##################################
import time
from psutil import Process
from txtai import Embeddings
# Time index construction. The index is built with content storage and the
# keyword option enabled, and consumes the stream() generator directly.
# perf_counter() is used for intervals since it is monotonic, unlike time().
start = time.perf_counter()
embeddings = Embeddings(content=True, keyword=True)
embeddings.index(stream())
index = time.perf_counter() - start

# Build the queries before starting the clock: queries() goes back through
# stream(), which reloads the dataset, and that cost is not search time.
searches = list(queries(100))

# Time the searches only (top 3 results per query)
start = time.perf_counter()
for query in searches:
    embeddings.search(query, 3)
search = time.perf_counter() - start

# Resident set size of the current process, in MB.
# NOTE: RSS is process-wide. If another benchmark section already ran in this
# process, whatever it still holds in memory is counted here as well.
memory = int(Process().memory_info().rss / (1024 * 1024))

print(f"INDEX TIME = {index:.2f}s")
print(f"SEARCH TIME = {search:.2f}s")
print(f"MEMORY = {memory} MB")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment