Skip to content

Instantly share code, notes, and snippets.

@davidmezzetti
Created July 7, 2024 11:07
Show Gist options
  • Save davidmezzetti/ad5abbf36eb08694b2aa7463b8047034 to your computer and use it in GitHub Desktop.
##################################
# Data functions
##################################
import re
from datasets import load_dataset
def clean(text):
    """Flatten text to a single line with single-space separators."""
    # Newlines become spaces first; any remaining multi-whitespace run
    # is then collapsed to one space.
    flattened = " ".join(text.split("\n")).strip()
    return re.sub(r"\s{2,}", " ", flattened)
def stream():
    """Yield one "title\\nabstract" string per record from the arXiv dataset.

    Loads the first 1.6M rows of the Hugging Face "arxiv_dataset" split and
    whitespace-normalizes each title and abstract via clean().
    """
    dataset = load_dataset("arxiv_dataset", split="train[:1600000]")
    for row in dataset:
        yield f"{clean(row['title'])}\n{clean(row['abstract'])}"
def batch(size, texts=None):
    """Group a stream of texts into lists of at most `size` items.

    Args:
        size: maximum number of items per yielded batch
        texts: optional iterable to batch; defaults to stream() so existing
            callers (batch(size)) behave exactly as before

    Yields:
        lists of up to `size` items; the final batch may be smaller
    """
    texts = stream() if texts is None else texts
    data = []
    for result in texts:
        data.append(result)
        if len(data) == size:
            yield data
            data = []
    # Flush the trailing partial batch, if any
    if data:
        yield data
def queries(size, texts=None):
    """Yield a short query for each text: the first 5 terms of its title.

    Args:
        size: number of texts to draw from the default source
        texts: optional iterable of "title\\nabstract" strings; defaults to
            the first batch of `size` results from batch(size), preserving
            the original call signature queries(size)

    Yields:
        query strings built from the first 5 whitespace-delimited title terms
    """
    if texts is None:
        texts = next(batch(size))
    for text in texts:
        # The title is the first line of each "title\nabstract" entry
        title = text.split("\n")[0]
        yield " ".join(title.split()[:5])
##################################
# LangChain
##################################
# Benchmark: index the arXiv stream with LangChain's in-memory BM25Retriever,
# then time 100 keyword searches and report wall-clock times and process RSS.
import time
from langchain_community.retrievers import BM25Retriever
from psutil import Process
from txtai.pipeline import Tokenizer

start = time.time()
# Tokenize using UAX #29 like Apache Lucene's standard tokenizer
# NOTE: list(stream()) materializes all documents in memory before indexing
retriever = BM25Retriever.from_texts(list(stream()), k=3, preprocess_func=Tokenizer())
index = time.time() - start
start = time.time()
for query in queries(100):
    retriever.invoke(query)
# RSS in MB, sampled after both indexing and searching complete
memory = int(Process().memory_info().rss / (1024 * 1024))
print(f"INDEX TIME = {index:.2f}s")
# NOTE(review): the memory sample above is included in the reported search
# time; the cost is negligible relative to 100 searches
print(f"SEARCH TIME = {time.time() - start:.2f}s")
print(f"MEMORY = {memory} MB")
##################################
# bm25s
##################################
# Benchmark: index the arXiv stream with the bm25s library, then time 100
# keyword searches and report wall-clock times and process RSS.
import time
from bm25s import BM25
from psutil import Process
from txtai.pipeline import Tokenizer

start = time.time()
# Tokenize using UAX #29 like Apache Lucene's standard tokenizer
tokenizer = Tokenizer()
retriever = BM25()
# NOTE: the list comprehension materializes every tokenized document in
# memory before indexing
retriever.index([tokenizer(x) for x in stream()])
index = time.time() - start
start = time.time()
for query in queries(100):
    # Queries are tokenized with the same tokenizer used at index time
    retriever.retrieve(tokenizer(query), k=3)
# RSS in MB, sampled after both indexing and searching complete
memory = int(Process().memory_info().rss / (1024 * 1024))
print(f"INDEX TIME = {index:.2f}s")
# NOTE(review): the memory sample above is included in the reported search
# time; the cost is negligible relative to 100 searches
print(f"SEARCH TIME = {time.time() - start:.2f}s")
print(f"MEMORY = {memory} MB")
##################################
# txtai
##################################
# Benchmark: index the arXiv stream with a txtai keyword (BM25) index, then
# time 100 searches and report wall-clock times and process RSS.
import time
from psutil import Process
from txtai import Embeddings

start = time.time()
# keyword=True builds a BM25 keyword index; content=True stores the text
embeddings = Embeddings(content=True, keyword=True)
# stream() is consumed lazily — no full in-memory list is built here
embeddings.index(stream())
index = time.time() - start
start = time.time()
for query in queries(100):
    embeddings.search(query, 3)
# RSS in MB, sampled after both indexing and searching complete
memory = int(Process().memory_info().rss / (1024 * 1024))
print(f"INDEX TIME = {index:.2f}s")
# NOTE(review): the memory sample above is included in the reported search
# time; the cost is negligible relative to 100 searches
print(f"SEARCH TIME = {time.time() - start:.2f}s")
print(f"MEMORY = {memory} MB")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment