Skip to content

Instantly share code, notes, and snippets.

View benwtrent's full-sized avatar
🏠
Working from home

Benjamin Trent benwtrent

🏠
Working from home
View GitHub Profile
@benwtrent
benwtrent / data_load_and_encode.py
Created February 16, 2024 16:09
Stupid binary encoding tests
import numpy as np
import pyarrow.parquet as pq
from sklearn.neighbors import NearestNeighbors
# load data/%d-en.parquet files into a single numpy matrix
# vector dimensions are 1024
# load data
tbls = []
for i in range(10):
@benwtrent
benwtrent / cohere_data.py
Last active January 31, 2024 15:26
download and format cohere data
import pyarrow.parquet as pq
import numpy as np
DATA_SETS =[
{"name": "wiki768", "files": [
"train-00000-of-00004-1a1932c9ca1c7152.parquet",
"train-00001-of-00004-f4a4f5540ade14b4.parquet",
"train-00002-of-00004-ff770df3ab420d14.parquet",
"train-00003-of-00004-85b3dbbc960e92ec.parquet",
@benwtrent
benwtrent / knnPerf.py
Last active August 11, 2023 00:10
Code used to create distributed data
#!/usr/bin/env python
import os
import subprocess
import benchUtil
import constants
LUCENE_CHECKOUT = 'lucene_candidate'
# test parameters. This script will run KnnGraphTester on every combination of these parameters
@benwtrent
benwtrent / ByteBufferFloatDecodeLatencyBenchmark.java
Last active June 8, 2023 14:51
Decoding ByteBuffers into Floats microbenchmark
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
@benwtrent
benwtrent / helpers.py
Created January 31, 2023 16:09
Some helpers around loading and testing BEIR data sets with elasticsearch
import loaders
from sentence_transformers import SentenceTransformer
# Load queries, qrels, etc. and create embeddings for the queries
queries = loaders.load_jsonl(jsonl_path=Path("./data/queries.jsonl"))
embedding_model = SentenceTransformer(model_id, device="mps")
query_embeddings = embedding_model.encode([d['text'] for d in queries])
query_embeddings = query_embeddings.tolist()
query_and_embeddings = [dict(item, **{'embedding': embedding}) for (item, embedding) in zip(queries, query_embeddings)]
qrels = loaders.load_beir_qrels(qrels_file=Path("./data/qrels/test.tsv"))
@benwtrent
benwtrent / rallyrun0.txt
Last active November 18, 2022 23:42
Rally runs with new Lucene build
------------------------------------------------------
_______ __ _____
/ ____(_)___ ____ _/ / / ___/_________ ________
/ /_ / / __ \/ __ `/ / \__ \/ ___/ __ \/ ___/ _ \
/ __/ / / / / / /_/ / / ___/ / /__/ /_/ / / / __/
/_/ /_/_/ /_/\__,_/_/ /____/\___/\____/_/ \___/
------------------------------------------------------
| Metric | Task | Baseline | Contender | Diff | Unit | Diff % |
|--------------------------------------------------------------:|---------------------------------------------:|---------------:|---------------:|-------------:|-------:|---------:|
package org.apache.pylucene.codecs;
import org.apache.lucene.codecs.lucene95.Lucene95Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
public class PyLucene95Codec extends Lucene95Codec {
private long pythonObject;
public void pythonExtension(long pythonObject){
this.pythonObject = pythonObject;
@benwtrent
benwtrent / PyLucene94Codec.java
Created November 10, 2022 14:31
PyLucene94Codec extension
package org.apache.pylucene.codecs;
import org.apache.lucene.codecs.lucene94.Lucene94Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
public class PyLucene94Codec extends Lucene94Codec {
private long pythonObject;
public void pythonExtension(long pythonObject){
this.pythonObject = pythonObject;
@benwtrent
benwtrent / lucenepyknn.py
Created October 18, 2022 19:27
Ann Benchmark's integration using Lucene KNN.
"""
ann-benchmarks interface for Apache Lucene.
"""
import sklearn.preprocessing
import numpy as np
from struct import Struct
import lucene
@benwtrent
benwtrent / EntropyBenchmark.java
Created November 10, 2020 16:39
Benchmark for calculating the entropy for a string.
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;