@benwtrent
Created February 16, 2024 16:09
Stupid binary encoding tests
import numpy as np
import pyarrow.parquet as pq
from sklearn.neighbors import NearestNeighbors
# load the data/%d-en.parquet files into a single numpy matrix
# vector dimensions are 1024
tbls = []
for i in range(10):
    tbls.append(pq.read_table('data/%d-en.parquet' % i, columns=['emb']))
np_total = np.concatenate([tbl[0].to_numpy() for tbl in tbls])
# flatten the per-file arrays of vectors into one 2D array
flat_ds = list()
for vec in np_total:
    flat_ds.append(vec)
np_flat_ds = np.array(flat_ds)
print(np_flat_ds.shape)
doc_vectors = np_flat_ds[:-1000]
query_vectors = np_flat_ds[-1000:]
# get true 100 nearest neighbors for each query vector
knn = NearestNeighbors(n_neighbors=100, metric='cosine')
knn.fit(doc_vectors)
true_scores, true_neighbors = knn.kneighbors(query_vectors, return_distance=True)
# now binary quantize every vector: if a dimension is > 0, set it to 1, else 0
# this is a very simple and fast way to reduce the size of the vectors
# and the number of operations needed to compute the distance
binary_doc_vectors = (doc_vectors > 0).astype('uint8')
binary_query_vectors = (query_vectors > 0).astype('uint8')
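# aside, a minimal sketch of the size reduction the comment above alludes to:
# the 0/1 uint8 arrays still spend a whole byte per dimension; np.packbits
# stores 8 dimensions per byte, so 1024 float32 dims (4096 bytes) shrink to
# 128 bytes -- a 32x reduction (this packed form is not used by the sklearn
# index below, it is only an illustration)
packed_docs = np.packbits(binary_doc_vectors, axis=1)       # shape (n_docs, 128)
packed_queries = np.packbits(binary_query_vectors, axis=1)  # shape (1000, 128)
# hamming distance between two packed rows is the popcount of their XOR,
# e.g. for the first query against the first doc:
dist = np.unpackbits(packed_docs[0] ^ packed_queries[0]).sum()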
knn = NearestNeighbors(n_neighbors=100, metric='hamming')
knn.fit(binary_doc_vectors)
hamming_neighbors = knn.kneighbors(binary_query_vectors, return_distance=False)
# calculate the overlap between the true 100 nearest neighbors and the hamming nearest neighbors
# this is the recall of the method
overlap = np.array([len(np.intersect1d(true, hamming)) for true, hamming in zip(true_neighbors, hamming_neighbors)])
#print('overlap', overlap)
# divide by 100 to get the recall
overlap = overlap / 100
print('average overlap', overlap.mean())
# now calculate recall@10, i.e. how many of the 10 true nearest neighbors
# appear among the 100 nearest hamming neighbors
# this should be much higher than the average overlap above
overlap = np.array([len(np.intersect1d(true[:10], hamming)) for true, hamming in zip(true_neighbors, hamming_neighbors)])
# divide by 10 to get the recall
overlap = overlap / 10
print('recall@10 in top-100 hamming', overlap.mean())
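# a hedged sketch (not in the original gist): the recall@10 above is the ceiling
# for a two-stage pipeline -- retrieve 100 candidates with cheap hamming distance,
# then rerank them with the exact float vectors and keep the top 10
def rerank_top10(q_idx):
    candidates = hamming_neighbors[q_idx]  # 100 ids from the binary index
    q = query_vectors[q_idx]
    cand_vecs = doc_vectors[candidates]
    # exact cosine similarity against the full-precision candidates
    sims = cand_vecs @ q / (np.linalg.norm(cand_vecs, axis=1) * np.linalg.norm(q))
    return candidates[np.argsort(-sims)[:10]]

reranked = [rerank_top10(i) for i in range(len(query_vectors))]
rerank_recall = np.mean([len(np.intersect1d(true[:10], got)) / 10
                         for true, got in zip(true_neighbors, reranked)])
print('recall@10 after exact rerank', rerank_recall)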
#!/bin/bash
# download the first 10 English shards into data/, where the loader above expects them
base_url="https://huggingface.co/api/datasets/Cohere/wikipedia-2023-11-embed-multilingual-v3/parquet/en/train/"
mkdir -p data
for i in {0..9}
do
    url="${base_url}${i}.parquet"
    output_file="data/${i}-en.parquet"
    echo "Downloading: $url"
    curl -L "$url" -o "$output_file" &
done
wait
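Usage sketch (file names are assumptions; the gist doesn't name its files): save
the downloader as download.sh and the Python script above as binary_recall.py,
then run "bash download.sh" followed by "python binary_recall.py".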