@benwtrent
Created February 16, 2024 16:09
Stupid binary encoding tests
import numpy as np
import pyarrow.parquet as pq
from sklearn.neighbors import NearestNeighbors
# load the data/%d-en.parquet files into a single numpy matrix
# vector dimensions are 1024
tbls = []
for i in range(10):
    tbls.append(pq.read_table('data/%d-en.parquet' % i, columns=['emb']))
np_total = np.concatenate([tbl[0].to_numpy() for tbl in tbls])
# flatten the per-file arrays of vectors into one 2D array
flat_ds = list()
for vec in np_total:
    flat_ds.append(vec)
np_flat_ds = np.array(flat_ds)
print(np_flat_ds.shape)
doc_vectors = np_flat_ds[:-1000]
query_vectors = np_flat_ds[-1000:]
# get true 100 nearest neighbors for each query vector
knn = NearestNeighbors(n_neighbors=100, metric='cosine')
knn.fit(doc_vectors)
true_scores, true_neighbors = knn.kneighbors(query_vectors, return_distance=True)
# now binary quantize every vector: if a dimension is > 0, set it to 1, else 0
# this is a very simple and fast way to reduce the size of the vectors
# and the number of operations needed to compute the distance
binary_doc_vectors = (doc_vectors > 0).astype('uint8')
binary_query_vectors = (query_vectors > 0).astype('uint8')
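# aside, a minimal sketch of the size reduction the comment above alludes to:
# the 0/1 uint8 arrays still spend a whole byte per dimension; np.packbits
# stores 8 dimensions per byte, so 1024 float32 dims (4096 bytes) shrink to
# 128 bytes -- a 32x reduction (this packed form is not used by the sklearn
# index below, it is only an illustration)
packed_docs = np.packbits(binary_doc_vectors, axis=1)       # shape (n_docs, 128)
packed_queries = np.packbits(binary_query_vectors, axis=1)  # shape (1000, 128)
# hamming distance between two packed rows is the popcount of their XOR,
# e.g. for the first query against the first doc:
dist = np.unpackbits(packed_docs[0] ^ packed_queries[0]).sum()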
knn = NearestNeighbors(n_neighbors=100, metric='hamming')
knn.fit(binary_doc_vectors)
hamming_neighbors = knn.kneighbors(binary_query_vectors, return_distance=False)
# calculate the overlap between the true 100 nearest neighbors and the hamming nearest neighbors
# this is the recall of the method
overlap = np.array([len(np.intersect1d(true, hamming)) for true, hamming in zip(true_neighbors, hamming_neighbors)])
#print('overlap', overlap)
# divide by 100 to get the recall
overlap = overlap / 100
print('average overlap', overlap.mean())
# now calculate recall@10, i.e. how many of the 10 true nearest neighbors
# appear among the 100 nearest hamming neighbors
# this should be much higher than the average overlap above
overlap = np.array([len(np.intersect1d(true[:10], hamming)) for true, hamming in zip(true_neighbors, hamming_neighbors)])
# divide by 10 to get the recall
overlap = overlap / 10
print('recall@10 in top-100 hamming', overlap.mean())
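# a hedged sketch (not in the original gist): the recall@10 above is the ceiling
# for a two-stage pipeline -- retrieve 100 candidates with cheap hamming distance,
# then rerank them with the exact float vectors and keep the top 10
def rerank_top10(q_idx):
    candidates = hamming_neighbors[q_idx]  # 100 ids from the binary index
    q = query_vectors[q_idx]
    cand_vecs = doc_vectors[candidates]
    # exact cosine similarity against the full-precision candidates
    sims = cand_vecs @ q / (np.linalg.norm(cand_vecs, axis=1) * np.linalg.norm(q))
    return candidates[np.argsort(-sims)[:10]]

reranked = [rerank_top10(i) for i in range(len(query_vectors))]
rerank_recall = np.mean([len(np.intersect1d(true[:10], got)) / 10
                         for true, got in zip(true_neighbors, reranked)])
print('recall@10 after exact rerank', rerank_recall)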
#!/bin/bash
# download the first 10 English shards into data/, where the loader above expects them
base_url="https://huggingface.co/api/datasets/Cohere/wikipedia-2023-11-embed-multilingual-v3/parquet/en/train/"
mkdir -p data
for i in {0..9}
do
    url="${base_url}${i}.parquet"
    output_file="data/${i}-en.parquet"
    echo "Downloading: $url"
    curl -L "$url" -o "$output_file" &
done
wait
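Usage sketch (file names are assumptions; the gist doesn't name its files): save
the downloader as download.sh and the Python script above as binary_recall.py,
then run "bash download.sh" followed by "python binary_recall.py".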