download and format cohere data
import pyarrow.parquet as pq
import numpy as np
DATA_SETS = [
    {"name": "wiki768", "files": [
        "train-00000-of-00004-1a1932c9ca1c7152.parquet",
        "train-00001-of-00004-f4a4f5540ade14b4.parquet",
        "train-00002-of-00004-ff770df3ab420d14.parquet",
        "train-00003-of-00004-85b3dbbc960e92ec.parquet",
    ]},
    # {"name": "wiki768en", "files": [
    #     "0-en.parquet",
    #     "1-en.parquet",
    #     "2-en.parquet",
    #     "3-en.parquet",
    # ]},
    # {"name": "wiki768ja", "files": [
    #     "0-ja.parquet",
    #     "1-ja.parquet",
    #     "2-ja.parquet",
    #     "3-ja.parquet",
    # ]},
    # {"name": "wiki768de", "files": [
    #     "0-de.parquet",
    #     "1-de.parquet",
    #     "2-de.parquet",
    #     "3-de.parquet",
    # ]},
]
def transform_queries(Q):
    n, _ = Q.shape
    return np.concatenate([Q, np.zeros((n, 1))], axis=-1, dtype=np.float32)
def transform_docs(D, norms):
    n, d = D.shape
    max_norm = norms.max()
    flipped_norms = np.copy(norms).reshape(n, 1)
    transformed_data = np.concatenate([D, np.sqrt(max_norm**2 - flipped_norms**2)], axis=-1, dtype=np.float32)
    return transformed_data
def validate_array_match_upto_dim(arr1, arr2, dim_eq_upto):
    assert np.allclose(arr1[:dim_eq_upto], arr2[:dim_eq_upto]), "data sets are different"
def validate_dataset_match_upto_dim(arr1, arr2, dim_eq_upto):
    n1, d1 = arr1.shape
    n2, d2 = arr2.shape
    assert n1 == n2, f"Shape does not match [{arr1.shape}] vs [{arr2.shape}]"
    for i in range(n1):
        validate_array_match_upto_dim(arr1[i], arr2[i], dim_eq_upto)
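# A minimal, hedged usage sketch of the transforms above (not part of the
# original pipeline): they pad queries with a zero and documents with
# sqrt(max_norm^2 - ||d||^2), the usual MIPS-to-Euclidean reduction. The
# random arrays below are placeholders just to illustrate shapes and the
# validation helpers.
_D = np.random.rand(100, 768).astype(np.float32)
_Q = np.random.rand(10, 768).astype(np.float32)
_D_t = transform_docs(_D, np.linalg.norm(_D, axis=1))  # shape (100, 769)
_Q_t = transform_queries(_Q)                           # shape (10, 769)
# the first 768 dimensions are left untouched by the transforms
validate_dataset_match_upto_dim(_D_t, _D, 768)
validate_dataset_match_upto_dim(_Q_t, _Q, 768)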
dataset = DATA_SETS[0]
name = dataset["name"]
tb1 = pq.read_table(dataset["files"][0], columns=['emb'])
tb2 = pq.read_table(dataset["files"][1], columns=['emb'])
tb3 = pq.read_table(dataset["files"][2], columns=['emb'])
tb4 = pq.read_table(dataset["files"][3], columns=['emb'])
np1 = tb1[0].to_numpy()
np2 = tb2[0].to_numpy()
np3 = tb3[0].to_numpy()
np4 = tb4[0].to_numpy()
np_total = np.concatenate((np1, np2, np3, np4))
# Have to convert to a list here to get
# the numpy ndarray's shape correct later
# There's probably a better way...
flat_ds = list()
for vec in np_total:
    flat_ds.append(vec)
np_flat_ds = np.array(flat_ds)
# shuffle so rows from different languages (e.g. ja and en) are mixed when multiple datasets are combined
np.random.shuffle(np_flat_ds)
row_count = np_flat_ds.shape[0]
query_count = 10_000
training_rows = row_count - query_count
print(f"{name} num rows: {training_rows}")
with open(f"{name}.test", "wb") as out_f:
np_flat_ds[training_rows:-1].tofile(out_f)
# write the same thing but with each vector normalized by its magnitude
with open(f"{name}.test.norm", "wb") as out_f:
    magnitudes = np.linalg.norm(np_flat_ds[training_rows:-1], axis=1)
    # normalize the vectors
    np_flat_ds[training_rows:-1] = np_flat_ds[training_rows:-1] / magnitudes[:, np.newaxis]
    print(f"{np_flat_ds[training_rows]}")
    # print magnitude
    print(f"{np.linalg.norm(np_flat_ds[training_rows])}")
    np_flat_ds[training_rows:-1].tofile(out_f)
magnitudes = np.linalg.norm(np_flat_ds[0:training_rows], axis=1)
indices = np.argsort(magnitudes)
np_flat_ds_sorted = np_flat_ds[indices]
with open(f"{name}.train", "wb") as out_f:
np_flat_ds[0:training_rows].tofile(out_f)
with open(f"{name}.ordered.train", "w") as out_f:
np_flat_ds_sorted.tofile(out_f)
# write the same thing but with each vector normalized by its magnitude
with open(f"{name}.train.norm", "wb") as out_f:
    magnitudes = np.linalg.norm(np_flat_ds[0:training_rows], axis=1)
    # normalize the vectors
    np_flat_ds[0:training_rows] = np_flat_ds[0:training_rows] / magnitudes[:, np.newaxis]
    print(f"{np_flat_ds[0]}")
    # print magnitude
    print(f"{np.linalg.norm(np_flat_ds[0])}")
    np_flat_ds[0:training_rows].tofile(out_f)
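# Hedged sketch (not part of the original gist): the .train/.test files are
# raw binaries with no header, so they can be read back by reshaping to the
# embedding width. Reusing np_flat_ds's own dtype and width avoids guessing;
# if reading in a separate process, those values (e.g. float32, 768) are
# assumptions to verify against the parquet schema.
dim = np_flat_ds.shape[1]
reloaded = np.fromfile(f"{name}.train", dtype=np_flat_ds.dtype).reshape(-1, dim)
print(f"reloaded train shape: {reloaded.shape}")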
#!/bin/sh
# simple
curl -LO https://huggingface.co/datasets/Cohere/wikipedia-22-12-simple-embeddings/resolve/main/data/train-00000-of-00004-1a1932c9ca1c7152.parquet
curl -LO https://huggingface.co/datasets/Cohere/wikipedia-22-12-simple-embeddings/resolve/main/data/train-00001-of-00004-f4a4f5540ade14b4.parquet
curl -LO https://huggingface.co/datasets/Cohere/wikipedia-22-12-simple-embeddings/resolve/main/data/train-00002-of-00004-ff770df3ab420d14.parquet
curl -LO https://huggingface.co/datasets/Cohere/wikipedia-22-12-simple-embeddings/resolve/main/data/train-00003-of-00004-85b3dbbc960e92ec.parquet
# japanese
#curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-ja-embeddings/parquet/Cohere--wikipedia-22-12-ja-embeddings/train/0.parquet -o 0-ja.parquet
#curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-ja-embeddings/parquet/Cohere--wikipedia-22-12-ja-embeddings/train/1.parquet -o 1-ja.parquet
#curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-ja-embeddings/parquet/Cohere--wikipedia-22-12-ja-embeddings/train/33.parquet -o 2-ja.parquet
#curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-ja-embeddings/parquet/Cohere--wikipedia-22-12-ja-embeddings/train/34.parquet -o 3-ja.parquet
#
## english
#curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-en-embeddings/parquet/Cohere--wikipedia-22-12-en-embeddings/train/0.parquet -o 0-en.parquet
#curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-en-embeddings/parquet/Cohere--wikipedia-22-12-en-embeddings/train/1.parquet -o 1-en.parquet
#curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-en-embeddings/parquet/Cohere--wikipedia-22-12-en-embeddings/train/251.parquet -o 2-en.parquet
#curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-en-embeddings/parquet/Cohere--wikipedia-22-12-en-embeddings/train/252.parquet -o 3-en.parquet
#
## german
#curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-de-embeddings/parquet/Cohere--wikipedia-22-12-de-embeddings/train/0.parquet -o 0-de.parquet
#curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-de-embeddings/parquet/Cohere--wikipedia-22-12-de-embeddings/train/1.parquet -o 1-de.parquet
#curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-de-embeddings/parquet/Cohere--wikipedia-22-12-de-embeddings/train/106.parquet -o 2-de.parquet
#curl -L https://huggingface.co/api/datasets/Cohere/wikipedia-22-12-de-embeddings/parquet/Cohere--wikipedia-22-12-de-embeddings/train/107.parquet -o 3-de.parquet