Skip to content

Instantly share code, notes, and snippets.

@kemingy
Last active December 4, 2024 10:37
Show Gist options
  • Save kemingy/2f503fcfff86b9e0197e975c02359157 to your computer and use it in GitHub Desktop.
Save kemingy/2f503fcfff86b9e0197e975c02359157 to your computer and use it in GitHub Desktop.

vecs file format

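A binary format for storing vectors as a sequence of records. Each record is a 4-byte little-endian dimension followed by that many vector components.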

| dim (4 bytes) | vector (4 * dim bytes) |
| dim (4 bytes) | vector (4 * dim bytes) |
...
| dim (4 bytes) | vector (4 * dim bytes) |
from struct import unpack, pack
import numpy as np
def read_vec(filepath: str, vec_type: np.dtype = np.float32):
    """Read vectors from a file. Support `fvecs`, `ivecs` and `bvecs` format.

    The file is a sequence of records; each record is a 4-byte little-endian
    signed dimension followed by ``dim`` components of ``vec_type``.

    Args:
        filepath: The path of the file.
        vec_type: The type of the vectors.

    Returns:
        The result of ``np.array`` applied to the list of decoded vectors,
        one entry per record in file order.

    Raises:
        ValueError: If a record declares a negative dimension, or the file
            ends in the middle of a header or a vector payload.
    """
    size = np.dtype(vec_type).itemsize
    vecs = []
    with open(filepath, "rb") as f:
        while True:
            header = f.read(4)
            if not header:
                # Clean end of file: no more records.
                break
            if len(header) != 4:
                raise ValueError(
                    f"truncated dimension header: expected 4 bytes, got {len(header)}"
                )
            dim = unpack("<i", header)[0]
            if dim < 0:
                # A negative size would make f.read() consume the rest of the
                # file, so reject it before reading the payload.
                raise ValueError(f"invalid vector dimension: {dim}")
            expected = dim * size
            payload = f.read(expected)
            if len(payload) != expected:
                raise ValueError(
                    f"truncated vector: expected {expected} bytes, got {len(payload)}"
                )
            vecs.append(np.frombuffer(payload, dtype=vec_type))
    return np.array(vecs)
def write_vec(filepath: str, vecs: np.ndarray, vec_type: np.dtype = np.float32):
    """Write vectors to a file. Support `fvecs`, `ivecs` and `bvecs` format.

    Each vector is written as a 4-byte little-endian signed dimension
    followed by its components serialised as ``vec_type``.

    Args:
        filepath: The path of the file to create or overwrite.
        vecs: An iterable of vectors (for example a 2-D array).
        vec_type: The on-disk element type. Every vector is converted to this
            type before it is written, so the file layout matches what a
            reader using the same ``vec_type`` expects.
    """
    with open(filepath, "wb") as f:
        for vec in vecs:
            # Cast first: without this a float64 input would be stored as
            # 8-byte elements even though the caller asked for `vec_type`.
            row = np.asarray(vec, dtype=vec_type)
            f.write(pack("<i", len(row)))
            f.write(row.tobytes())
use std::fs::File;
use std::io::{BufReader, BufWriter, Read, Write};
use std::path::Path;
use num_traits::{FromBytes, ToBytes};
/// Read an fvecs/ivecs file.
///
/// The file is parsed as a sequence of records, each made of a 4-byte
/// little-endian dimension followed by that many 4-byte little-endian
/// elements, which are decoded with `T::from_le_bytes`.
///
/// # Errors
///
/// Returns any I/O error raised while opening or reading the file. A file
/// that ends in the middle of a dimension header or in the middle of a
/// vector yields an error of kind `UnexpectedEof`.
pub fn read_vecs<T>(path: &Path) -> std::io::Result<Vec<Vec<T>>>
where
    T: Sized + FromBytes<Bytes = [u8; 4]>,
{
    // Upper bound on the capacity reserved up front for one vector. The
    // dimension comes straight from the file, so a corrupt header must not
    // be able to request a multi-gigabyte allocation; longer vectors simply
    // grow as their elements are read.
    const MAX_PREALLOC: usize = 1 << 16;

    let file = File::open(path)?;
    let mut reader = BufReader::new(file);
    let mut buf = [0u8; 4];
    let mut vecs = Vec::new();
    loop {
        // `read` may legally return fewer than 4 bytes, so keep reading until
        // the header is complete or the stream is exhausted.
        let mut filled = 0;
        while filled < buf.len() {
            match reader.read(&mut buf[filled..]) {
                Ok(0) => break,
                Ok(n) => filled += n,
                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
                Err(e) => return Err(e),
            }
        }
        if filled == 0 {
            // Clean end of file on a record boundary.
            break;
        }
        if filled < buf.len() {
            return Err(std::io::Error::new(
                std::io::ErrorKind::UnexpectedEof,
                "truncated vector dimension header",
            ));
        }
        let dim = u32::from_le_bytes(buf) as usize;
        let mut vec = Vec::with_capacity(dim.min(MAX_PREALLOC));
        for _ in 0..dim {
            reader.read_exact(&mut buf)?;
            vec.push(T::from_le_bytes(&buf));
        }
        vecs.push(vec);
    }
    Ok(vecs)
}
/// Write an fvecs/ivecs file.
///
/// Each vector is written as its length encoded as a 4-byte little-endian
/// `u32`, followed by every element serialised with `T::to_le_bytes`. The
/// file at `path` is created, or truncated if it already exists.
///
/// # Errors
///
/// Returns any I/O error raised while creating or writing the file, and an
/// error of kind `InvalidInput` if a vector has more elements than fit in
/// the 4-byte dimension header.
pub fn write_vecs<T>(path: &Path, vecs: &[impl AsRef<[T]>]) -> std::io::Result<()>
where
    T: Sized + ToBytes,
{
    let file = File::create(path)?;
    let mut writer = BufWriter::new(file);
    for vec in vecs.iter() {
        let elements = vec.as_ref();
        // A plain `as u32` cast would silently wrap and write a header that
        // does not match the payload; reject such vectors instead.
        let dim = <u32 as std::convert::TryFrom<usize>>::try_from(elements.len()).map_err(|_| {
            std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                "vector length does not fit in a 4-byte dimension header",
            )
        })?;
        writer.write_all(&dim.to_le_bytes())?;
        for v in elements.iter() {
            writer.write_all(T::to_le_bytes(v).as_ref())?;
        }
    }
    // Flush explicitly so that write errors are reported rather than being
    // dropped when the buffered writer goes out of scope.
    writer.flush()?;
    Ok(())
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment